diff --git a/packages/rol/CMakeLists.txt b/packages/rol/CMakeLists.txt
index b6e045207d7b..b6ca87c49e42 100644
--- a/packages/rol/CMakeLists.txt
+++ b/packages/rol/CMakeLists.txt
@@ -1,131 +1,124 @@
-INCLUDE(TribitsPackageMacros)
-INCLUDE(TribitsAddOptionAndDefine)
-
-
-#
-# A) Define the package
-#
-
-TRIBITS_PACKAGE(ROL)
-
-#
-# B) Set up package-specific options
-#
-
-TRIBITS_ADD_OPTION_AND_DEFINE(${PACKAGE_NAME}_ENABLE_DEBUG
-  HAVE_ROL_DEBUG
-  "Enable a host of runtime debug checking."
-  ${${PROJECT_NAME}_ENABLE_DEBUG}
-  )
-
-TRIBITS_ADD_OPTION_AND_DEFINE(${PACKAGE_NAME}_ENABLE_TIMERS
-  ROL_TIMERS
-  "Build ROL with Teuchos TimeMonitors enabled."
-  OFF
-  )
-
-TRIBITS_ADD_OPTION_AND_DEFINE(${PACKAGE_NAME}_ENABLE_PYROL
-  ENABLE_PYBIND11_PYROL
-  "Build ROL with PyROL interface."
-  OFF
-  )
-
-TRIBITS_ADD_OPTION_AND_DEFINE(${PACKAGE_NAME}_ENABLE_PARAMETERLIST_VALIDATION
-  ENABLE_PARAMETERLIST_VALIDATION
-  "Build ROL with ParameterList validation."
-  OFF
+# Rapid Optimization Library
+
+if( COMMAND TRIBITS_PACKAGE )
+  set( STANDALONE_ROL FALSE )
+else()
+  set( STANDALONE_ROL TRUE )
+endif()
+
+if( STANDALONE_ROL )
+
+  cmake_minimum_required(VERSION 3.23)
+
+  set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+
+  project( ROL
+           VERSION 2.0
+           DESCRIPTION "Rapid Optimization Library"
+           LANGUAGES CXX )
+  set(CMAKE_CXX_STANDARD 17)
+  set(CMAKE_CXX_STANDARD_REQUIRED ON)
+  set(CMAKE_CXX_EXTENSIONS OFF)
+  set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake ${CMAKE_MODULE_PATH})
+
+  find_package(BLAS REQUIRED)
+  find_package(LAPACK REQUIRED)
+
+  if(NOT CMAKE_BUILD_TYPE)
+      set(CMAKE_BUILD_TYPE RELEASE CACHE STRING "Build type" FORCE)
+  endif()
+
+  # Set F77_BLAS_MANGLE macro based on Fortran-C interface (similar to Kokkos Kernels)
+  if("${F77_BLAS_MANGLE}" STREQUAL "")
+    enable_language(C)
+    enable_language(Fortran)
+    include(FortranCInterface)
+    FortranCInterface_HEADER(${CMAKE_CURRENT_BINARY_DIR}/FortranCInterface.h MACRO_NAMESPACE "F77_")
+    if(FortranCInterface_GLOBAL_SUFFIX STREQUAL "")
+      set(F77_BLAS_MANGLE "(name,NAME) ${FortranCInterface_GLOBAL_PREFIX}name")
+    else()
+      set(F77_BLAS_MANGLE "(name,NAME) ${FortranCInterface_GLOBAL_PREFIX}name ## ${FortranCInterface_GLOBAL_SUFFIX}")
+    endif()
+    message(STATUS "Detected Fortran name mangling: ${F77_BLAS_MANGLE}")
+  endif()
+
+  # Add pugixml for XML parameter file support
+  include(FetchContent)
+  FetchContent_Declare(
+    pugixml
+    GIT_REPOSITORY https://github.com/zeux/pugixml.git
+    GIT_TAG        v1.14
   )
-
-# Build Options
-SET( CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
-INCLUDE(BuildOptions)
-GET_PROPERTY( STACKTRACE_STRING GLOBAL PROPERTY STACKTRACE_IMPL )
-#IF( ${STACKTRACE_STRING} STREQUAL "backward-cpp" )
-#INCLUDE(BackwardConfig)
-#ENDIF()
-
-
-include(ROLParameters)
-
-
-#
-# C) Add the libraries, tests, and examples
-#
-
-ADD_SUBDIRECTORY(src)
-
-IF( ROL_ENABLE_Sacado ) 
-ADD_SUBDIRECTORY(adapters/sacado)
-ENDIF()
-
-IF (ROL_ENABLE_ArrayFireCPU)
-  ADD_SUBDIRECTORY(adapters/arrayfire)
-ENDIF()
-
-IF( ROL_ENABLE_PYROL ) 
-  ADD_SUBDIRECTORY(pyrol)
-ENDIF()
-
-# Adapters which require use of Teuchos
-
-#GET_PROPERTY( PTR_STRING GLOBAL PROPERTY PTR_IMPL )
-#GET_PROPERTY( PARAMETERLIST_STRING GLOBAL PROPERTY PARAMETERLIST_IMPL )
-#IF( PTR_STRING STREQUAL "Teuchos::RCP" )
-#IF( PARAMETERLIST_STRING STREQUAL "Teuchos::ParameterList" )
-
-IF( ${USING_TEUCHOS_ALL} )
-
-
-ADD_SUBDIRECTORY(adapters/teuchos)
-
-IF( TPL_ENABLE_MPI ) 
-ADD_SUBDIRECTORY(adapters/mpi)
-ENDIF()
-
-IF( ROL_ENABLE_Epetra )
-ADD_SUBDIRECTORY(adapters/epetra)
-ENDIF()
-
-IF( ROL_ENABLE_Tpetra )
-ADD_SUBDIRECTORY(adapters/tpetra)
-ENDIF()
-
-IF( ROL_ENABLE_Belos )
-ADD_SUBDIRECTORY(adapters/belos)
-ENDIF()
-
-IF( ROL_ENABLE_Thyra )
-ADD_SUBDIRECTORY(adapters/thyra)
-ENDIF()
-
-IF( ROL_ENABLE_TriKota )
-ADD_SUBDIRECTORY(adapters/trikota)
-ENDIF()
-
-IF( ROL_ENABLE_pebbl )
-ADD_SUBDIRECTORY(adapters/pebbl)
-ENDIF()
-
-IF (ROL_ENABLE_Boost AND ROL_ENABLE_MiniTensor)
-  ADD_SUBDIRECTORY(adapters/minitensor)
-ENDIF()
-
-ENDIF( ${USING_TEUCHOS_ALL} )
-
-#ENDIF( PARAMETERLIST_STRING STREQUAL "Teuchos::ParameterList" )
-#ENDIF( PTR_STRING           STREQUAL "Teuchos::RCP" )
-
-IF (ROL_ENABLE_Eigen)
-  ADD_SUBDIRECTORY(adapters/eigen)
-ENDIF()
-
-
-TRIBITS_ADD_TEST_DIRECTORIES(test)
-TRIBITS_ADD_EXAMPLE_DIRECTORIES(example)
-TRIBITS_ADD_EXAMPLE_DIRECTORIES(tutorial)
-
-#
-# D) Do standard postprocessing
-#
-
-TRIBITS_PACKAGE_POSTPROCESS()
+  FetchContent_MakeAvailable(pugixml)
+
+  include(ROLUtils)
+
+  # Configure the config header
+  configure_file("${PROJECT_SOURCE_DIR}/cmake/ROL_config.h.in" ROL_config.h @ONLY)
+
+  add_library(rol SHARED)
+
+  set(SRC ${PROJECT_SOURCE_DIR}/src)
+
+  # Set up compatibility includes (specific directories to avoid conflicts)
+  set(ROL_COMPATIBILITY_INCLUDES ${SRC}/compatibility/teuchos/blas
+                                 ${SRC}/compatibility/teuchos/la
+                                 ${SRC}/compatibility/simple/lapack
+                                 ${SRC}/compatibility/simple/mpi
+                                 ${SRC}/compatibility/simple/parameterlist
+                                 ${SRC}/compatibility/std/shared_ptr
+                                 ${SRC}/compatibility/teuchos/stacktrace
+                                 ${SRC}/compatibility/teuchos-lite)
+
+  add_library(Teuchos_BLAS OBJECT ${SRC}/compatibility/teuchos-lite/Teuchos_BLAS.cpp)
+  add_library(Teuchos_CompObject OBJECT ${SRC}/compatibility/teuchos-lite/Teuchos_CompObject.cpp)
+  add_library(Teuchos_TestForException OBJECT ${SRC}/compatibility/teuchos-lite/Teuchos_TestForException.cpp)
+
+  # Use GLOB_RECURSE to find all directories in core components
+  set(ROL_CORE_DIRS ${SRC}/algorithm
+                    ${SRC}/elementwise
+                    ${SRC}/function
+                    ${SRC}/oed
+                    ${SRC}/sol
+                    ${SRC}/status
+                    ${SRC}/step
+                    ${SRC}/utils
+                    ${SRC}/vector
+                    ${SRC}/zoo)
+
+  set(ROL_CORE_INCLUDES)
+  foreach(core_dir ${ROL_CORE_DIRS})
+    file(GLOB_RECURSE subdirs LIST_DIRECTORIES true "${core_dir}/*")
+    list(APPEND ROL_CORE_INCLUDES ${core_dir})
+    foreach(subdir ${subdirs})
+      if(IS_DIRECTORY ${subdir})
+        list(APPEND ROL_CORE_INCLUDES ${subdir})
+      endif()
+    endforeach()
+  endforeach()
+
+  target_include_directories(rol PUBLIC ${CMAKE_CURRENT_BINARY_DIR}
+                                        ${ROL_COMPATIBILITY_INCLUDES}
+                                        ${ROL_CORE_INCLUDES})
+
+  target_link_libraries(rol PUBLIC ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES}
+                                    pugixml
+                                    Teuchos_BLAS Teuchos_CompObject Teuchos_TestForException)
+
+  add_library(rol::rol ALIAS rol)
+
+  option(ENABLE_TESTS OFF)
+  option(ENABLE_EXAMPLES OFF)
+
+  if(ENABLE_TESTS)
+    enable_testing()
+    add_subdirectory(test)
+  endif()
+
+  if(ENABLE_EXAMPLES)
+    add_subdirectory(example)
+  endif()
+
+else() # Build with Trilinos
+  include(cmake/TrilinosROL.cmake)
+endif()
diff --git a/packages/rol/README.md b/packages/rol/README.md
index dc666260920a..c6b48f7ddaf4 100644
--- a/packages/rol/README.md
+++ b/packages/rol/README.md
@@ -1,6 +1,6 @@
-# Rapid Optimization Library (ROL) Package
+# Rapid Optimization Library (ROL)
 
-![Rapid Optimization Library](rol.png)
+![Rapid Optimization Library](https://raw.githubusercontent.com/sandialabs/rol/refs/heads/develop/rol.png)
 
 **ROL** (as in rock and _roll_) is a high-performance C++ library for numerical optimization.
 ROL brings an extensive collection of state-of-the-art optimization algorithms to virtually
@@ -20,16 +20,17 @@ Feature highlights:
 4. Fast and robust algorithms for nonsmooth optimization
 5. Trust-region methods for inexact and adaptive computations
 6. PDE-OPT application development kit for PDE-constrained optimization
+7. Interfaces and algorithms for optimal experimental design
 
 
 ## Copyright and License
-See rol/COPYRIGHT, rol/LICENSE, https://trilinos.github.io/license.html and individual file headers for additional information.
+See COPYRIGHT and LICENSE.
 
 
-## Questions? 
-Contact lead developers:
+## Questions?
+Contact team or developers:
 
-* ROL team     (GitHub handle: @trilinos/rol)
+* ROL Team     (GitHub handle: @sandialabs/rol)
 * Drew Kouri   (GitHub handle: [dpkouri](https://github.com/dpkouri) or dpkouri@sandia.gov)
 * Denis Ridzal (GitHub handle: [dridzal](https://github.com/dridzal) or dridzal@sandia.gov)
 
diff --git a/packages/rol/adapters/arrayfire/test/vector/test_01.cpp b/packages/rol/adapters/arrayfire/test/vector/test_01.cpp
index 27b5429d9b97..d3a72df10234 100644
--- a/packages/rol/adapters/arrayfire/test/vector/test_01.cpp
+++ b/packages/rol/adapters/arrayfire/test/vector/test_01.cpp
@@ -15,7 +15,7 @@
 #include "ROL_StdVector.hpp"
 #include "ROL_Types.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 
@@ -24,7 +24,7 @@ typedef float ElementT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/adapters/belos/test/vector/test_01.cpp b/packages/rol/adapters/belos/test/vector/test_01.cpp
index 9637aeca34ef..83d41afa4e57 100644
--- a/packages/rol/adapters/belos/test/vector/test_01.cpp
+++ b/packages/rol/adapters/belos/test/vector/test_01.cpp
@@ -20,7 +20,7 @@
 #include "ROL_StatusTest.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Teuchos_XMLParameterListHelpers.hpp"
 
 #include <cstdlib>
@@ -29,7 +29,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv,0);
+  ROL::GlobalMPISession mpiSession(&argc, &argv,0);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/adapters/eigen/test/vector/test_01.cpp b/packages/rol/adapters/eigen/test/vector/test_01.cpp
index a0830a2a3311..04675ed52d29 100644
--- a/packages/rol/adapters/eigen/test/vector/test_01.cpp
+++ b/packages/rol/adapters/eigen/test/vector/test_01.cpp
@@ -12,7 +12,7 @@
 #include "ROL_RandomVector.hpp"
 #include "ROL_StdVector.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 
@@ -26,7 +26,7 @@ int main(int argc, char *argv[]) {
   using E3V = ROL::Eigen3Vector<RealT>;
   using EigenVector = Eigen::Matrix<RealT,Eigen::Dynamic,1>;
  
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/adapters/epetra/test/sol/test_01.cpp b/packages/rol/adapters/epetra/test/sol/test_01.cpp
index 723a7664243e..b149b53fc0a3 100644
--- a/packages/rol/adapters/epetra/test/sol/test_01.cpp
+++ b/packages/rol/adapters/epetra/test/sol/test_01.cpp
@@ -127,7 +127,7 @@ void printSolution(const std::vector<RealT> &x,
 int main(int argc, char* argv[]) {
   ROL::Ptr<Epetra_Comm> comm;
 #ifdef HAVE_MPI
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv,0);
+  ROL::GlobalMPISession mpiSession(&argc, &argv,0);
   comm = ROL::makePtr<Epetra_MpiComm>(MPI_COMM_WORLD);
 #else 
   comm = ROL::makePtr<Epetra_SerialComm>();
diff --git a/packages/rol/adapters/epetra/test/sol/test_02.cpp b/packages/rol/adapters/epetra/test/sol/test_02.cpp
index c4efe0c95fa3..c8483f90d26e 100644
--- a/packages/rol/adapters/epetra/test/sol/test_02.cpp
+++ b/packages/rol/adapters/epetra/test/sol/test_02.cpp
@@ -10,6 +10,7 @@
 #include "Teuchos_ParameterList.hpp"
 #include "Teuchos_XMLParameterListHelpers.hpp"
 #include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #ifdef HAVE_MPI
 #include "Epetra_MpiComm.h"
@@ -26,7 +27,7 @@ typedef double RealT;
 int main(int argc, char* argv[]) {
   ROL::Ptr<Epetra_Comm> comm;
 #ifdef HAVE_MPI
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv,0);
+  ROL::GlobalMPISession mpiSession(&argc, &argv,0);
   comm = ROL::makePtr<Epetra_MpiComm>(MPI_COMM_WORLD);
 #else
   comm = ROL::makePtr<Epetra_SerialComm>();
diff --git a/packages/rol/adapters/epetra/test/vector/test_01.cpp b/packages/rol/adapters/epetra/test/vector/test_01.cpp
index 98689ef3bd2e..1becdccc391f 100644
--- a/packages/rol/adapters/epetra/test/vector/test_01.cpp
+++ b/packages/rol/adapters/epetra/test/vector/test_01.cpp
@@ -15,7 +15,7 @@
 #include "ROL_Types.hpp"
 #include "Epetra_Map.h"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #ifdef HAVE_MPI
 #include "Epetra_MpiComm.h"
 #else
@@ -30,7 +30,7 @@ typedef double ElementT;
 int main(int argc, char *argv[]) {
 
 #ifdef HAVE_MPI
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv,0);
+  ROL::GlobalMPISession mpiSession(&argc, &argv,0);
   Epetra_MpiComm Comm(MPI_COMM_WORLD);
 #else
   Epetra_SerialComm Comm;
diff --git a/packages/rol/adapters/mpi/src/function/ROL_PinTCommunicationUtilities.hpp b/packages/rol/adapters/mpi/src/function/ROL_PinTCommunicationUtilities.hpp
index 4c9ec8288638..c9f36c19e43f 100644
--- a/packages/rol/adapters/mpi/src/function/ROL_PinTCommunicationUtilities.hpp
+++ b/packages/rol/adapters/mpi/src/function/ROL_PinTCommunicationUtilities.hpp
@@ -13,7 +13,7 @@
 #include <vector>
 #include <mpi.h>
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "ROL_TimeStamp.hpp"
 #include "ROL_PinTCommunicators.hpp"
diff --git a/packages/rol/adapters/mpi/src/vector/ROL_PinTVector.hpp b/packages/rol/adapters/mpi/src/vector/ROL_PinTVector.hpp
index 0b6f2e7765e7..40764f2e2564 100644
--- a/packages/rol/adapters/mpi/src/vector/ROL_PinTVector.hpp
+++ b/packages/rol/adapters/mpi/src/vector/ROL_PinTVector.hpp
@@ -590,7 +590,7 @@ class PinTVector
   */
   virtual void plus( const Vector<Real> &x ) override
   {
-    typedef PinTVector<Real> lPinTVector; 
+    typedef PinTVector<Real> lPinTVector;
     const lPinTVector &xs = dynamic_cast<const lPinTVector&>(x);
 
     stepVectors_->plus(*xs.stepVectors_);
@@ -621,7 +621,7 @@ class PinTVector
   virtual Real dot( const Vector<Real> &x ) const override
   {
     // this is probably very inefficient way to do this... oh well!
-    typedef PinTVector<Real> lPinTVector; 
+    typedef PinTVector<Real> lPinTVector;
     const lPinTVector &xs = dynamic_cast<const lPinTVector&>(x);
 
     // this won't work for Real!=double...oh well!
@@ -698,7 +698,7 @@ class PinTVector
              ---
   */
   virtual void set( const Vector<Real> &x ) override {
-    typedef PinTVector<Real> lPinTVector; 
+    typedef PinTVector<Real> lPinTVector;
     const lPinTVector &xs = dynamic_cast<const lPinTVector&>(x);
 
     stepVectors_->set(*xs.stepVectors_);
@@ -716,7 +716,7 @@ class PinTVector
              ---
   */
   virtual void axpy( const Real alpha, const Vector<Real> &x ) override {
-    typedef PinTVector<Real> lPinTVector; 
+    typedef PinTVector<Real> lPinTVector;
     const lPinTVector &xs = dynamic_cast<const lPinTVector&>(x);
 
     stepVectors_->axpy(alpha,*xs.stepVectors_);
diff --git a/packages/rol/adapters/mpi/test/function/test_hierarchy_coarse_rebalance.cpp b/packages/rol/adapters/mpi/test/function/test_hierarchy_coarse_rebalance.cpp
index 1e5d8600cbad..25ea7799adf4 100644
--- a/packages/rol/adapters/mpi/test/function/test_hierarchy_coarse_rebalance.cpp
+++ b/packages/rol/adapters/mpi/test/function/test_hierarchy_coarse_rebalance.cpp
@@ -10,7 +10,7 @@
 #include <vector>
 #include <mpi.h>
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "ROL_Stream.hpp"
 #include "ROL_PinTCommunicationUtilities.hpp"
@@ -73,7 +73,7 @@ void testHiearchyCoarseTimeStamps(MPI_Comm comm, const ROL::Ptr<std::ostream> &
 
 int main(int argc, char* argv[]) 
 {
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/adapters/mpi/test/function/test_restrict_prolong.cpp b/packages/rol/adapters/mpi/test/function/test_restrict_prolong.cpp
index 90daee89db34..c37ffd23ba58 100644
--- a/packages/rol/adapters/mpi/test/function/test_restrict_prolong.cpp
+++ b/packages/rol/adapters/mpi/test/function/test_restrict_prolong.cpp
@@ -10,7 +10,7 @@
 #include <vector>
 #include <mpi.h>
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "ROL_Stream.hpp"
 #include "ROL_PinTCommunicationUtilities.hpp"
@@ -26,7 +26,7 @@ void testRestrictionProlong_OptVector(MPI_Comm comm, const ROL::Ptr<std::ostream
 
 int main(int argc, char* argv[]) 
 {
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/adapters/mpi/test/function/test_restrict_prolong_rebalance.cpp b/packages/rol/adapters/mpi/test/function/test_restrict_prolong_rebalance.cpp
index 5674530647ad..f9cda5efaffd 100644
--- a/packages/rol/adapters/mpi/test/function/test_restrict_prolong_rebalance.cpp
+++ b/packages/rol/adapters/mpi/test/function/test_restrict_prolong_rebalance.cpp
@@ -10,7 +10,7 @@
 #include <vector>
 #include <mpi.h>
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "ROL_Stream.hpp"
 #include "ROL_PinTCommunicationUtilities.hpp"
@@ -26,7 +26,7 @@ void testRestrictionProlong_OptVector(MPI_Comm comm, const ROL::Ptr<std::ostream
 
 int main(int argc, char* argv[]) 
 {
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/adapters/mpi/test/function/test_timestamp_coarsening.cpp b/packages/rol/adapters/mpi/test/function/test_timestamp_coarsening.cpp
index da5245ce71de..a5284a5ec8b9 100644
--- a/packages/rol/adapters/mpi/test/function/test_timestamp_coarsening.cpp
+++ b/packages/rol/adapters/mpi/test/function/test_timestamp_coarsening.cpp
@@ -10,7 +10,7 @@
 #include <vector>
 #include <mpi.h>
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "ROL_Stream.hpp"
 #include "ROL_PinTCommunicationUtilities.hpp"
@@ -25,7 +25,7 @@ void run_VectorExportToFine_test(MPI_Comm comm, const ROL::Ptr<std::ostream> & o
 
 int main(int argc, char* argv[]) 
 {
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/adapters/mpi/test/vector/test_01.cpp b/packages/rol/adapters/mpi/test/vector/test_01.cpp
index df0b52190eba..937556dabf0e 100644
--- a/packages/rol/adapters/mpi/test/vector/test_01.cpp
+++ b/packages/rol/adapters/mpi/test/vector/test_01.cpp
@@ -8,7 +8,7 @@
 // @HEADER
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "ROL_PinTVector.hpp"
 #include "ROL_PinTVectorCommunication_StdVector.hpp"
@@ -20,7 +20,7 @@ int main(int argc, char* argv[])
 
   typedef ROL::Ptr<ROL::Vector<RealT>> PtrVector;
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/adapters/mpi/test/vector/test_02.cpp b/packages/rol/adapters/mpi/test/vector/test_02.cpp
index a2f9035aea36..af7dc681baf6 100644
--- a/packages/rol/adapters/mpi/test/vector/test_02.cpp
+++ b/packages/rol/adapters/mpi/test/vector/test_02.cpp
@@ -8,7 +8,7 @@
 // @HEADER
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "ROL_PinTVector.hpp"
 #include "ROL_PinTVectorCommunication_StdVector.hpp"
@@ -20,7 +20,7 @@ int main(int argc, char* argv[])
 
 //  typedef ROL::Ptr<ROL::Vector<RealT>> PtrVector;
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/adapters/pebbl/test/test_01.cpp b/packages/rol/adapters/pebbl/test/test_01.cpp
index 180a8b0e5b1c..09995422ffb9 100644
--- a/packages/rol/adapters/pebbl/test/test_01.cpp
+++ b/packages/rol/adapters/pebbl/test/test_01.cpp
@@ -13,7 +13,7 @@ typedef double RealT;
 
 int main(int argc, char* argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint = argc - 1;
diff --git a/packages/rol/adapters/pebbl/test/test_01.hpp b/packages/rol/adapters/pebbl/test/test_01.hpp
index 34816fbb4223..056f28b1debe 100644
--- a/packages/rol/adapters/pebbl/test/test_01.hpp
+++ b/packages/rol/adapters/pebbl/test/test_01.hpp
@@ -11,7 +11,7 @@
 #include "Teuchos_XMLParameterListHelpers.hpp"
 #include "Teuchos_oblackholestream.hpp"
 #include "Teuchos_LAPACK.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Teuchos_Comm.hpp"
 #include "Teuchos_DefaultComm.hpp"
 #include "Teuchos_CommHelpers.hpp"
diff --git a/packages/rol/adapters/pebbl/test/test_02.cpp b/packages/rol/adapters/pebbl/test/test_02.cpp
index 246d192193e2..a9870f21077a 100644
--- a/packages/rol/adapters/pebbl/test/test_02.cpp
+++ b/packages/rol/adapters/pebbl/test/test_02.cpp
@@ -10,7 +10,7 @@
 #include "Teuchos_ParameterList.hpp"
 #include "Teuchos_XMLParameterListHelpers.hpp"
 #include "Teuchos_oblackholestream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Teuchos_Comm.hpp"
 #include "Teuchos_DefaultComm.hpp"
 #include "Teuchos_CommHelpers.hpp"
@@ -22,7 +22,7 @@ typedef double RealT;
 
 int main(int argc, char* argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
   ROL::Ptr<const Teuchos::Comm<int> > comm
     = Teuchos::DefaultComm<int>::getComm();
 
diff --git a/packages/rol/adapters/pebbl/test/test_03.cpp b/packages/rol/adapters/pebbl/test/test_03.cpp
index 2276d1d7499d..9f72e5b6fb65 100644
--- a/packages/rol/adapters/pebbl/test/test_03.cpp
+++ b/packages/rol/adapters/pebbl/test/test_03.cpp
@@ -13,7 +13,7 @@ typedef double RealT;
 
 int main(int argc, char* argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint = argc - 1;
diff --git a/packages/rol/adapters/pebbl/test/test_04.cpp b/packages/rol/adapters/pebbl/test/test_04.cpp
index 8e1bff28a37f..a17deb61042e 100644
--- a/packages/rol/adapters/pebbl/test/test_04.cpp
+++ b/packages/rol/adapters/pebbl/test/test_04.cpp
@@ -15,7 +15,7 @@ typedef double RealT;
 
 int main(int argc, char* argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint = argc - 1;
diff --git a/packages/rol/adapters/pebbl/test/test_05.cpp b/packages/rol/adapters/pebbl/test/test_05.cpp
index a356e1773390..6780fad52612 100644
--- a/packages/rol/adapters/pebbl/test/test_05.cpp
+++ b/packages/rol/adapters/pebbl/test/test_05.cpp
@@ -15,7 +15,7 @@ typedef double RealT;
 
 int main(int argc, char* argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint = argc - 1;
diff --git a/packages/rol/adapters/pebbl/test/test_06.cpp b/packages/rol/adapters/pebbl/test/test_06.cpp
index 17e3ae415d20..5c6467ca2bf6 100644
--- a/packages/rol/adapters/pebbl/test/test_06.cpp
+++ b/packages/rol/adapters/pebbl/test/test_06.cpp
@@ -15,7 +15,7 @@ typedef double RealT;
 
 int main(int argc, char* argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint = argc - 1;
diff --git a/packages/rol/adapters/pebbl/test/test_06.hpp b/packages/rol/adapters/pebbl/test/test_06.hpp
index 25f0c891160f..d792397f0a46 100644
--- a/packages/rol/adapters/pebbl/test/test_06.hpp
+++ b/packages/rol/adapters/pebbl/test/test_06.hpp
@@ -11,7 +11,7 @@
 #include "Teuchos_XMLParameterListHelpers.hpp"
 #include "Teuchos_oblackholestream.hpp"
 #include "Teuchos_LAPACK.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Teuchos_Comm.hpp"
 #include "Teuchos_DefaultComm.hpp"
 #include "Teuchos_CommHelpers.hpp"
diff --git a/packages/rol/adapters/sacado/test/test_01.cpp b/packages/rol/adapters/sacado/test/test_01.cpp
index 9466a9afbf15..64d13e782414 100644
--- a/packages/rol/adapters/sacado/test/test_01.cpp
+++ b/packages/rol/adapters/sacado/test/test_01.cpp
@@ -22,7 +22,7 @@
 #include "ROL_ValidParameters.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Teuchos_XMLParameterListHelpers.hpp"
 
 #include <iomanip>
@@ -42,7 +42,7 @@ int main(int argc, char *argv[]) {
   typedef NonlinearProgram<RealT>    NLP;
   typedef AlgorithmState<RealT>      STATE;
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/adapters/teuchos/test/sol/test_03.cpp b/packages/rol/adapters/teuchos/test/sol/test_03.cpp
index d0a360fd1685..697b2c119625 100644
--- a/packages/rol/adapters/teuchos/test/sol/test_03.cpp
+++ b/packages/rol/adapters/teuchos/test/sol/test_03.cpp
@@ -9,7 +9,7 @@
 
 #include "ROL_ParameterList.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Teuchos_Comm.hpp"
 #include "Teuchos_DefaultComm.hpp"
 #include "Teuchos_CommHelpers.hpp"
@@ -116,7 +116,7 @@ void printSolution(const std::vector<RealT> &x,
 
 int main(int argc, char* argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
   ROL::Ptr<const Teuchos::Comm<int> > commptr =
     ROL::toPtr(Teuchos::DefaultComm<int>::getComm());
 
diff --git a/packages/rol/adapters/teuchos/test/sol/test_04.cpp b/packages/rol/adapters/teuchos/test/sol/test_04.cpp
index 363c3a850861..4242c08ab2a0 100644
--- a/packages/rol/adapters/teuchos/test/sol/test_04.cpp
+++ b/packages/rol/adapters/teuchos/test/sol/test_04.cpp
@@ -9,7 +9,7 @@
 
 #include "ROL_ParameterList.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Teuchos_Comm.hpp"
 #include "Teuchos_DefaultComm.hpp"
 #include "Teuchos_CommHelpers.hpp"
@@ -24,7 +24,7 @@ typedef double RealT;
 
 int main(int argc, char* argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
   ROL::Ptr<const Teuchos::Comm<int> > commptr =
     ROL::toPtr(Teuchos::DefaultComm<int>::getComm());
 
diff --git a/packages/rol/adapters/teuchos/test/sol/test_05.cpp b/packages/rol/adapters/teuchos/test/sol/test_05.cpp
index 3e1e482d1ab9..dd56d3dca94c 100644
--- a/packages/rol/adapters/teuchos/test/sol/test_05.cpp
+++ b/packages/rol/adapters/teuchos/test/sol/test_05.cpp
@@ -16,7 +16,7 @@
 
 #include "ROL_Stream.hpp"
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Teuchos_Comm.hpp"
 #include "Teuchos_DefaultComm.hpp"
 #include "Teuchos_CommHelpers.hpp"
@@ -38,7 +38,7 @@ typedef H1VectorPrimal<RealT> DualConstraintVector;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
   ROL::Ptr<const Teuchos::Comm<int> > comm
     = ROL::toPtr(Teuchos::DefaultComm<int>::getComm());
 
diff --git a/packages/rol/adapters/teuchos/test/vector/test_02.cpp b/packages/rol/adapters/teuchos/test/vector/test_02.cpp
index 6b3b6365372f..8f3f2105ffe2 100644
--- a/packages/rol/adapters/teuchos/test/vector/test_02.cpp
+++ b/packages/rol/adapters/teuchos/test/vector/test_02.cpp
@@ -16,7 +16,7 @@
 #include "ROL_StdVector.hpp"
 #include "ROL_Types.hpp"
 #include "Teuchos_oblackholestream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 
@@ -25,7 +25,7 @@ typedef double ElementT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/adapters/thyra/test/vector/test_01.cpp b/packages/rol/adapters/thyra/test/vector/test_01.cpp
index 6842d63493df..b9a4ea7ffda7 100644
--- a/packages/rol/adapters/thyra/test/vector/test_01.cpp
+++ b/packages/rol/adapters/thyra/test/vector/test_01.cpp
@@ -13,7 +13,7 @@
 #include "ROL_Types.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Thyra_DefaultSpmdVectorSpace.hpp"
 #include "Thyra_VectorSpaceBase_decl.hpp"
diff --git a/packages/rol/adapters/thyra/test/vector/test_02.cpp b/packages/rol/adapters/thyra/test/vector/test_02.cpp
index ad8a51ff124f..09f01994183f 100644
--- a/packages/rol/adapters/thyra/test/vector/test_02.cpp
+++ b/packages/rol/adapters/thyra/test/vector/test_02.cpp
@@ -16,7 +16,7 @@
 #include "ROL_ScaledThyraVector.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Thyra_DefaultSpmdVectorSpace.hpp"
 #include "Thyra_VectorSpaceBase_decl.hpp"
@@ -28,7 +28,7 @@ int main(int argc, char *argv[]) {
   
   
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv,0);
+  ROL::GlobalMPISession mpiSession(&argc, &argv,0);
 
   int iprint = argc - 1;
   ROL::nullstream bhs; // outputs nothing
diff --git a/packages/rol/adapters/tpetra/src/vector/ROL_TpetraMultiVector.hpp b/packages/rol/adapters/tpetra/src/vector/ROL_TpetraMultiVector.hpp
index 0fc35fbaaaff..9068fe933391 100644
--- a/packages/rol/adapters/tpetra/src/vector/ROL_TpetraMultiVector.hpp
+++ b/packages/rol/adapters/tpetra/src/vector/ROL_TpetraMultiVector.hpp
@@ -29,13 +29,9 @@ namespace TPMultiVector {
 
 
   // Locally define a Kokkos wrapper functor for UnaryFunction
-  template <class Real,
-          class LO=Tpetra::Map<>::local_ordinal_type,
-          class GO=Tpetra::Map<>::global_ordinal_type,
-          class Node=Tpetra::Map<>::node_type >
+template <class Real,
+          class ViewType>
   struct unaryFunc {
-    typedef typename Tpetra::MultiVector<Real,LO,GO,Node>::dual_view_type::t_dev ViewType;
-    typedef typename ViewType::execution_space execution_space;
     ViewType X_;
     const Elementwise::UnaryFunction<Real>* const f_;
 
@@ -53,13 +49,9 @@ namespace TPMultiVector {
 
   // Locally define a Kokkos wrapper functor for BinaryFunction
   template <class Real,
-            class LO=Tpetra::Map<>::local_ordinal_type,
-            class GO=Tpetra::Map<>::global_ordinal_type,
-            class Node=Tpetra::Map<>::node_type >
+            class ViewType,
+            class ConstViewType>
    struct binaryFunc {
-    typedef typename Tpetra::MultiVector<Real,LO,GO,Node>::dual_view_type::t_dev ViewType;
-    typedef typename Tpetra::MultiVector<const Real,LO,GO,Node>::dual_view_type::t_dev ConstViewType;
-    typedef typename ViewType::execution_space execution_space;
     ViewType X_;
     ConstViewType Y_;
     const Elementwise::BinaryFunction<Real>* const f_;
@@ -78,12 +70,8 @@ namespace TPMultiVector {
 
   // Locally define a Kokkos wrapper functor for ReductionOp
   template <class Real,
-            class LO=Tpetra::Map<>::local_ordinal_type,
-            class GO=Tpetra::Map<>::global_ordinal_type,
-            class Node=Tpetra::Map<>::node_type >
+            class ConstViewType>
   struct reduceFunc {
-    typedef typename Tpetra::MultiVector<const Real,LO,GO,Node>::dual_view_type::t_dev ConstViewType;
-    typedef typename ConstViewType::execution_space execution_space;
     ConstViewType X_;
     const Elementwise::ReductionOp<Real>* const r_;
 
@@ -257,12 +245,12 @@ class TpetraMultiVector : public Vector<Real> {
   typedef typename Tpetra::MultiVector<const Real,LO,GO,Node>::dual_view_type::t_dev ConstViewType;
 public:
   void applyUnary( const Elementwise::UnaryFunction<Real> &f ) {
-    ViewType v_lcl =  tpetra_vec_->getLocalViewDevice(Tpetra::Access::ReadWrite);
+    auto v_lcl =  tpetra_vec_->getLocalViewHost(Tpetra::Access::ReadWrite);
 
     int lclDim = tpetra_vec_->getLocalLength();
-    TPMultiVector::unaryFunc<Real,LO,GO,Node> func(v_lcl, &f);
+    TPMultiVector::unaryFunc<Real, decltype(v_lcl)> func(v_lcl, &f);
 
-    Kokkos::parallel_for(lclDim,func);
+    Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(0, lclDim),func);
   }
 
   void applyBinary( const Elementwise::BinaryFunction<Real> &f, const Vector<Real> &x ) {
@@ -274,27 +262,27 @@ class TpetraMultiVector : public Vector<Real> {
    const TpetraMultiVector &ex = dynamic_cast<const TpetraMultiVector&>(x);
    Ptr<const Tpetra::MultiVector<Real,LO,GO,Node> > xp = ex.getVector();
 
-    ViewType v_lcl = tpetra_vec_->getLocalViewDevice(Tpetra::Access::ReadWrite);
-    ConstViewType x_lcl = xp->getLocalViewDevice(Tpetra::Access::ReadOnly);
+    auto v_lcl = tpetra_vec_->getLocalViewHost(Tpetra::Access::ReadWrite);
+    auto x_lcl = xp->getLocalViewHost(Tpetra::Access::ReadOnly);
 
     int lclDim = tpetra_vec_->getLocalLength();
 
-    TPMultiVector::binaryFunc<Real,LO,GO,Node> func(v_lcl,x_lcl,&f);
+    TPMultiVector::binaryFunc<Real, decltype(v_lcl), decltype(x_lcl)> func(v_lcl,x_lcl,&f);
 
-    Kokkos::parallel_for(lclDim,func);
+    Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(0, lclDim),func);
 
   }
 
   Real reduce( const Elementwise::ReductionOp<Real> &r ) const {
-    ConstViewType v_lcl = tpetra_vec_->getLocalViewDevice(Tpetra::Access::ReadOnly);
+    auto v_lcl = tpetra_vec_->getLocalViewHost(Tpetra::Access::ReadOnly);
 
     int lclDim = tpetra_vec_->getLocalLength();
-    TPMultiVector::reduceFunc<Real,LO,GO,Node> func(v_lcl, &r);
+    TPMultiVector::reduceFunc<Real, decltype(v_lcl)> func(v_lcl, &r);
 
     Real lclValue;
 
     // Reduce for this MPI process
-    Kokkos::parallel_reduce(lclDim,func,lclValue);
+    Kokkos::parallel_reduce(Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(0, lclDim), func,lclValue);
 
     Real gblValue;
 
diff --git a/packages/rol/adapters/tpetra/test/function/test_01.cpp b/packages/rol/adapters/tpetra/test/function/test_01.cpp
index d51e129f7e96..59ae64673535 100644
--- a/packages/rol/adapters/tpetra/test/function/test_01.cpp
+++ b/packages/rol/adapters/tpetra/test/function/test_01.cpp
@@ -18,7 +18,7 @@
 #include "ROL_Types.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
  
@@ -84,7 +84,7 @@ int main(int argc, char *argv[]) {
     int iprint     = argc - 1;
     ROL::Ptr<std::ostream> outStream;
     ROL::nullstream bhs; // outputs nothing
-    Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+    ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
 
     if (iprint > 0)
         ROL::makePtrFromRef(std::cout);
diff --git a/packages/rol/adapters/tpetra/test/sol/test_01.cpp b/packages/rol/adapters/tpetra/test/sol/test_01.cpp
index 71b1d14a3a76..14cdf18aff98 100644
--- a/packages/rol/adapters/tpetra/test/sol/test_01.cpp
+++ b/packages/rol/adapters/tpetra/test/sol/test_01.cpp
@@ -23,7 +23,7 @@ Real random(const ROL::Ptr<const Teuchos::Comm<int> > &comm) {
 
 int main(int argc, char* argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
   ROL::Ptr<const Teuchos::Comm<int> > comm
     = Teuchos::DefaultComm<int>::getComm();
 
diff --git a/packages/rol/adapters/tpetra/test/sol/test_01.hpp b/packages/rol/adapters/tpetra/test/sol/test_01.hpp
index 2f37561809f1..fd207eaee3e3 100644
--- a/packages/rol/adapters/tpetra/test/sol/test_01.hpp
+++ b/packages/rol/adapters/tpetra/test/sol/test_01.hpp
@@ -11,7 +11,7 @@
 #include "Teuchos_XMLParameterListHelpers.hpp"
 #include "ROL_Stream.hpp"
 #include "Teuchos_LAPACK.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Teuchos_Comm.hpp"
 #include "Teuchos_DefaultComm.hpp"
 #include "Teuchos_CommHelpers.hpp"
diff --git a/packages/rol/adapters/tpetra/test/sol/test_02.cpp b/packages/rol/adapters/tpetra/test/sol/test_02.cpp
index 675349799448..8f087c4f315e 100644
--- a/packages/rol/adapters/tpetra/test/sol/test_02.cpp
+++ b/packages/rol/adapters/tpetra/test/sol/test_02.cpp
@@ -23,7 +23,7 @@ Real random(const ROL::Ptr<const Teuchos::Comm<int> > &comm) {
 
 int main(int argc, char* argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
   ROL::Ptr<const Teuchos::Comm<int> > comm
     = Teuchos::DefaultComm<int>::getComm();
 
diff --git a/packages/rol/adapters/tpetra/test/sol/test_03.cpp b/packages/rol/adapters/tpetra/test/sol/test_03.cpp
index 45f6b4c7bead..c1f62fbb9431 100644
--- a/packages/rol/adapters/tpetra/test/sol/test_03.cpp
+++ b/packages/rol/adapters/tpetra/test/sol/test_03.cpp
@@ -23,7 +23,7 @@ Real random(const ROL::Ptr<const Teuchos::Comm<int> > &comm) {
 
 int main(int argc, char* argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
   ROL::Ptr<const Teuchos::Comm<int> > comm
     = Teuchos::DefaultComm<int>::getComm();
 
diff --git a/packages/rol/adapters/tpetra/test/vector/test_01.cpp b/packages/rol/adapters/tpetra/test/vector/test_01.cpp
index fda732cdcc11..3c9244d5b52a 100644
--- a/packages/rol/adapters/tpetra/test/vector/test_01.cpp
+++ b/packages/rol/adapters/tpetra/test/vector/test_01.cpp
@@ -20,7 +20,7 @@
 #include "ROL_Zakharov.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 
@@ -39,7 +39,7 @@ int main(int argc, char *argv[]) {
      
     typedef ROL::Ptr<MV> MVP;
 
-    Teuchos::GlobalMPISession mpiSession(&argc, &argv,0);
+    ROL::GlobalMPISession mpiSession(&argc, &argv,0);
 
     ROL::Ptr<const Teuchos::Comm<int> > comm = Tpetra::getDefaultComm();
 
diff --git a/packages/rol/adapters/tpetra/test/vector/test_02.cpp b/packages/rol/adapters/tpetra/test/vector/test_02.cpp
index 7194c43de12a..d2fd4560602c 100644
--- a/packages/rol/adapters/tpetra/test/vector/test_02.cpp
+++ b/packages/rol/adapters/tpetra/test/vector/test_02.cpp
@@ -17,7 +17,7 @@
 #include "ROL_StdVector.hpp"
 #include "ROL_Types.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Teuchos_Comm.hpp"
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
diff --git a/packages/rol/adapters/tpetra/test/vector/test_03.cpp b/packages/rol/adapters/tpetra/test/vector/test_03.cpp
index 48470f29e267..00cf4fa4109f 100644
--- a/packages/rol/adapters/tpetra/test/vector/test_03.cpp
+++ b/packages/rol/adapters/tpetra/test/vector/test_03.cpp
@@ -16,7 +16,7 @@
 #include "ROL_StdVector.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 
@@ -34,7 +34,7 @@ typedef ROL::Ptr<Vec> VP;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv,0);
+  ROL::GlobalMPISession mpiSession(&argc, &argv,0);
   ROL::Ptr<const Teuchos::Comm<int> > comm = Tpetra::getDefaultComm();
 
   int iprint = argc - 1;
diff --git a/packages/rol/adapters/tpetra/test/vector/test_04.cpp b/packages/rol/adapters/tpetra/test/vector/test_04.cpp
index 9574978c7197..e9dddc6b9727 100644
--- a/packages/rol/adapters/tpetra/test/vector/test_04.cpp
+++ b/packages/rol/adapters/tpetra/test/vector/test_04.cpp
@@ -8,7 +8,7 @@
 // @HEADER
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Teuchos_DefaultMpiComm.hpp"
 
 #include "ROL_PinTVector.hpp"
@@ -31,7 +31,7 @@ int main(int argc, char* argv[])
   typedef ROL::Ptr<MV> MVP;
   typedef ROL::Ptr<ROL::Vector<RealT>> PtrVector;
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
   ROL::Ptr<const Teuchos::Comm<int> > comm = Tpetra::getDefaultComm();
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
diff --git a/packages/rol/adapters/trikota/test/sol/test_01.cpp b/packages/rol/adapters/trikota/test/sol/test_01.cpp
index 259901ed1407..ec1457ede071 100644
--- a/packages/rol/adapters/trikota/test/sol/test_01.cpp
+++ b/packages/rol/adapters/trikota/test/sol/test_01.cpp
@@ -23,7 +23,7 @@ Real random(const ROL::Ptr<const Teuchos::Comm<int> > &comm) {
 
 int main(int argc, char* argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
   ROL::Ptr<const Teuchos::Comm<int> > comm
     = Teuchos::DefaultComm<int>::getComm();
 
diff --git a/packages/rol/adapters/trikota/test/sol/test_01.hpp b/packages/rol/adapters/trikota/test/sol/test_01.hpp
index 07d1e215af28..00ff02d32ac6 100644
--- a/packages/rol/adapters/trikota/test/sol/test_01.hpp
+++ b/packages/rol/adapters/trikota/test/sol/test_01.hpp
@@ -10,7 +10,7 @@
 #include "Teuchos_ParameterList.hpp"
 #include "Teuchos_XMLParameterListHelpers.hpp"
 #include "Teuchos_LAPACK.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Teuchos_Comm.hpp"
 #include "Teuchos_DefaultComm.hpp"
 #include "Teuchos_CommHelpers.hpp"
diff --git a/packages/rol/adapters/trikota/test/sol/test_02.cpp b/packages/rol/adapters/trikota/test/sol/test_02.cpp
index fe088e311ceb..4cf1b6b4bdf6 100644
--- a/packages/rol/adapters/trikota/test/sol/test_02.cpp
+++ b/packages/rol/adapters/trikota/test/sol/test_02.cpp
@@ -8,7 +8,7 @@
 // @HEADER
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Teuchos_Comm.hpp"
 #include "Teuchos_DefaultComm.hpp"
 #include "Teuchos_CommHelpers.hpp"
@@ -42,7 +42,7 @@ class TestIntegrand : public Integrand<Real> {
 
 int main(int argc, char* argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
   ROL::Ptr<const Teuchos::Comm<int> > comm
     = Teuchos::DefaultComm<int>::getComm();
 
diff --git a/packages/rol/cmake/BuildOptions.cmake b/packages/rol/cmake/BuildOptions.cmake
index 6589818eecb8..09575e621e72 100644
--- a/packages/rol/cmake/BuildOptions.cmake
+++ b/packages/rol/cmake/BuildOptions.cmake
@@ -10,10 +10,12 @@ ENDIF( ROL_ENABLE_TIMERS )
 SET_PROPERTY( GLOBAL PROPERTY TEUCHOS_PARAMETERLIST "Teuchos::ParameterList" )
 SET_PROPERTY( GLOBAL PROPERTY TEUCHOS_STACKTRACE    "Teuchos::stacktrace"    )
 SET_PROPERTY( GLOBAL PROPERTY TEUCHOS_RCP           "Teuchos::RCP"           )
+SET_PROPERTY( GLOBAL PROPERTY TEUCHOS_MPI           "Teuchos::GlobalMPISession" )
 
 SET_PROPERTY( GLOBAL PROPERTY BACKWARD_CPP          "backward-cpp"           )
 
 SET_PROPERTY( GLOBAL PROPERTY SIMPLE_STACKTRACE     "simple"                 )
+SET_PROPERTY( GLOBAL PROPERTY SIMPLE_MPI            "simple"                 )
 
 SET_PROPERTY( GLOBAL PROPERTY BOOST_PROPERTY_TREE   "boost::property_tree"   )
 SET_PROPERTY( GLOBAL PROPERTY BOOST_STACKTRACE      "boost::stacktrace"      )
@@ -26,6 +28,7 @@ SET_DEFAULT( ROL_stacktrace    "Teuchos::stacktrace"    )
 SET_DEFAULT( ROL_LinearAlgebra "Teuchos::SerialDense"   )
 SET_DEFAULT( ROL_LAPACK        "Teuchos::LAPACK"        )
 SET_DEFAULT( ROL_BLAS          "Teuchos::BLAS"          )
+SET_DEFAULT( ROL_MPI           "Teuchos::GlobalMPISession" )
 
 SET( USING_TEUCHOS_ALL ON )
 SET( USING_TEUCHOS_ALL ON PARENT_SCOPE )
@@ -122,3 +125,14 @@ ELSE()
   SET_PROPERTY( GLOBAL PROPERTY BLAS_IMPL "Teuchos::BLAS"       )
   SET_PROPERTY( GLOBAL PROPERTY BLAS_DIR  "${DIR}/teuchos/blas" )
 ENDIF()
+
+# Set MPI
+IF( ROL_MPI STREQUAL "simple")
+  SET_PROPERTY( GLOBAL PROPERTY MPI_IMPL "simple"       )
+  SET_PROPERTY( GLOBAL PROPERTY MPI_DIR  "${DIR}/simple/mpi" )
+  SET( USING_TEUCHOS_ALL OFF )
+  SET( USING_TEUCHOS_ALL OFF PARENT_SCOPE )
+ELSE()
+  SET_PROPERTY( GLOBAL PROPERTY MPI_IMPL "Teuchos::GlobalMPISession"       )
+  SET_PROPERTY( GLOBAL PROPERTY MPI_DIR  "${DIR}/teuchos/mpi" )
+ENDIF()
diff --git a/packages/rol/cmake/ColoredOutput.cmake b/packages/rol/cmake/ColoredOutput.cmake
new file mode 100644
index 000000000000..986251b58160
--- /dev/null
+++ b/packages/rol/cmake/ColoredOutput.cmake
@@ -0,0 +1,37 @@
+string(ASCII 27 Esc)
+
+if(WIN32)
+  set(ColorReset  "")
+  set(ColorBold   "")
+  set(Red         "")
+  set(Green       "")
+  set(Yellow      "")
+  set(Blue        "")
+  set(Magenta     "")
+  set(Cyan        "")
+  set(White       "")
+  set(BoldRed     "")
+  set(BoldGreen   "")
+  set(BoldYellow  "")
+  set(BoldBlue    "")
+  set(BoldMagenta "")
+  set(BoldCyan    "")
+  set(BoldWhite   "")
+else()
+  set(ColorReset  "${Esc}[m")
+  set(ColorBold   "${Esc}[1m")
+  set(Red         "${Esc}[31m")
+  set(Green       "${Esc}[32m")
+  set(Yellow      "${Esc}[33m")
+  set(Blue        "${Esc}[34m")
+  set(Magenta     "${Esc}[35m")
+  set(Cyan        "${Esc}[36m")
+  set(White       "${Esc}[37m")
+  set(BoldRed     "${Esc}[1;31m")
+  set(BoldGreen   "${Esc}[1;32m")
+  set(BoldYellow  "${Esc}[1;33m")
+  set(BoldBlue    "${Esc}[1;34m")
+  set(BoldMagenta "${Esc}[1;35m")
+  set(BoldCyan    "${Esc}[1;36m")
+  set(BoldWhite   "${Esc}[1;37m")
+endif()
diff --git a/packages/rol/cmake/Dependencies.cmake b/packages/rol/cmake/Dependencies.cmake
index 569496dd7a2d..31d74397c021 100644
--- a/packages/rol/cmake/Dependencies.cmake
+++ b/packages/rol/cmake/Dependencies.cmake
@@ -4,7 +4,7 @@
 #)      
 
 SET(LIB_REQUIRED_DEP_PACKAGES Teuchos)
-SET(LIB_OPTIONAL_DEP_PACKAGES Belos Epetra Tpetra Thyra Sacado Intrepid MiniTensor Shards Amesos Amesos2 Ifpack2 MueLu TriKota Tempus)
+SET(LIB_OPTIONAL_DEP_PACKAGES Belos Epetra Tpetra Thyra Sacado Intrepid Intrepid2 MiniTensor Shards Amesos Amesos2 Ifpack2 MueLu TriKota Tempus)
 SET(TEST_REQUIRED_DEP_PACKAGES)
 SET(TEST_OPTIONAL_DEP_PACKAGES Gtest)
 SET(LIB_REQUIRED_DEP_TPLS)
diff --git a/packages/rol/cmake/FindTPLEigen.cmake b/packages/rol/cmake/FindTPLEigen.cmake
deleted file mode 100644
index 6ede5ada7e00..000000000000
--- a/packages/rol/cmake/FindTPLEigen.cmake
+++ /dev/null
@@ -1,4 +0,0 @@
-
-TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( Eigen
-  REQUIRED_HEADERS Eigen/Dense
-  )
diff --git a/packages/rol/cmake/ROLUtils.cmake b/packages/rol/cmake/ROLUtils.cmake
new file mode 100644
index 000000000000..2d0b9ce14f9c
--- /dev/null
+++ b/packages/rol/cmake/ROLUtils.cmake
@@ -0,0 +1,137 @@
+# ROL CMake Utilities
+# Provides compatibility between TriBITS and standalone CMake builds
+
+# Function to add executable and test, compatible with both TriBITS and standalone CMake
+function(ROL_ADD_EXECUTABLE_AND_TEST TARGET_NAME)
+  # Parse arguments
+  set(options ADD_DIR_TO_NAME)
+  set(oneValueArgs NUM_MPI_PROCS PASS_REGULAR_EXPRESSION COMM)
+  set(multiValueArgs SOURCES ARGS)
+  
+  cmake_parse_arguments(ROLET "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  
+  # Check if we're in TriBITS environment
+  if(COMMAND TRIBITS_ADD_EXECUTABLE_AND_TEST)
+    # Forward to TriBITS function with all original arguments
+    tribits_add_executable_and_test(${TARGET_NAME} ${ARGN})
+  else()
+    # Standalone CMake implementation
+    
+    # Handle ADD_DIR_TO_NAME by prepending current directory name
+    set(ACTUAL_TARGET_NAME ${TARGET_NAME})
+    if(ROLET_ADD_DIR_TO_NAME)
+      get_filename_component(DIR_NAME ${CMAKE_CURRENT_SOURCE_DIR} NAME)
+      set(ACTUAL_TARGET_NAME "${DIR_NAME}_${TARGET_NAME}")
+    endif()
+    
+    # Create the executable
+    add_executable(${ACTUAL_TARGET_NAME} ${ROLET_SOURCES})
+    
+    # Link against ROL if available
+    if(TARGET rol)
+      target_link_libraries(${ACTUAL_TARGET_NAME} rol)
+    endif()
+    
+    # Add the test
+    add_test(NAME ${ACTUAL_TARGET_NAME} COMMAND ${ACTUAL_TARGET_NAME} ${ROLET_ARGS})
+    
+    # Set test properties
+    if(ROLET_PASS_REGULAR_EXPRESSION)
+      set_tests_properties(${ACTUAL_TARGET_NAME} PROPERTIES 
+        PASS_REGULAR_EXPRESSION "${ROLET_PASS_REGULAR_EXPRESSION}")
+    endif()
+    
+    # Handle MPI if specified (basic implementation)
+    if(ROLET_NUM_MPI_PROCS AND ROLET_NUM_MPI_PROCS GREATER 1)
+      find_package(MPI QUIET)
+      if(MPI_FOUND)
+        set_tests_properties(${ACTUAL_TARGET_NAME} PROPERTIES 
+          PROCESSORS ${ROLET_NUM_MPI_PROCS})
+      endif()
+    endif()
+  endif()
+endfunction()
+
+# Function to add executable only (no test)
+function(ROL_ADD_EXECUTABLE TARGET_NAME)
+  # Parse arguments
+  set(options ADD_DIR_TO_NAME)
+  set(oneValueArgs)
+  set(multiValueArgs SOURCES)
+  
+  cmake_parse_arguments(ROLE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  
+  # Check if we're in TriBITS environment
+  if(COMMAND TRIBITS_ADD_EXECUTABLE)
+    # Forward to TriBITS function with all original arguments
+    tribits_add_executable(${TARGET_NAME} ${ARGN})
+  else()
+    # Standalone CMake implementation
+    
+    # Handle ADD_DIR_TO_NAME by prepending current directory name
+    set(ACTUAL_TARGET_NAME ${TARGET_NAME})
+    if(ROLE_ADD_DIR_TO_NAME)
+      get_filename_component(DIR_NAME ${CMAKE_CURRENT_SOURCE_DIR} NAME)
+      set(ACTUAL_TARGET_NAME "${DIR_NAME}_${TARGET_NAME}")
+    endif()
+    
+    add_executable(${ACTUAL_TARGET_NAME} ${ROLE_SOURCES})
+    
+    # Link against ROL if available
+    if(TARGET rol)
+      target_link_libraries(${ACTUAL_TARGET_NAME} rol)
+    endif()
+  endif()
+endfunction()
+
+# Function to copy files to binary directory
+function(ROL_COPY_FILES_TO_BINARY_DIR TARGET_NAME)
+  # Parse arguments
+  set(options)
+  set(oneValueArgs SOURCE_DIR DEST_DIR)
+  set(multiValueArgs SOURCE_FILES)
+  
+  cmake_parse_arguments(ROLCOPY "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  
+  # Check if we're in TriBITS environment
+  if(COMMAND TRIBITS_COPY_FILES_TO_BINARY_DIR)
+    # Forward to TriBITS function with all original arguments
+    tribits_copy_files_to_binary_dir(${TARGET_NAME} ${ARGN})
+  else()
+    # Standalone CMake implementation
+    if(NOT ROLCOPY_SOURCE_DIR)
+      set(ROLCOPY_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+    endif()
+    
+    if(NOT ROLCOPY_DEST_DIR)
+      set(ROLCOPY_DEST_DIR ${CMAKE_CURRENT_BINARY_DIR})
+    endif()
+    
+    foreach(FILE ${ROLCOPY_SOURCE_FILES})
+      configure_file(
+        ${ROLCOPY_SOURCE_DIR}/${FILE}
+        ${ROLCOPY_DEST_DIR}/${FILE}
+        COPYONLY
+      )
+    endforeach()
+  endif()
+endfunction()
+
+# Function to include directories, compatible with both TriBITS and standalone CMake
+function(ROL_INCLUDE_DIRECTORIES)
+  # Check if we're in TriBITS environment
+  if(COMMAND TRIBITS_INCLUDE_DIRECTORIES)
+    # Forward to TriBITS function with all arguments
+    tribits_include_directories(${ARGN})
+  else()
+    # Standalone CMake implementation
+    cmake_parse_arguments(
+      INC
+      "REQUIRED_DURING_INSTALLATION_TESTING"
+      ""
+      ""
+      ${ARGN}
+    )
+    include_directories(${INC_UNPARSED_ARGUMENTS})
+  endif()
+endfunction()
\ No newline at end of file
diff --git a/packages/rol/cmake/ROL_config.h.in b/packages/rol/cmake/ROL_config.h.in
index e00923fea354..81ea8aed9046 100644
--- a/packages/rol/cmake/ROL_config.h.in
+++ b/packages/rol/cmake/ROL_config.h.in
@@ -14,3 +14,8 @@
 
 /* Define support for automated ParameterList validation. */
 #cmakedefine ENABLE_PARAMETERLIST_VALIDATION
+
+/* Define the Fortran name mangling to be used for BLAS/LAPACK */
+#ifndef F77_BLAS_MANGLE
+ #define F77_BLAS_MANGLE@F77_BLAS_MANGLE@
+#endif
diff --git a/packages/rol/cmake/TrilinosROL.cmake b/packages/rol/cmake/TrilinosROL.cmake
new file mode 100644
index 000000000000..d61201ea2064
--- /dev/null
+++ b/packages/rol/cmake/TrilinosROL.cmake
@@ -0,0 +1,108 @@
+INCLUDE(TribitsPackageMacros)
+INCLUDE(TribitsAddOptionAndDefine)
+
+TRIBITS_PACKAGE(ROL)
+
+TRIBITS_ADD_OPTION_AND_DEFINE(${PACKAGE_NAME}_ENABLE_DEBUG
+  HAVE_ROL_DEBUG
+  "Enable a host of runtime debug checking."
+  ${${PROJECT_NAME}_ENABLE_DEBUG}
+  )
+
+TRIBITS_ADD_OPTION_AND_DEFINE(${PACKAGE_NAME}_ENABLE_TIMERS
+  ROL_TIMERS
+  "Build ROL with Teuchos TimeMonitors enabled."
+  OFF
+  )
+
+TRIBITS_ADD_OPTION_AND_DEFINE(${PACKAGE_NAME}_ENABLE_PYROL
+  ENABLE_PYBIND11_PYROL
+  "Build ROL with PyROL interface."
+  OFF
+  )
+
+TRIBITS_ADD_OPTION_AND_DEFINE(${PACKAGE_NAME}_ENABLE_PARAMETERLIST_VALIDATION
+  ENABLE_PARAMETERLIST_VALIDATION
+  "Build ROL with ParameterList validation."
+  OFF
+  )
+
+# Build Options
+SET( CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
+INCLUDE(ROLUtils)
+INCLUDE(BuildOptions)
+GET_PROPERTY( STACKTRACE_STRING GLOBAL PROPERTY STACKTRACE_IMPL )
+#IF( ${STACKTRACE_STRING} STREQUAL "backward-cpp" )
+#INCLUDE(BackwardConfig)
+#ENDIF()
+
+
+include(ROLParameters)
+
+
+#
+# C) Add the libraries, tests, and examples
+#
+
+ADD_SUBDIRECTORY(src)
+
+IF( ROL_ENABLE_Sacado ) 
+ADD_SUBDIRECTORY(adapters/sacado)
+ENDIF()
+
+IF (ROL_ENABLE_ArrayFireCPU)
+  ADD_SUBDIRECTORY(adapters/arrayfire)
+ENDIF()
+
+IF( ROL_ENABLE_PYROL ) 
+  ADD_SUBDIRECTORY(pyrol)
+ENDIF()
+
+IF( ${USING_TEUCHOS_ALL} )
+
+ADD_SUBDIRECTORY(adapters/teuchos)
+
+IF( TPL_ENABLE_MPI ) 
+ADD_SUBDIRECTORY(adapters/mpi)
+ENDIF()
+
+IF( ROL_ENABLE_Epetra )
+ADD_SUBDIRECTORY(adapters/epetra)
+ENDIF()
+
+IF( ROL_ENABLE_Tpetra )
+ADD_SUBDIRECTORY(adapters/tpetra)
+ENDIF()
+
+IF( ROL_ENABLE_Belos )
+ADD_SUBDIRECTORY(adapters/belos)
+ENDIF()
+
+IF( ROL_ENABLE_Thyra )
+ADD_SUBDIRECTORY(adapters/thyra)
+ENDIF()
+
+IF( ROL_ENABLE_TriKota )
+ADD_SUBDIRECTORY(adapters/trikota)
+ENDIF()
+
+IF( ROL_ENABLE_pebbl )
+ADD_SUBDIRECTORY(adapters/pebbl)
+ENDIF()
+
+IF (ROL_ENABLE_Boost AND ROL_ENABLE_MiniTensor)
+  ADD_SUBDIRECTORY(adapters/minitensor)
+ENDIF()
+
+ENDIF( ${USING_TEUCHOS_ALL} )
+
+IF (ROL_ENABLE_Eigen)
+  ADD_SUBDIRECTORY(adapters/eigen)
+ENDIF()
+
+TRIBITS_ADD_TEST_DIRECTORIES(test)
+TRIBITS_ADD_EXAMPLE_DIRECTORIES(example)
+TRIBITS_ADD_EXAMPLE_DIRECTORIES(tutorial)
+
+TRIBITS_PACKAGE_POSTPROCESS()
+
diff --git a/packages/rol/example/CMakeLists.txt b/packages/rol/example/CMakeLists.txt
index 5c58be7b2fb9..f301538a0db9 100644
--- a/packages/rol/example/CMakeLists.txt
+++ b/packages/rol/example/CMakeLists.txt
@@ -18,12 +18,13 @@ ADD_SUBDIRECTORY(complex)
 ADD_SUBDIRECTORY(binary-design)
 ADD_SUBDIRECTORY(paraboloid-circle)
 ADD_SUBDIRECTORY(putting)
-ADD_SUBDIRECTORY(PinT)
 ADD_SUBDIRECTORY(dense-hessian)
 ADD_SUBDIRECTORY(stream-buffer)
 ADD_SUBDIRECTORY(oed)
 ADD_SUBDIRECTORY(lincon-test)
 
+IF(NOT STANDALONE_ROL)
+
 IF(${PACKAGE_NAME}_ENABLE_Sacado)
 ADD_SUBDIRECTORY(sacado)
 ENDIF()
@@ -39,6 +40,7 @@ IF( ${USING_TEUCHOS_ALL} )
 
 MESSAGE("-- Enabling ROL examples with required dependency on Teuchos::RCP or Teuchos::ParameterList")
 
+ADD_SUBDIRECTORY(PinT)
 ADD_SUBDIRECTORY(burgers-control)
 ADD_SUBDIRECTORY(parabolic-control)
 ADD_SUBDIRECTORY(poisson-control)
@@ -60,5 +62,4 @@ ENDIF( ${USING_TEUCHOS_ALL} )
 #ENDIF()
 #ENDIF()
 
-
-
+ENDIF()
diff --git a/packages/rol/example/PDE-OPT/0ld/TOOLS/PDE_FEM.hpp b/packages/rol/example/PDE-OPT/0ld/TOOLS/PDE_FEM.hpp
index faf6ba71eb46..4c2e30d8c5c6 100644
--- a/packages/rol/example/PDE-OPT/0ld/TOOLS/PDE_FEM.hpp
+++ b/packages/rol/example/PDE-OPT/0ld/TOOLS/PDE_FEM.hpp
@@ -15,7 +15,7 @@
 #ifndef ROL_PDEOPT_PDE_FEM_H
 #define ROL_PDEOPT_PDE_FEM_H
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Teuchos_TimeMonitor.hpp"
 
 #include "Tpetra_MultiVector.hpp"
diff --git a/packages/rol/example/PDE-OPT/0ld/adv-diff-react/CMakeLists.txt b/packages/rol/example/PDE-OPT/0ld/adv-diff-react/CMakeLists.txt
index 42ba1795c5a6..66ab4b945c67 100644
--- a/packages/rol/example/PDE-OPT/0ld/adv-diff-react/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/0ld/adv-diff-react/CMakeLists.txt
@@ -8,7 +8,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   # Need ROL_TpetraMultiVector.hpp
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
 
-  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  ROL_ADD_EXECUTABLE_AND_TEST(
     example_01
     SOURCES example_01.cpp
     ARGS PrintItAll
@@ -18,7 +18,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  ROL_ADD_EXECUTABLE_AND_TEST(
     example_02
     SOURCES example_02.cpp
     ARGS PrintItAll
@@ -28,7 +28,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     OldAdvDiffReactDataCopy
     SOURCE_FILES
       input.xml inputOED.xml plotresults.m tightfig.m
diff --git a/packages/rol/example/PDE-OPT/0ld/adv-diff-react/data.hpp b/packages/rol/example/PDE-OPT/0ld/adv-diff-react/data.hpp
index 1183313e304e..67cbc6af097a 100644
--- a/packages/rol/example/PDE-OPT/0ld/adv-diff-react/data.hpp
+++ b/packages/rol/example/PDE-OPT/0ld/adv-diff-react/data.hpp
@@ -15,7 +15,7 @@
 #ifndef ROL_PDEOPT_POISSON_DATA_H
 #define ROL_PDEOPT_POISSON_DATA_H
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Teuchos_TimeMonitor.hpp"
 
 #include "Tpetra_MultiVector.hpp"
diff --git a/packages/rol/example/PDE-OPT/0ld/adv-diff-react/example_01.cpp b/packages/rol/example/PDE-OPT/0ld/adv-diff-react/example_01.cpp
index 0ff12af516f3..beac41c1a6cf 100644
--- a/packages/rol/example/PDE-OPT/0ld/adv-diff-react/example_01.cpp
+++ b/packages/rol/example/PDE-OPT/0ld/adv-diff-react/example_01.cpp
@@ -14,8 +14,8 @@
 
 #include "Teuchos_Comm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
+#include "ROL_XMLReader.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -60,7 +60,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int> > comm = Tpetra::getDefaultComm();
   const int myRank = comm->getRank();
   if ((iprint > 0) && (myRank == 0)) {
diff --git a/packages/rol/example/PDE-OPT/0ld/adv-diff-react/example_02.cpp b/packages/rol/example/PDE-OPT/0ld/adv-diff-react/example_02.cpp
index 02b1aadb4583..a92ccd6ea10e 100644
--- a/packages/rol/example/PDE-OPT/0ld/adv-diff-react/example_02.cpp
+++ b/packages/rol/example/PDE-OPT/0ld/adv-diff-react/example_02.cpp
@@ -15,8 +15,8 @@
 
 #include "Teuchos_Comm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
+#include "ROL_XMLReader.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -97,7 +97,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int> > comm = Tpetra::getDefaultComm();
   const int myRank = comm->getRank();
   if ((iprint > 0) && (myRank == 0)) {
diff --git a/packages/rol/example/PDE-OPT/0ld/elasticity/CMakeLists.txt b/packages/rol/example/PDE-OPT/0ld/elasticity/CMakeLists.txt
index 83dc4b71aabc..44f173c7a1e3 100644
--- a/packages/rol/example/PDE-OPT/0ld/elasticity/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/0ld/elasticity/CMakeLists.txt
@@ -8,7 +8,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   # Need ROL_TpetraMultiVector.hpp
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
 
-  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  ROL_ADD_EXECUTABLE_AND_TEST(
     example_01
     SOURCES example_01.cpp
     ARGS PrintItAll
@@ -18,7 +18,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     ADD_DIR_TO_NAME
   )
   
-  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  ROL_ADD_EXECUTABLE_AND_TEST(
     example_02
     SOURCES example_02.cpp
     ARGS PrintItAll
@@ -28,7 +28,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     OldElasticityDataCopy
     SOURCE_FILES
       input.xml
diff --git a/packages/rol/example/PDE-OPT/0ld/elasticity/example_01.cpp b/packages/rol/example/PDE-OPT/0ld/elasticity/example_01.cpp
index 091becab7449..c58e87d5bca3 100644
--- a/packages/rol/example/PDE-OPT/0ld/elasticity/example_01.cpp
+++ b/packages/rol/example/PDE-OPT/0ld/elasticity/example_01.cpp
@@ -13,8 +13,8 @@
 
 #include "Teuchos_Comm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
+#include "ROL_XMLReader.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -40,7 +40,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int> > comm = Tpetra::getDefaultComm();
   const int myRank = comm->getRank();
   if ((iprint > 0) && (myRank == 0)) {
diff --git a/packages/rol/example/PDE-OPT/0ld/elasticity/example_02.cpp b/packages/rol/example/PDE-OPT/0ld/elasticity/example_02.cpp
index a7c1a762e689..c9ecc5522e12 100644
--- a/packages/rol/example/PDE-OPT/0ld/elasticity/example_02.cpp
+++ b/packages/rol/example/PDE-OPT/0ld/elasticity/example_02.cpp
@@ -13,8 +13,8 @@
 
 #include "Teuchos_Comm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
+#include "ROL_XMLReader.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -40,7 +40,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int> > comm = Tpetra::getDefaultComm();
   const int myRank = comm->getRank();
   if ((iprint > 0) && (myRank == 0)) {
diff --git a/packages/rol/example/PDE-OPT/0ld/elasticitySIMP_topologyOptimization/CMakeLists.txt b/packages/rol/example/PDE-OPT/0ld/elasticitySIMP_topologyOptimization/CMakeLists.txt
index 04a36a8fa260..d4aa3d29a651 100644
--- a/packages/rol/example/PDE-OPT/0ld/elasticitySIMP_topologyOptimization/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/0ld/elasticitySIMP_topologyOptimization/CMakeLists.txt
@@ -10,13 +10,13 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_01
     SOURCES example_01.cpp
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_02
     SOURCES example_02.cpp
     ADD_DIR_TO_NAME
@@ -32,7 +32,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     CATEGORIES HEAVY  # Currently takes 18m to run (see #442)!
   )
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     OldElasticitySIMPOperatorsCopy
     SOURCE_FILES
       input.xml stochastic.xml plotdensity.m plotstate.m plotnodaldensity.m
diff --git a/packages/rol/example/PDE-OPT/0ld/elasticitySIMP_topologyOptimization/build_sampler.hpp b/packages/rol/example/PDE-OPT/0ld/elasticitySIMP_topologyOptimization/build_sampler.hpp
index fa6b139d3fbf..7e54175e6eaa 100644
--- a/packages/rol/example/PDE-OPT/0ld/elasticitySIMP_topologyOptimization/build_sampler.hpp
+++ b/packages/rol/example/PDE-OPT/0ld/elasticitySIMP_topologyOptimization/build_sampler.hpp
@@ -12,8 +12,8 @@
 
 #include "Teuchos_Comm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
+#include "ROL_XMLReader.hpp"
 
 // SOL Inputs
 #include "ROL_MonteCarloGenerator.hpp"
diff --git a/packages/rol/example/PDE-OPT/0ld/elasticitySIMP_topologyOptimization/example_01.cpp b/packages/rol/example/PDE-OPT/0ld/elasticitySIMP_topologyOptimization/example_01.cpp
index f765744eb46e..b631aff2dbf9 100644
--- a/packages/rol/example/PDE-OPT/0ld/elasticitySIMP_topologyOptimization/example_01.cpp
+++ b/packages/rol/example/PDE-OPT/0ld/elasticitySIMP_topologyOptimization/example_01.cpp
@@ -13,8 +13,8 @@
 
 #include "Teuchos_Comm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
+#include "ROL_XMLReader.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -59,7 +59,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int> > comm = Tpetra::getDefaultComm();
   const int myRank = comm->getRank();
   if ((iprint > 0) && (myRank == 0)) {
diff --git a/packages/rol/example/PDE-OPT/0ld/elasticitySIMP_topologyOptimization/example_02.cpp b/packages/rol/example/PDE-OPT/0ld/elasticitySIMP_topologyOptimization/example_02.cpp
index 8b3a91e89dee..0d33e07761bb 100644
--- a/packages/rol/example/PDE-OPT/0ld/elasticitySIMP_topologyOptimization/example_02.cpp
+++ b/packages/rol/example/PDE-OPT/0ld/elasticitySIMP_topologyOptimization/example_02.cpp
@@ -13,8 +13,8 @@
 
 #include "Teuchos_Comm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
+#include "ROL_XMLReader.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -58,7 +58,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int> > comm = Tpetra::getDefaultComm();
   const int myRank = comm->getRank();
   if ((iprint > 0) && (myRank == 0)) {
diff --git a/packages/rol/example/PDE-OPT/0ld/elasticitySIMP_topologyOptimization/filter.hpp b/packages/rol/example/PDE-OPT/0ld/elasticitySIMP_topologyOptimization/filter.hpp
index 0e4cd324d022..26550113cb9d 100644
--- a/packages/rol/example/PDE-OPT/0ld/elasticitySIMP_topologyOptimization/filter.hpp
+++ b/packages/rol/example/PDE-OPT/0ld/elasticitySIMP_topologyOptimization/filter.hpp
@@ -14,7 +14,7 @@
 #ifndef ROL_DENSITY_FILTER_H
 #define ROL_DENSITY_FILTER_H
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Teuchos_TimeMonitor.hpp"
 
 #include "Tpetra_MultiVector.hpp"
diff --git a/packages/rol/example/PDE-OPT/0ld/poisson/CMakeLists.txt b/packages/rol/example/PDE-OPT/0ld/poisson/CMakeLists.txt
index 6adef2a50283..7027090596d7 100644
--- a/packages/rol/example/PDE-OPT/0ld/poisson/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/0ld/poisson/CMakeLists.txt
@@ -8,7 +8,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   # Need ROL_TpetraMultiVector.hpp
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
 
-  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  ROL_ADD_EXECUTABLE_AND_TEST(
     example_01
     SOURCES example_01.cpp
     ARGS PrintItAll
@@ -18,7 +18,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     OldPoissonDataCopy
     SOURCE_FILES
       input.xml plotresults.m
diff --git a/packages/rol/example/PDE-OPT/0ld/poisson/data.hpp b/packages/rol/example/PDE-OPT/0ld/poisson/data.hpp
index 9b1b46c98e80..f125f4383864 100644
--- a/packages/rol/example/PDE-OPT/0ld/poisson/data.hpp
+++ b/packages/rol/example/PDE-OPT/0ld/poisson/data.hpp
@@ -15,7 +15,7 @@
 #ifndef ROL_PDEOPT_POISSON_DATA_H
 #define ROL_PDEOPT_POISSON_DATA_H
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Teuchos_TimeMonitor.hpp"
 
 #include "Tpetra_MultiVector.hpp"
diff --git a/packages/rol/example/PDE-OPT/0ld/poisson/example_01.cpp b/packages/rol/example/PDE-OPT/0ld/poisson/example_01.cpp
index a89cd7f1520b..67e712541eb5 100644
--- a/packages/rol/example/PDE-OPT/0ld/poisson/example_01.cpp
+++ b/packages/rol/example/PDE-OPT/0ld/poisson/example_01.cpp
@@ -13,8 +13,8 @@
 
 #include "Teuchos_Comm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
+#include "ROL_XMLReader.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -45,7 +45,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int> > comm = Tpetra::getDefaultComm();
   const int myRank = comm->getRank();
   if ((iprint > 0) && (myRank == 0)) {
diff --git a/packages/rol/example/PDE-OPT/0ld/stefan-boltzmann/CMakeLists.txt b/packages/rol/example/PDE-OPT/0ld/stefan-boltzmann/CMakeLists.txt
index 531bd1fba2b9..8a7abd4710ee 100644
--- a/packages/rol/example/PDE-OPT/0ld/stefan-boltzmann/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/0ld/stefan-boltzmann/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-IF(${PROJECT_NAME}_ENABLE_Intrepid AND 
+IF(${PROJECT_NAME}_ENABLE_Intrepid AND
    ${PROJECT_NAME}_ENABLE_Epetra   AND
    ${PROJECT_NAME}_ENABLE_Amesos   AND
    ${PROJECT_NAME}_ENABLE_Amesos2  AND
@@ -11,29 +11,27 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
 
-#  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+#  ROL_ADD_EXECUTABLE_AND_TEST(
 #    example_01
 #    SOURCES example_01.cpp
 #    ARGS PrintItAll
 #    NUM_MPI_PROCS 4
 #    NUM_TOTAL_CORES_USED 4
-#    CATEGORIES CONTINUOUS
 #    PASS_REGULAR_EXPRESSION "TEST PASSED"
 #    ADD_DIR_TO_NAME
 #  )
 
-#  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+#  ROL_ADD_EXECUTABLE_AND_TEST(
 #    example_02
 #    SOURCES example_02.cpp
 #    ARGS PrintItAll
 #    NUM_MPI_PROCS 4
 #    NUM_TOTAL_CORES_USED 4
-#    CATEGORIES CONTINUOUS
 #    PASS_REGULAR_EXPRESSION "TEST PASSED"
 #    ADD_DIR_TO_NAME
 #  )
 
-  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  ROL_ADD_EXECUTABLE_AND_TEST(
     example_03
     SOURCES example_03.cpp
     ARGS PrintItAll
@@ -43,7 +41,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     OldStefanBoltzmannDataCopy
     SOURCE_FILES
       input.xml
diff --git a/packages/rol/example/PDE-OPT/0ld/stefan-boltzmann/data.hpp b/packages/rol/example/PDE-OPT/0ld/stefan-boltzmann/data.hpp
index 855354c58b40..8b726407ffa6 100644
--- a/packages/rol/example/PDE-OPT/0ld/stefan-boltzmann/data.hpp
+++ b/packages/rol/example/PDE-OPT/0ld/stefan-boltzmann/data.hpp
@@ -15,7 +15,7 @@
 #ifndef ROL_PDEOPT_STEFANBOLTZMANN_DATA_H
 #define ROL_PDEOPT_STEFANBOLTZMANN_DATA_H
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Teuchos_TimeMonitor.hpp"
 
 #include "Tpetra_MultiVector.hpp"
diff --git a/packages/rol/example/PDE-OPT/0ld/stefan-boltzmann/example_01.cpp b/packages/rol/example/PDE-OPT/0ld/stefan-boltzmann/example_01.cpp
index e0b3b898ab28..81ff80d7932a 100644
--- a/packages/rol/example/PDE-OPT/0ld/stefan-boltzmann/example_01.cpp
+++ b/packages/rol/example/PDE-OPT/0ld/stefan-boltzmann/example_01.cpp
@@ -13,8 +13,8 @@
 
 #include "Teuchos_Comm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
+#include "ROL_XMLReader.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -44,7 +44,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int> > comm = Tpetra::getDefaultComm();
   const int myRank = comm->getRank();
   if ((iprint > 0) && (myRank == 0)) {
diff --git a/packages/rol/example/PDE-OPT/0ld/stefan-boltzmann/example_02.cpp b/packages/rol/example/PDE-OPT/0ld/stefan-boltzmann/example_02.cpp
index bf60ceb765cb..7d7b9d58ad54 100644
--- a/packages/rol/example/PDE-OPT/0ld/stefan-boltzmann/example_02.cpp
+++ b/packages/rol/example/PDE-OPT/0ld/stefan-boltzmann/example_02.cpp
@@ -13,8 +13,8 @@
 
 #include "Teuchos_Comm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
+#include "ROL_XMLReader.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -45,7 +45,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int> > comm = Tpetra::getDefaultComm();
   const int myRank = comm->getRank();
   if ((iprint > 0) && (myRank == 0)) {
diff --git a/packages/rol/example/PDE-OPT/0ld/stefan-boltzmann/example_03.cpp b/packages/rol/example/PDE-OPT/0ld/stefan-boltzmann/example_03.cpp
index 276b79262260..b701606ed660 100644
--- a/packages/rol/example/PDE-OPT/0ld/stefan-boltzmann/example_03.cpp
+++ b/packages/rol/example/PDE-OPT/0ld/stefan-boltzmann/example_03.cpp
@@ -13,8 +13,8 @@
 
 #include "Teuchos_Comm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
+#include "ROL_XMLReader.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -44,7 +44,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int> > comm
     = Tpetra::getDefaultComm();
   const int myRank = comm->getRank();
diff --git a/packages/rol/example/PDE-OPT/0ld/stoch-adv-diff/CMakeLists.txt b/packages/rol/example/PDE-OPT/0ld/stoch-adv-diff/CMakeLists.txt
index 68bf150d219e..36443e7a4237 100644
--- a/packages/rol/example/PDE-OPT/0ld/stoch-adv-diff/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/0ld/stoch-adv-diff/CMakeLists.txt
@@ -14,7 +14,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   # Need ROL_TpetraTeuchosBatchManager.hpp
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
 
-  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  ROL_ADD_EXECUTABLE_AND_TEST(
     example_01
     SOURCES example_01.cpp
     ARGS PrintItAll
@@ -24,7 +24,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     OldStochAdvDiffDataCopy
     SOURCE_FILES
       input.xml plotresults.m
diff --git a/packages/rol/example/PDE-OPT/0ld/stoch-adv-diff/data.hpp b/packages/rol/example/PDE-OPT/0ld/stoch-adv-diff/data.hpp
index 2229f05ac52d..798a89408a12 100644
--- a/packages/rol/example/PDE-OPT/0ld/stoch-adv-diff/data.hpp
+++ b/packages/rol/example/PDE-OPT/0ld/stoch-adv-diff/data.hpp
@@ -15,7 +15,7 @@
 #ifndef ROL_PDEOPT_STOCH_ADV_DIFF_DATA_H
 #define ROL_PDEOPT_STOCH_ADV_DIFF_DATA_H
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Teuchos_TimeMonitor.hpp"
 
 #include "Tpetra_MultiVector.hpp"
diff --git a/packages/rol/example/PDE-OPT/0ld/stoch-adv-diff/example_01.cpp b/packages/rol/example/PDE-OPT/0ld/stoch-adv-diff/example_01.cpp
index 43cd8483c9cf..99b53e37dcd3 100644
--- a/packages/rol/example/PDE-OPT/0ld/stoch-adv-diff/example_01.cpp
+++ b/packages/rol/example/PDE-OPT/0ld/stoch-adv-diff/example_01.cpp
@@ -14,8 +14,8 @@
 
 #include "Teuchos_Comm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
+#include "ROL_XMLReader.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -65,7 +65,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int> > comm
     = Tpetra::getDefaultComm();
   ROL::Ptr<const Teuchos::Comm<int> > serial_comm
diff --git a/packages/rol/example/PDE-OPT/CMakeLists.txt b/packages/rol/example/PDE-OPT/CMakeLists.txt
index 5a97166300d1..60b9c774c599 100644
--- a/packages/rol/example/PDE-OPT/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/CMakeLists.txt
@@ -1,18 +1,31 @@
 SET(SOURCES "")
+SET(SOURCES_KOKKOS "")
 SET(HEADERS "")
 SET(NOINSTALLHEADERS "")
+SET(NOINSTALLHEADERS_KOKKOS "")
 
 APPEND_SET(SOURCES
   TOOLS/solver.cpp
   TOOLS/assembler.cpp
 )
 
+APPEND_SET(SOURCES_KOKKOS
+  TOOLS/solver.cpp
+  TOOLS/assemblerK.cpp
+)
+
 APPEND_SET(NOINSTALLHEADERS
   TOOLS/solver.hpp
   TOOLS/solver_def.hpp
   TOOLS/assembler.hpp
 )
 
+APPEND_SET(NOINSTALLHEADERS_KOKKOS
+  TOOLS/solver.hpp
+  TOOLS/solver_def.hpp
+  TOOLS/assemblerK.hpp
+)
+
 IF(${PROJECT_NAME}_ENABLE_Intrepid AND
    ${PROJECT_NAME}_ENABLE_Ifpack2  AND
    ${PROJECT_NAME}_ENABLE_MueLu    AND
@@ -28,6 +41,21 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
 
 ENDIF()
 
+IF(${PROJECT_NAME}_ENABLE_Intrepid2 AND
+   ${PROJECT_NAME}_ENABLE_Ifpack2  AND
+   ${PROJECT_NAME}_ENABLE_MueLu    AND
+   ${PROJECT_NAME}_ENABLE_Amesos2  AND
+   ${PROJECT_NAME}_ENABLE_Tpetra )
+
+    TRIBITS_ADD_LIBRARY(
+      rol-pdeopt-kokkos
+      HEADERS ${HEADERS}
+      NOINSTALLHEADERS ${NOINSTALLHEADERS_KOKKOS}
+      SOURCES ${SOURCES_KOKKOS}
+    )
+
+ENDIF()
+
 ADD_SUBDIRECTORY(TEST)
 #ADD_SUBDIRECTORY(0ld)
 ADD_SUBDIRECTORY(poisson)
diff --git a/packages/rol/example/PDE-OPT/OED/example/helmholtz/CMakeLists.txt b/packages/rol/example/PDE-OPT/OED/example/helmholtz/CMakeLists.txt
index 1c7c086e48a8..9837262fe9ab 100644
--- a/packages/rol/example/PDE-OPT/OED/example/helmholtz/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/OED/example/helmholtz/CMakeLists.txt
@@ -1,7 +1,7 @@
 ADD_SUBDIRECTORY(singleFrequency)
 ADD_SUBDIRECTORY(multipleFrequency)
 
-TRIBITS_COPY_FILES_TO_BINARY_DIR(
+ROL_COPY_FILES_TO_BINARY_DIR(
   OEDHelmholtzOCTRealDataCopy
   SOURCE_FILES
     meshfiles/octagon.jou
diff --git a/packages/rol/example/PDE-OPT/OED/example/helmholtz/multipleFrequency/CMakeLists.txt b/packages/rol/example/PDE-OPT/OED/example/helmholtz/multipleFrequency/CMakeLists.txt
index cf89f3daa6e9..a32d4d07ed11 100644
--- a/packages/rol/example/PDE-OPT/OED/example/helmholtz/multipleFrequency/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/OED/example/helmholtz/multipleFrequency/CMakeLists.txt
@@ -11,13 +11,13 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/teuchos/src/sol)
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_01
     SOURCES example_01.cpp
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     OEDHelmholtzOCTRealMultiFreqDataCopy
     SOURCE_FILES
       input.xml plotresults.m
@@ -25,5 +25,32 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
   )
 
+ENDIF()
+
+IF(${PROJECT_NAME}_ENABLE_Intrepid2 AND 
+   ${PROJECT_NAME}_ENABLE_Amesos2   AND
+   ${PROJECT_NAME}_ENABLE_Ifpack2   AND
+   ${PROJECT_NAME}_ENABLE_MueLu     AND
+   ${PROJECT_NAME}_ENABLE_Teuchos   AND
+   ${PROJECT_NAME}_ENABLE_Tpetra )
+
+  # Need ROL_TpetraMultiVector.hpp
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/teuchos/src/sol)
+
+  ROL_ADD_EXECUTABLE(
+    example_01_Kokkos
+    SOURCES example_01K.cpp
+    ADD_DIR_TO_NAME
+  )
+
+  ROL_COPY_FILES_TO_BINARY_DIR(
+    OEDHelmholtzOCTRealMultiFreqDataCopyK
+    SOURCE_FILES
+      input.xml plotresults.m
+    SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}"
+    DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
+  )
 
 ENDIF()
diff --git a/packages/rol/example/PDE-OPT/OED/example/helmholtz/multipleFrequency/example_01.cpp b/packages/rol/example/PDE-OPT/OED/example/helmholtz/multipleFrequency/example_01.cpp
index 0d914bb27544..70877e07ab99 100644
--- a/packages/rol/example/PDE-OPT/OED/example/helmholtz/multipleFrequency/example_01.cpp
+++ b/packages/rol/example/PDE-OPT/OED/example/helmholtz/multipleFrequency/example_01.cpp
@@ -13,8 +13,7 @@
 
 #include "Teuchos_Comm.hpp"
 #include "Teuchos_Time.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -107,7 +106,7 @@ int main(int argc, char *argv[]) {
   using RealT = double;
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
   ROL::Ptr<const Teuchos::Comm<int>> dcomm, ocomm;
 #ifdef HAVE_MPI
   OED_SplitComm<int,Teuchos::MpiComm<int>> splitcomm(1);
diff --git a/packages/rol/example/PDE-OPT/OED/example/helmholtz/multipleFrequency/example_01K.cpp b/packages/rol/example/PDE-OPT/OED/example/helmholtz/multipleFrequency/example_01K.cpp
new file mode 100644
index 000000000000..d5efce4d0e49
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/OED/example/helmholtz/multipleFrequency/example_01K.cpp
@@ -0,0 +1,256 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  example_01.cpp
+    \brief Shows how to solve the optimal control of Helmholtz problem.
+*/
+
+#include "Teuchos_Comm.hpp"
+#include "Teuchos_Time.hpp"
+#include "ROL_GlobalMPISession.hpp"
+
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Version.hpp"
+
+#include <iostream>
+#include <algorithm>
+
+#include "ROL_Stream.hpp"
+#include "ROL_Solver.hpp"
+#include "ROL_UserInputGenerator.hpp"
+#include "ROL_ScalarLinearConstraint.hpp"
+#include "ROL_Bounds.hpp"
+#include "ROL_StdTeuchosBatchManager.hpp"
+#include "ROL_Reduced_Objective_SimOpt.hpp"
+#include "ROL_PrimalDualRisk.hpp"
+
+#include "../../../../TOOLS/linearpdeconstraintK.hpp"
+#include "../../../../TOOLS/pdeobjectiveK.hpp"
+#include "../../../../TOOLS/pdevectorK.hpp"
+#include "../../../../TOOLS/meshreaderK.hpp"
+
+#include "ROL_OED_Factory.hpp"
+#include "ROL_OED_StdMomentOperator.hpp"
+#include "../../../utilities/OED_SplitComm.hpp"
+
+#include "../src/pde_helmholtzK.hpp"
+#include "../src/obj_helmholtzK.hpp"
+#include "../src/noise.hpp"
+#include "model.hpp"
+#include "predfun.hpp"
+
+template<typename Real>
+void solve(const ROL::Ptr<ROL::OED::Factory<Real>>      &factory,
+           const ROL::Ptr<ROL::Objective<Real>>         &predfun,
+           const ROL::Ptr<ROL::SampleGenerator<Real>>   &dsampler,
+           const ROL::Ptr<ROL::SampleGenerator<Real>>   &osampler,
+           ROL::ParameterList                           &list,
+           const std::string                            &dtype,
+           std::ostream                                 &stream     = std::cout,
+           const bool                                    useUIG     = true,
+           const bool                                    checkDeriv = false) {
+  int nsampd = dsampler->numGlobalSamples();
+  if (useUIG) {
+    factory->setDesign(1.0/static_cast<Real>(nsampd));
+  }
+  else {
+    std::stringstream dfile;
+    dfile << dtype << "_optimal_design.txt";
+    int err = factory->loadDesign(dfile.str(),2,nsampd);
+    stream << "  Factory::loadDesign exited with error " << err << std::endl;
+  }
+  stream << "  Solve " << dtype << "-optimal design problem." << std::endl;
+  std::clock_t timer = std::clock();
+  ROL::Ptr<ROL::Problem<Real>> problem = factory->get(list,osampler,predfun);
+  std::string type = list.sublist("OED").get("Optimality Type","I");
+  bool usePD = list.sublist("OED").sublist("R-Optimality").get("Use Primal-Dual Algorithm",false);
+  if (type=="R" && usePD) {
+    ROL::PrimalDualRisk<Real> solver(problem,osampler,list);
+    if (checkDeriv) {
+      factory->check(stream);
+      //solver.check(stream);
+      factory->reset();
+    }
+    solver.run(stream);
+  }
+  else {
+    ROL::Solver<Real> solver(problem,list);
+    if (checkDeriv) {
+      factory->check(stream);
+      problem->check(true,stream);
+      factory->reset();
+    }
+    solver.solve(stream);
+  }
+  stream << "  " << dtype << "-optimal design time:      "
+    << static_cast<Real>(std::clock()-timer)/static_cast<Real>(CLOCKS_PER_SEC)
+    << " seconds" << std::endl;
+  factory->profile(stream);
+  factory->reset();
+  std::stringstream dname, pname;
+  dname << dtype << "_optimal_design";
+  factory->printDesign(dname.str());
+  pname << dtype << "_optimal_prediction_variance_samples";
+  factory->printPredictionVariance(osampler,pname.str());
+}
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+
+int main(int argc, char *argv[]) {
+//  feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
+
+  /*** Initialize communicator. ***/
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
+  Kokkos::ScopeGuard kokkosScope (argc, argv);
+  ROL::Ptr<const Teuchos::Comm<int>> dcomm, ocomm;
+#ifdef HAVE_MPI
+  OED_SplitComm<int,Teuchos::MpiComm<int>> splitcomm(1);
+  dcomm = splitcomm.getDesignComm();
+  ocomm = splitcomm.getSampleComm();
+#else
+  dcomm = ROL::makePtr<Teuchos::SerialComm<int>>();
+  ocomm = Tpetra::getDefaultComm();
+#endif
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  const int drank = dcomm->getRank();
+  const int orank = ocomm->getRank();
+  bool iprint = (argc > 1) && (drank == 0) && (orank == 0);
+  iprint = ((argc > 2) ? true : iprint);
+  auto outStream = ROL::makeStreamPtr( std::cout, iprint );
+
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+    //RealT tol(1e-8);// one(1);
+
+    /*************************************************************************/
+    /******* BUILD LINEAR REGRESSION MODEL BASED ON HELMHOLTZ ****************/
+    /*************************************************************************/
+    /*** Read in XML input ***/
+    auto parlist = ROL::getParametersFromXmlFile("input.xml");
+    int   numSpeakers = 8;
+    bool   checkDeriv = parlist->sublist("Problem").get("Check Derivatives",  false);
+    int     verbosity = parlist->sublist("General").get("Print Verbosity",0);
+    bool      alPrint = parlist->sublist("Step").sublist("Augmented Lagrangian").get("Print Intermediate Optimization History",false);
+    verbosity         = (iprint ? verbosity : 0);
+    alPrint           = (iprint ? alPrint : false);
+    parlist->sublist("General").set("Print Verbosity",verbosity);
+    parlist->sublist("Step").sublist("Augmented Lagrangian").set("Print Intermediate Optimization History",alPrint);
+
+    // Initialize PDE describing Helmholtz equation.
+    ROL::Ptr<PDE_Helmholtz_OCT<RealT,DeviceT>> pde;
+    std::vector<ROL::Ptr<Linear_PDE_Constraint<RealT,DeviceT>>> cons(5);
+    auto meshMgr = ROL::makePtr<MeshReader<RealT,DeviceT>>(*parlist);
+    for (int i = 0; i < 5; ++i) {
+      parlist->sublist("Problem").set("Frequency",static_cast<RealT>((i+1)*100));
+      pde = ROL::makePtr<PDE_Helmholtz_OCT<RealT,DeviceT>>(*parlist);
+      cons[i] = ROL::makePtr<Linear_PDE_Constraint<RealT,DeviceT>>(pde,meshMgr,dcomm,*parlist,*outStream);
+    }
+    auto assembler = cons[0]->getAssembler();
+    cons[0]->printMeshData(*outStream);
+
+    // Create state vector.
+    auto u_ptr = assembler->createStateVector();
+    auto r_ptr = assembler->createResidualVector();
+    auto up = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(u_ptr,pde,assembler,*parlist);
+    auto rp = ROL::makePtr<PDE_DualSimVector<RealT,DeviceT>>(r_ptr,pde,assembler,*parlist);
+    auto zp = ROL::makePtr<ROL::StdVector<RealT>>(numSpeakers);
+    auto theta = ROL::makePtr<ROL::StdVector<RealT>>(numSpeakers,1);
+    auto ovec  = ROL::makePtr<ROL::StdVector<RealT>>(5,0);
+
+    // Create observation function
+    auto qoi = ROL::makePtr<QoI_Helmholtz_Observation<RealT,DeviceT>>(pde->getFE(),*parlist);
+    auto obs = ROL::makePtr<PDE_Objective<RealT,DeviceT>>(qoi,assembler);
+
+    // Build nonlinear model and parameter vector
+    std::vector<ROL::Ptr<ROL::Objective<RealT>>> objs(5);
+    for (int i = 0; i < 5; ++i)
+      objs[i] = ROL::makePtr<ROL::Reduced_Objective_SimOpt<RealT>>(obs,cons[i],up,theta,up,false);
+    auto model = ROL::makePtr<Model<RealT>>(objs);
+
+    // Build prediction function
+    auto vdata = ROL::makePtr<std::vector<RealT>>(5);
+    (*vdata)[0] = static_cast<RealT>(0);
+    (*vdata)[1] = static_cast<RealT>(1);
+    (*vdata)[2] = static_cast<RealT>(2);
+    (*vdata)[3] = static_cast<RealT>(3);
+    (*vdata)[4] = static_cast<RealT>(4);
+    auto vec = ROL::makePtr<ROL::StdVector<RealT>>(vdata);
+    auto predfun = ROL::makePtr<PredFun<RealT>>(model,vec);
+
+    // Run derivative checks
+    if ( checkDeriv ) {
+      auto dzp = zp->clone();   dzp->randomize();
+      auto rop = ovec->clone(); rop->randomize();   
+      auto rzp = zp->clone();   rzp->randomize();
+      std::vector<RealT> param(2,0);
+      model->setParameter(param);
+      *outStream << "\n\nCheck Jacobian of model\n";
+      model->checkApplyJacobian(*rzp,*dzp,*rop,true,*outStream);
+      *outStream << "\n\nCheck gradient of prediction function\n";
+      model->checkAdjointConsistencyJacobian(*rop,*rzp,*dzp,true,*outStream);
+      predfun->checkGradient(*rzp,*dzp,true,*outStream);
+    }
+
+    /*************************************************************************/
+    /******* BUILD EXPERIMENTAL DESIGN PROBLEM AND SOLVE *********************/
+    /*************************************************************************/
+    // Build samplers for experiment space
+    std::string pd = parlist->sublist("Problem").sublist("Design").get("Points File",          "points_5000.txt");
+    std::string wd = parlist->sublist("Problem").sublist("Design").get("Weights File",         "weights_5000.txt");
+    int nsampd     = parlist->sublist("Problem").sublist("Design").get("Number of Samples",    5000);
+    std::string po = parlist->sublist("Problem").sublist("Objective").get("Points File",       "points_200.txt");
+    std::string wo = parlist->sublist("Problem").sublist("Objective").get("Weights File",      "weights_200.txt");
+    int nsampo     = parlist->sublist("Problem").sublist("Objective").get("Number of Samples", 200);
+    *outStream << std::endl;
+    *outStream << "  Design Points File:     " << pd << std::endl;
+    *outStream << "  Design Weights File:    " << wd << std::endl;
+    *outStream << "  Objective Points File:  " << po << std::endl;
+    *outStream << "  Objective Weights File: " << wo << std::endl;
+    *outStream << std::endl;
+    auto dbman  = ROL::makePtr<ROL::StdTeuchosBatchManager<RealT,int>>(dcomm);
+    auto obman  = ROL::makePtr<ROL::StdTeuchosBatchManager<RealT,int>>(ocomm);
+    auto dsampler  = ROL::makePtr<ROL::UserInputGenerator<RealT>>(pd,wd,nsampd,2,dbman);
+    auto osampler  = ROL::makePtr<ROL::UserInputGenerator<RealT>>(po,wo,nsampo,2,obman);
+
+    // Build OED problem factory.
+    bool useUIG, homNoise;
+    std::string regType, dtype;
+    homNoise = parlist->sublist("Problem").get("Homoscedastic Noise",true);
+    regType  = parlist->sublist("Problem").get("Regression Type","Least Squares"); 
+    auto type    = ROL::OED::StringToRegressionType(regType);
+    auto noise   = ROL::makePtr<Helmholtz_Noise<RealT>>();
+    auto cov     = ROL::makePtr<ROL::OED::StdMomentOperator<RealT>>(type,homNoise,noise);
+    auto factory = ROL::makePtr<ROL::OED::Factory<RealT>>(model,dsampler,theta,ovec,cov,*parlist);
+    obman->barrier();
+
+    // Solve OED problem
+    dtype = parlist->sublist("OED").get("Optimality Type","A");
+    useUIG = parlist->sublist("Problem").get("Uniform Initial Guess", false);
+    solve<RealT>(factory,predfun,dsampler,osampler,*parlist,dtype,*outStream,useUIG,checkDeriv);
+
+    // Get a summary from the time monitor.
+    Teuchos::TimeMonitor::summarize();
+  }
+  catch (std::logic_error &err) {
+    std::cout << "Design MPI Rank = " << drank
+              << "  Objective MPI Rank = " << orank
+              << std::endl << err.what() << std::endl; 
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/OED/example/helmholtz/multipleFrequency/model.hpp b/packages/rol/example/PDE-OPT/OED/example/helmholtz/multipleFrequency/model.hpp
index a44a4d681272..de6b545d66f9 100644
--- a/packages/rol/example/PDE-OPT/OED/example/helmholtz/multipleFrequency/model.hpp
+++ b/packages/rol/example/PDE-OPT/OED/example/helmholtz/multipleFrequency/model.hpp
@@ -25,11 +25,11 @@ class Model : public ROL::Constraint<Real> {
 
   void setParameter(const std::vector<Real> &param) {
     ROL::Constraint<Real>::setParameter(param);
-    for (const auto &obj : objs_) obj->setParameter(param);
+    for (const auto & obj : objs_) obj->setParameter(param);
   }
 
   void update(const ROL::Vector<Real> &x, bool flag = true, int iter = -1) {
-    for (const auto &obj : objs_) obj->update(x,flag,iter);
+    for (const auto & obj : objs_) obj->update(x,flag,iter);
   }
 
   void value(ROL::Vector<Real> &c, const ROL::Vector<Real> &x, Real &tol ) {
diff --git a/packages/rol/example/PDE-OPT/OED/example/helmholtz/singleFrequency/CMakeLists.txt b/packages/rol/example/PDE-OPT/OED/example/helmholtz/singleFrequency/CMakeLists.txt
index d5fb90477400..75f04e7ea398 100644
--- a/packages/rol/example/PDE-OPT/OED/example/helmholtz/singleFrequency/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/OED/example/helmholtz/singleFrequency/CMakeLists.txt
@@ -11,13 +11,13 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/teuchos/src/sol)
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_01
     SOURCES example_01.cpp
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     OEDHelmholtzOCTRealSingleFreqDataCopy
     SOURCE_FILES
       input.xml plotresults.m
@@ -25,5 +25,32 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
   )
 
+ENDIF()
+
+IF(${PROJECT_NAME}_ENABLE_Intrepid2 AND 
+   ${PROJECT_NAME}_ENABLE_Amesos2   AND
+   ${PROJECT_NAME}_ENABLE_Ifpack2   AND
+   ${PROJECT_NAME}_ENABLE_MueLu     AND
+   ${PROJECT_NAME}_ENABLE_Teuchos   AND
+   ${PROJECT_NAME}_ENABLE_Tpetra )
+
+  # Need ROL_TpetraMultiVector.hpp
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/teuchos/src/sol)
+
+  ROL_ADD_EXECUTABLE(
+    example_01_Kokkos
+    SOURCES example_01K.cpp
+    ADD_DIR_TO_NAME
+  )
+
+  ROL_COPY_FILES_TO_BINARY_DIR(
+    OEDHelmholtzOCTRealSingleFreqDataCopyK
+    SOURCE_FILES
+      input.xml plotresults.m
+    SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}"
+    DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
+  )
 
 ENDIF()
diff --git a/packages/rol/example/PDE-OPT/OED/example/helmholtz/singleFrequency/example_01.cpp b/packages/rol/example/PDE-OPT/OED/example/helmholtz/singleFrequency/example_01.cpp
index 01dac1adc956..fb211b1c299d 100644
--- a/packages/rol/example/PDE-OPT/OED/example/helmholtz/singleFrequency/example_01.cpp
+++ b/packages/rol/example/PDE-OPT/OED/example/helmholtz/singleFrequency/example_01.cpp
@@ -13,8 +13,7 @@
 
 #include "Teuchos_Comm.hpp"
 #include "Teuchos_Time.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -107,7 +106,7 @@ int main(int argc, char *argv[]) {
   using RealT = double;
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
   ROL::Ptr<const Teuchos::Comm<int>> dcomm, ocomm;
 #ifdef HAVE_MPI
   OED_SplitComm<int,Teuchos::MpiComm<int>> splitcomm(1);
diff --git a/packages/rol/example/PDE-OPT/OED/example/helmholtz/singleFrequency/example_01K.cpp b/packages/rol/example/PDE-OPT/OED/example/helmholtz/singleFrequency/example_01K.cpp
new file mode 100644
index 000000000000..8339a1ddde7e
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/OED/example/helmholtz/singleFrequency/example_01K.cpp
@@ -0,0 +1,310 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  example_01.cpp
+    \brief Shows how to solve the optimal control of Helmholtz problem.
+*/
+
+#include "Teuchos_Comm.hpp"
+#include "Teuchos_Time.hpp"
+#include "ROL_GlobalMPISession.hpp"
+
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Version.hpp"
+
+#include <iostream>
+#include <algorithm>
+
+#include "ROL_Stream.hpp"
+#include "ROL_Solver.hpp"
+#include "ROL_UserInputGenerator.hpp"
+#include "ROL_ScalarLinearConstraint.hpp"
+#include "ROL_Bounds.hpp"
+#include "ROL_StdTeuchosBatchManager.hpp"
+#include "ROL_Reduced_Objective_SimOpt.hpp"
+#include "ROL_PrimalDualRisk.hpp"
+
+#include "../../../../TOOLS/linearpdeconstraintK.hpp"
+#include "../../../../TOOLS/pdeobjectiveK.hpp"
+#include "../../../../TOOLS/pdevectorK.hpp"
+#include "../../../../TOOLS/meshreaderK.hpp"
+
+#include "ROL_OED_Factory.hpp"
+#include "ROL_OED_StdMomentOperator.hpp"
+#include "../../../utilities/OED_SplitComm.hpp"
+
+#include "../src/pde_helmholtzK.hpp"
+#include "../src/obj_helmholtzK.hpp"
+#include "../src/noise.hpp"
+#include "model.hpp"
+
+template<typename Real>
+void solve(const ROL::Ptr<ROL::OED::Factory<Real>>    &factory,
+           const ROL::Ptr<ROL::SampleGenerator<Real>> &dsampler,
+           const ROL::Ptr<ROL::SampleGenerator<Real>> &osampler,
+           ROL::ParameterList                         &list,
+           const std::string                          &dtype,
+           std::ostream                               &stream     = std::cout,
+           const bool                                  useUIG     = true,
+           const bool                                  checkDeriv = false) {
+  int nsampd = dsampler->numGlobalSamples();
+  if (useUIG) {
+    factory->setDesign(1.0/static_cast<Real>(nsampd));
+  }
+  else {
+    std::stringstream dfile;
+    dfile << dtype << "_optimal_design.txt";
+    int err = factory->loadDesign(dfile.str(),2,nsampd);
+    stream << "  Factory::loadDesign exited with error " << err << std::endl;
+  }
+  stream << "  Solve " << dtype << "-optimal design problem." << std::endl;
+  std::clock_t timer = std::clock();
+  ROL::Ptr<ROL::Problem<Real>> problem = factory->get(list,osampler);
+  problem->setProjectionAlgorithm(list);
+  problem->finalize(false,true,stream);
+  std::string type = list.sublist("OED").get("Optimality Type","I");
+  bool usePD = list.sublist("OED").sublist("R-Optimality").get("Use Primal-Dual Algorithm",false);
+  if (type=="R" && usePD) {
+    ROL::PrimalDualRisk<Real> solver(problem,osampler,list);
+    if (checkDeriv) {
+      factory->check(stream);
+      //solver.check(stream);
+      factory->reset();
+    }
+    solver.run(stream);
+  }
+  else {
+    ROL::Solver<Real> solver(problem,list);
+    if (checkDeriv) {
+      factory->check(stream);
+      problem->check(true,stream);
+      factory->reset();
+    }
+    solver.solve(stream);
+  }
+  stream << "  " << dtype << "-optimal design time:      "
+    << static_cast<Real>(std::clock()-timer)/static_cast<Real>(CLOCKS_PER_SEC)
+    << " seconds" << std::endl;
+  factory->profile(stream);
+  factory->reset();
+  std::stringstream dname, pname;
+  dname << dtype << "_optimal_design";
+  factory->printDesign(dname.str());
+  pname << dtype << "_optimal_prediction_variance_samples";
+  factory->printPredictionVariance(osampler,pname.str());
+}
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+
+int main(int argc, char *argv[]) {
+//  feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
+
+  /*** Initialize communicator. ***/
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
+  Kokkos::ScopeGuard kokkosScope (argc, argv);
+  ROL::Ptr<const Teuchos::Comm<int>> dcomm, ocomm;
+#ifdef HAVE_MPI
+  OED_SplitComm<int,Teuchos::MpiComm<int>> splitcomm(1);
+  dcomm = splitcomm.getDesignComm();
+  ocomm = splitcomm.getSampleComm();
+#else
+  dcomm = ROL::makePtr<Teuchos::SerialComm<int>>();
+  ocomm = Tpetra::getDefaultComm();
+#endif
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  const int drank = dcomm->getRank();
+  const int orank = ocomm->getRank();
+  bool iprint = (argc > 1) && (drank == 0) && (orank == 0);
+  iprint = ((argc > 2) ? true : iprint);
+  auto outStream = ROL::makeStreamPtr( std::cout, iprint );
+
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+    RealT tol(1e-8);// one(1);
+
+    /*************************************************************************/
+    /******* BUILD LINEAR REGRESSION MODEL BASED ON HELMHOLTZ ****************/
+    /*************************************************************************/
+    /*** Read in XML input ***/
+    auto parlist = ROL::getParametersFromXmlFile("input.xml");
+    int   numSpeakers = 8;
+    bool   checkDeriv = parlist->sublist("Problem").get("Check Derivatives",  false);
+    int     verbosity = parlist->sublist("General").get("Print Verbosity",0);
+    bool      alPrint = parlist->sublist("Step").sublist("Augmented Lagrangian").get("Print Intermediate Optimization History",false);
+    verbosity         = (iprint ? verbosity : 0);
+    alPrint           = (iprint ? alPrint : false);
+    parlist->sublist("General").set("Print Verbosity",verbosity);
+    parlist->sublist("Step").sublist("Augmented Lagrangian").set("Print Intermediate Optimization History",alPrint);
+
+    // Initialize PDE describing Helmholtz equation.
+    auto meshMgr = ROL::makePtr<MeshReader<RealT,DeviceT>>(*parlist);
+    auto pde = ROL::makePtr<PDE_Helmholtz_OCT<RealT,DeviceT>>(*parlist);
+    auto con = ROL::makePtr<Linear_PDE_Constraint<RealT,DeviceT>>(pde,meshMgr,dcomm,*parlist,*outStream);
+    auto assembler = con->getAssembler();
+    con->printMeshData(*outStream);
+
+    // Create state vector.
+    auto u_ptr = assembler->createStateVector();
+    auto r_ptr = assembler->createResidualVector();
+    auto up = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(u_ptr,pde,assembler,*parlist);
+    auto rp = ROL::makePtr<PDE_DualSimVector<RealT,DeviceT>>(r_ptr,pde,assembler,*parlist);
+    auto zp = ROL::makePtr<ROL::StdVector<RealT>>(numSpeakers);
+
+    // Create observation function
+    auto qoi = ROL::makePtr<QoI_Helmholtz_Observation<RealT,DeviceT>>(pde->getFE(),*parlist);
+    auto obs = ROL::makePtr<PDE_Objective<RealT,DeviceT>>(qoi,assembler);
+
+    // Run derivative checks
+    if ( checkDeriv ) {
+      up->randomize();  zp->randomize();
+      auto dup = up->clone(); dup->randomize();
+      auto dzp = zp->clone(); dzp->randomize();
+      ROL::Vector_SimOpt<RealT> uz(up,zp), duz(dup,dzp);
+      std::vector<RealT> param(2,0);
+      obs->setParameter(param);
+      *outStream << "\n\nCheck Jacobian_1 of PDE Constraint\n";
+      con->checkApplyJacobian_1(*up,*zp,*dup,*rp,true,*outStream);
+      *outStream << "\n\nCheck Jacobian_2 of PDE Constraint\n";
+      con->checkApplyJacobian_2(*up,*zp,*dzp,*rp,true,*outStream);
+      *outStream << "\n\nCheck Gradient of Full Objective Function\n";
+      obs->checkGradient(uz,duz,true,*outStream);
+      *outStream << "\n\nCheck Hessian of Full Objective Function\n";
+      obs->checkHessVec(uz,duz,true,*outStream);
+    }
+
+    // Compute states for factor decompostion and build regression model
+    *outStream << std::endl << "Begin: PDE Solves" << std::endl;
+    std::vector<ROL::Ptr<ROL::Vector<RealT>>> state(numSpeakers);
+    for (int i = 0; i < numSpeakers; ++i) {
+      // Solve real part for speaker i
+      std::clock_t time_r = std::clock();
+      con->solve(*rp,*up,*zp->basis(i),tol);
+      state[i] = up->clone(); state[i]->set(*up);
+      std::stringstream real;
+      real << "state_" << i << ".txt";
+      con->outputTpetraVector(u_ptr,real.str());
+      *outStream
+        << "  Solve " << i << " time:      "
+        << static_cast<RealT>(std::clock()-time_r)/static_cast<RealT>(CLOCKS_PER_SEC)
+        << std::endl;
+    }
+    *outStream << "End: PDE Solves" << std::endl << std::endl;
+
+    // Build nonlinear model and parameter vector
+    auto theta = ROL::makePtr<ROL::StdVector<RealT>>(numSpeakers,1);
+    auto model = ROL::makePtr<ROL::Reduced_Objective_SimOpt<RealT>>(obs,con,up,theta,up,false);
+
+    /*************************************************************************/
+    /******* BUILD EXPERIMENTAL DESIGN PROBLEM AND SOLVE *********************/
+    /*************************************************************************/
+    // Build samplers for experiment space
+    std::string pd = parlist->sublist("Problem").sublist("Design").get("Points File",          "points_5000.txt");
+    std::string wd = parlist->sublist("Problem").sublist("Design").get("Weights File",         "weights_5000.txt");
+    int nsampd     = parlist->sublist("Problem").sublist("Design").get("Number of Samples",    5000);
+    std::string po = parlist->sublist("Problem").sublist("Objective").get("Points File",       "points_200.txt");
+    std::string wo = parlist->sublist("Problem").sublist("Objective").get("Weights File",      "weights_200.txt");
+    int nsampo     = parlist->sublist("Problem").sublist("Objective").get("Number of Samples", 200);
+    *outStream << std::endl;
+    *outStream << "  Design Points File:     " << pd << std::endl;
+    *outStream << "  Design Weights File:    " << wd << std::endl;
+    *outStream << "  Objective Points File:  " << po << std::endl;
+    *outStream << "  Objective Weights File: " << wo << std::endl;
+    *outStream << std::endl;
+    auto dbman  = ROL::makePtr<ROL::StdTeuchosBatchManager<RealT,int>>(dcomm);
+    auto obman  = ROL::makePtr<ROL::StdTeuchosBatchManager<RealT,int>>(ocomm);
+    auto dsampler  = ROL::makePtr<ROL::UserInputGenerator<RealT>>(pd,wd,nsampd,2,dbman);
+    auto osampler  = ROL::makePtr<ROL::UserInputGenerator<RealT>>(po,wo,nsampo,2,obman);
+
+    // Build OED problem factory.
+    bool useUIG, homNoise;
+    std::string regType, dtype;
+    homNoise = parlist->sublist("Problem").get("Homoscedastic Noise",true);
+    regType  = parlist->sublist("Problem").get("Regression Type","Least Squares"); 
+    auto type     = ROL::OED::StringToRegressionType(regType);
+    auto noise    = ROL::makePtr<Helmholtz_Noise<RealT>>();
+    auto cov      = ROL::makePtr<ROL::OED::StdMomentOperator<RealT>>(type,homNoise,noise);
+    auto factory  = ROL::makePtr<ROL::OED::Factory<RealT>>(model,dsampler,theta,cov,*parlist);
+    obman->barrier();
+
+    // Print factors
+    bool printFactors = parlist->sublist("Problem").get("Print Factors",false);
+    if (printFactors) {
+      std::stringstream factName_d, noisName_d, factName_o, noisName_o;
+      std::ofstream factFile_d, noisFile_d, factFile_o, noisFile_o;
+      ROL::Ptr<ROL::Vector<RealT>> Fp = zp->dual().clone();
+      std::vector<RealT> pt;
+
+      factName_d << "factors_des_" << dsampler->batchID() << ".txt";
+      noisName_d << "noise_des_" << dsampler->batchID() << ".txt";
+      factFile_d.open(factName_d.str());
+      noisFile_d.open(noisName_d.str());
+      factFile_d << std::scientific << std::setprecision(15);
+      noisFile_d << std::scientific << std::setprecision(15);
+      for (int i = 0; i < dsampler->numMySamples(); ++i) {
+        pt = dsampler->getMyPoint(i);
+        factory->getFactors()->evaluate(*Fp,pt);
+        for (int j = 0; j < numSpeakers; ++j) {
+          factFile_d << std::right << std::setw(25)
+                     << (*ROL::dynamicPtrCast<ROL::StdVector<RealT>>(Fp)->getVector())[j];
+        }
+        factFile_d << std::endl;
+        for (int j = 0; j < 2; ++j)
+          noisFile_d << std::right << std::setw(25) << pt[j];
+        noisFile_d << std::right << std::setw(25) << noise->evaluate(pt) << std::endl;
+      }
+      factFile_d.close();
+      noisFile_d.close();
+
+      factName_o << "factors_opt_" << osampler->batchID() << ".txt";
+      noisName_o << "noise_opt_" << osampler->batchID() << ".txt";
+      factFile_o.open(factName_o.str());
+      noisFile_o.open(noisName_o.str());
+      factFile_o << std::scientific << std::setprecision(15);
+      noisFile_o << std::scientific << std::setprecision(15);
+      for (int i = 0; i < osampler->numMySamples(); ++i) {
+        pt = osampler->getMyPoint(i);
+        factory->getFactors()->evaluate(*Fp,pt);
+        for (int j = 0; j < numSpeakers; ++j) {
+          factFile_o << std::right << std::setw(25)
+                     << (*ROL::dynamicPtrCast<ROL::StdVector<RealT>>(Fp)->getVector())[j];
+        }
+        factFile_o << std::endl;
+        for (int j = 0; j < 2; ++j)
+          noisFile_o << std::right << std::setw(25) << pt[j];
+        noisFile_o << std::right << std::setw(25) << noise->evaluate(pt) << std::endl;
+      }
+      factFile_o.close();
+      noisFile_o.close();
+    }
+
+    // Solve OED problem
+    dtype = parlist->sublist("OED").get("Optimality Type","A");
+    useUIG = parlist->sublist("Problem").get("Uniform Initial Guess", false);
+    solve<RealT>(factory,dsampler,osampler,*parlist,dtype,*outStream,useUIG,checkDeriv);
+
+    // Get a summary from the time monitor.
+    Teuchos::TimeMonitor::summarize();
+  }
+  catch (std::logic_error &err) {
+    std::cout << "Design MPI Rank = " << drank
+              << "  Objective MPI Rank = " << orank
+              << std::endl << err.what() << std::endl; 
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/OED/example/helmholtz/src/obj_helmholtzK.hpp b/packages/rol/example/PDE-OPT/OED/example/helmholtz/src/obj_helmholtzK.hpp
new file mode 100644
index 000000000000..0f9887a8bb75
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/OED/example/helmholtz/src/obj_helmholtzK.hpp
@@ -0,0 +1,280 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  obj.hpp
+    \brief Provides the interface for local (cell-based) objective function computations.
+*/
+
+#ifndef PDEOPT_QOI_HELMHOLTZ_OCT_REALK_HPP
+#define PDEOPT_QOI_HELMHOLTZ_OCT_REALK_HPP
+
+#include "../../../../TOOLS/qoiK.hpp"
+#include "ROL_StdObjective.hpp"
+#include "pde_helmholtzK.hpp"
+
+template <class Real,class DeviceType>
+class QoI_Helmholtz_Observation : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type> fe_;
+  Real width_, coeff_;
+
+  Real observationFunc(const std::vector<Real> &x, const std::vector<Real> &loc, int deriv = 0, int dim1 = 0, int dim2 = 0) const {
+    const Real zero(0), half(0.5), one(1);
+    const int d = x.size();
+    Real dot(0), width2 = std::pow(width_,2);
+    for (int i = 0; i < d; ++i)
+      dot += std::pow((x[i]-loc[i]),2)/width2;
+    Real val = std::exp(-half*dot);
+    if (deriv == 1) {
+      val *= (x[dim1]-loc[dim1])/width2;
+    }
+    else if (deriv == 2) {
+      Real v0 = (dim1==dim2 ? -one/width2 : zero);
+      Real v1 = (x[dim1]-loc[dim1])/width2;
+      Real v2 = (x[dim2]-loc[dim2])/width2;
+      val *= (v0 + v1*v2);
+    }
+    return coeff_ * val;
+  }
+
+  void evaluateObservation(scalar_view &out, const std::vector<Real> &loc, int deriv = 0, int dim1 = 0, int dim2 = 0) const {
+    const int c = fe_->gradN().extent_int(0);
+    const int p = fe_->gradN().extent_int(2);
+    const int d = fe_->gradN().extent_int(3);
+    out = scalar_view("out",c,p);
+    std::vector<Real> x(d);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k)
+          x[k] = (fe_->cubPts())(i,j,k);
+        out(i,j) = observationFunc(x,loc,deriv,dim1,dim2);
+      }
+    }
+  }
+
+public:
+  QoI_Helmholtz_Observation(const ROL::Ptr<fe_type> &fe,
+                            ROL::ParameterList &parlist)
+    : fe_(fe) {
+    width_ = parlist.sublist("Problem").get("Microphone Width",5e-2);
+    coeff_ = static_cast<Real>(1)/(static_cast<Real>(2.0*M_PI)*std::pow(width_,2));
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    // Get relevant dimensions
+    const int c = fe_->gradN().extent_int(0);
+    const int p = fe_->gradN().extent_int(2);
+    // Initialize output val
+    val = scalar_view("val", c);
+    // Get components of the state
+    scalar_view u_real("u_real", c, p);
+    fe_->evaluateValue(u_real, u_coeff);
+    // Compute phi
+    scalar_view phi;
+    evaluateObservation(phi,QoI<Real,DeviceType>::getParameter());
+    // Integrate observation
+    fe_->computeIntegral(val,phi,u_real,false);
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    // Get relevant dimensions
+    const int c = fe_->gradN().extent_int(0);
+    const int f = fe_->gradN().extent_int(1);
+    // Initialize output grad
+    grad = scalar_view("grad", c, f);
+    // Compute phi
+    scalar_view phi;
+    evaluateObservation(phi,QoI<Real,DeviceType>::getParameter());
+    // Integrate observation derivative
+    fst::integrate(grad,phi,fe_->NdetJ(),false);
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::Zero(">>> QoI_Helmholtz_Observation::gradient_2 is zero.");
+  }
+
+  std::vector<Real> gradient_3(std::vector<scalar_view> & grad,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::Zero(">>> QoI_Helmholtz_Observation::gradient_3 is zero.");
+    // Get relevant dimensions
+    const int c = fe_->gradN().extent_int(0);
+    const int p = fe_->gradN().extent_int(2);
+    const int d = fe_->gradN().extent_int(3);
+    // Get components of the state
+    scalar_view u_real("u_real", c, p);
+    fe_->evaluateValue(u_real, u_coeff);
+    for (int i = 0; i < d; ++i) {
+      // Initialize output val
+      grad[i] = scalar_view("grad", c);
+      // Compute phi'
+      scalar_view phi;
+      evaluateObservation(phi,*z_param,1,i);
+      // Integrate observation gradient
+      fe_->computeIntegral(grad[i],phi,u_real,false);
+    }
+    std::vector<Real> empty(d);
+    return empty;
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::Zero(">>> QoI_Helmholtz::HessVec_11 is zero.");
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::Zero(">>> QoI_Helmholtz::HessVec_12 is zero.");
+  }
+
+  void HessVec_13(scalar_view & hess,
+                          const ROL::Ptr<const std::vector<Real>> & v_param,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::Zero(">>> QoI_Helmholtz::HessVec_13 is zero.");
+    // Get relevant dimensions
+    const int c = fe_->gradN().extent_int(0);
+    const int f = fe_->gradN().extent_int(1);
+    const int p = fe_->gradN().extent_int(2);
+    const int d = fe_->gradN().extent_int(3);
+    // Initialize hessian
+    hess = scalar_view("hess", c, f);
+    // Get components of the state
+    scalar_view vphi("vphi", c, p);
+    for (int i = 0; i < d; ++i) {
+      // Compute phi'
+      scalar_view phi;
+      evaluateObservation(phi,*z_param,1,i);
+      // Weight phi'
+      rst::scale(vphi, phi, (*v_param)[i]);
+      // Integrate observation gradient
+      fst::integrate(hess,vphi,fe_->NdetJ(),true);
+    }
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::Zero(">>> QoI_Helmholtz::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::Zero(">>> QoI_Helmholtz::HessVec_22 is zero.");
+  }
+
+  void HessVec_23(scalar_view & hess,
+                          const ROL::Ptr<const std::vector<Real>> & v_param,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::Zero(">>> QOI_Helmholtz::HessVec_23 is zero.");
+  }
+
+  std::vector<Real> HessVec_31(std::vector<scalar_view> & hess,
+                          const scalar_view v_coeff,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::Zero(">>> QoI_Helmholtz::HessVec_31 is zero.");
+    // Get relevant dimensions
+    const int c = fe_->gradN().extent_int(0);
+    const int p = fe_->gradN().extent_int(2);
+    const int d = fe_->gradN().extent_int(3);
+    // Get components of the state
+    scalar_view v_real("v_real", c, p);
+    fe_->evaluateValue(v_real, v_coeff);
+    for (int i = 0; i < d; ++i) {
+      // Initialize output hess
+      hess[i] = scalar_view("hess", c);
+      // Compute phi
+      scalar_view phi;
+      evaluateObservation(phi,*z_param,1,i);
+      // Integrate observation
+      fe_->computeIntegral(hess[i],phi,v_real,false);
+    }
+    std::vector<Real> empty(d);
+    return empty;
+  }
+
+  std::vector<Real> HessVec_32(std::vector<scalar_view> & hess,
+                          const scalar_view v_coeff,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::Zero(">>> QoI_Helmholtz::HessVec_32 is zero.");
+  }
+
+  std::vector<Real> HessVec_33(std::vector<scalar_view> & hess,
+                          const ROL::Ptr<const std::vector<Real>> & v_param,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::Zero(">>> QoI_Helmholtz::HessVec_33 is zero.");
+    // Get relevant dimensions
+    const int c = fe_->gradN().extent_int(0);
+    const int p = fe_->gradN().extent_int(2);
+    const int d = fe_->gradN().extent_int(3);
+    // Get components of the state
+    scalar_view u_real("u_real", c, p);
+    fe_->evaluateValue(u_real, u_coeff);
+
+    scalar_view vphi("vphi", c, p);
+    for (int i = 0; i < d; ++i) {
+      // Initialize output val
+      hess[i] = scalar_view("hess", c);
+      // Compute phi'
+      Kokkos::deep_copy(vphi,static_cast<Real>(0));
+      for (int j = 0; j < d; ++j) {
+        scalar_view phi;
+        evaluateObservation(phi,*z_param,2,i,j);
+        rst::scale(phi, (*v_param)[j]);
+        rst::add(vphi, phi);
+      }
+      // Integrate observation gradient
+      fe_->computeIntegral(hess[i],vphi,u_real,false);
+    }
+    std::vector<Real> empty(d);
+    return empty;
+  }
+
+}; // QoI_Helmholtz_Observation
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/OED/example/helmholtz/src/pde_helmholtzK.hpp b/packages/rol/example/PDE-OPT/OED/example/helmholtz/src/pde_helmholtzK.hpp
new file mode 100644
index 000000000000..a537d4933a15
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/OED/example/helmholtz/src/pde_helmholtzK.hpp
@@ -0,0 +1,427 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  pde_helmholtz.hpp
+    \brief Implements the local PDE interface for the optimal control of
+           Helmholtz.
+*/
+
+#ifndef PDE_HELMHOLTZ_OCT_REALK_HPP
+#define PDE_HELMHOLTZ_OCT_REALK_HPP
+
+#include "../../../../TOOLS/pdeK.hpp"
+#include "../../../../TOOLS/feK.hpp"
+#include "../../../../TOOLS/fieldhelperK.hpp"
+
+#include "Intrepid2_HGRAD_QUAD_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_TRI_C1_FEM.hpp"
+#include "Intrepid2_DefaultCubatureFactory.hpp"
+#include "Intrepid2_FunctionSpaceTools.hpp"
+#include "Intrepid2_CellTools.hpp"
+
+#include "ROL_Ptr.hpp"
+
+
+template <class Real,class DeviceType>
+class PDE_Helmholtz_OCT : public PDE<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  // Finite element basis information
+  basis_ptr basisPtr_;
+  std::vector<basis_ptr> basisPtrs_;
+  // Cell cubature information
+  ROL::Ptr<Intrepid2::Cubature<DeviceType,Real,Real>> cellCub_, bdryCub_;
+  // Cell node information
+  scalar_view volCellNodes_;
+  std::vector<std::vector<scalar_view>> bdryCellNodes_;
+  std::vector<std::vector<std::vector<int>>> bdryCellLocIds_;
+  // Finite element definition
+  ROL::Ptr<fe_type> fe_;
+  std::vector<std::vector<ROL::Ptr<fe_type>>> feBdry_;
+  // Local degrees of freedom on boundary, for each side of the reference cell (first index).
+  std::vector<std::vector<int>> fidx_;
+  // Coordinates of degrees freedom on boundary cells.
+  // Indexing:  [sideset number][local side id](cell number, value at dof)
+  std::vector<std::vector<scalar_view>> bdryCellDofValues_;
+
+  Real waveNumber_;
+  Real RoiWaveNumber_;
+  Real RoiIncWaveNumber_;
+  Real RoiRadius_;
+  Real impFactor_;
+  Real freq_;
+  Real density_;
+
+  Real refractiveIndex(const std::vector<Real> &x) const {
+    Real xnorm(0), xdnorm(0), val(0);
+    const int d = x.size();
+    const Real sqr2(std::sqrt(2.0)), half(0.5);
+    const std::vector<Real> xd = {-half/sqr2*RoiRadius_, -half/sqr2*RoiRadius_};
+    for (int i = 0; i < d; ++i) {
+      xnorm += x[i]*x[i];
+      xdnorm += (x[i]-xd[i])*(x[i]-xd[i]);
+    }
+    xnorm  = std::sqrt(xnorm);
+    xdnorm = std::sqrt(xdnorm);
+    val   = ((xnorm <= RoiRadius_)
+          ? ((xdnorm <= half*RoiRadius_)
+          ? RoiIncWaveNumber_ : RoiWaveNumber_) : waveNumber_);
+    return val;
+  }
+
+  void computeRefractiveIndex(scalar_view &kappa) const {
+    const int c = fe_->gradN().extent_int(0);
+    const int p = fe_->gradN().extent_int(2);
+    const int d = fe_->gradN().extent_int(3);
+    std::vector<Real> x(d);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k)
+          x[k] = (fe_->cubPts())(i,j,k);
+        kappa(i,j) = std::pow(refractiveIndex(x), 2);
+      }
+    }
+  }
+
+  void computeNeumannControl(scalar_view &Bz,
+                             const ROL::Ptr<const std::vector<Real>> &zp,
+                             int sideset,
+                             int deriv = 0) const {
+    const int c = Bz.extent_int(0);
+    const int p = Bz.extent_int(1);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        Bz(i,j) = (deriv==0 ? (*zp)[sideset]
+                : (deriv==1 ? static_cast<Real>(1)
+                : static_cast<Real>(0)));
+      }
+    }
+  }
+
+  scalar_view getBoundaryCoeff(const scalar_view &cell_coeff, int sideSet, int cell) const {
+    std::vector<int> bdryCellLocId = bdryCellLocIds_[sideSet][cell];
+    const int numCellsSide = bdryCellLocId.size();
+    const int f = basisPtr_->getCardinality();
+    
+    scalar_view bdry_coeff("bdry_coeff", numCellsSide, f);
+    for (int i = 0; i < numCellsSide; ++i) {
+      for (int j = 0; j < f; ++j)
+        bdry_coeff(i, j) = cell_coeff(bdryCellLocId[i], j);
+    }
+    return bdry_coeff;
+  }
+
+public:
+  PDE_Helmholtz_OCT(ROL::ParameterList &parlist) {
+    // Finite element fields.
+    std::string elemtype = parlist.sublist("Geometry").get("Element Shape","quad");
+    if (elemtype == "quad")
+      basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C1_FEM<DeviceType,Real,Real>>();
+    else if (elemtype == "tri")
+      basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_TRI_C1_FEM<DeviceType,Real,Real>>();
+    else
+      throw ROL::Exception::NotImplemented(">>> Element type not implemented.");
+    basisPtrs_.clear(); basisPtrs_.push_back(basisPtr_);  // acoustic pressure
+    // Quadrature rules.
+    shards::CellTopology cellType = basisPtr_->getBaseCellTopology();        // get the cell type from the basis
+    Intrepid2::DefaultCubatureFactory cubFactory;                            // create cubature factory
+    int cubDegree = parlist.sublist("Problem").get("Cubature Degree", 2);    // set cubature degree, e.g., 2
+    cellCub_ = cubFactory.create<DeviceType,Real,Real>(cellType, cubDegree); // create default cubature
+    int d = cellType.getDimension();
+
+    shards::CellTopology bdryCellType = cellType.getCellTopologyData(d-1, 0);
+    int bdryCubDegree = parlist.sublist("Problem").get("Boundary Cubature Degree",2); // set cubature degree, e.g., 2
+    bdryCub_ = cubFactory.create<DeviceType,Real,Real>(bdryCellType, bdryCubDegree);
+
+    const Real pi2    = static_cast<Real>(2.0*M_PI);
+    // Speed of sound of air
+    Real speed        = parlist.sublist("Problem").get("Speed of Sound",               343.0);
+    // Speed of sound of aluminum
+    Real RoiSpeed     = parlist.sublist("Problem").get("ROI Speed of Sound",           5100.0);
+    // Speed of sound of rubber
+    Real RoiIncSpeed  = parlist.sublist("Problem").get("ROI Inclusion Speed of Sound", 1600.0);
+    // Between middle C and the middle C on the treble clef
+    freq_             = parlist.sublist("Problem").get("Frequency",                    400.0);
+    // Density of air at 20C and standard atmospheric pressure
+    density_          = parlist.sublist("Problem").get("Density",                      1.204);
+    // Impedance for air v.s. wood (420.175 PA s/m / 0.5e6 PA s/m)
+    impFactor_        = parlist.sublist("Problem").get("Impedance Factor",             8.4e-4);
+    RoiRadius_        = parlist.sublist("Problem").get("ROI Radius",                   0.5);
+    waveNumber_       = pi2*freq_/speed;
+    RoiWaveNumber_    = pi2*freq_/RoiSpeed;
+    RoiIncWaveNumber_ = pi2*freq_/RoiIncSpeed;
+  }
+
+  void residual(scalar_view & res,
+                const scalar_view u_coeff,
+                const scalar_view z_coeff = scalar_view(),
+                const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c = fe_->gradN().extent_int(0);
+    const int f = fe_->gradN().extent_int(1);
+    const int p = fe_->gradN().extent_int(2);
+    const int d = fe_->gradN().extent_int(3);
+    // Initialize residuals.
+    res = scalar_view("res",c,f);
+    // Initialize storage
+    scalar_view valU_eval("valU_eval", c, p);
+    scalar_view gradU_eval("gradU_eval", c, p, d);
+    scalar_view kappa("kappa", c, p);
+    scalar_view kappaU("kappaU", c, p);
+    // Evaluate/interpolate finite element fields on cells.
+    fe_->evaluateValue(valU_eval, u_coeff);
+    fe_->evaluateGradient(gradU_eval, u_coeff);
+    // Build wave number
+    computeRefractiveIndex(kappa);
+    fst::scalarMultiplyDataData(kappaU,kappa,valU_eval);
+    rst::scale(kappaU,static_cast<Real>(-1));
+
+    /*******************************************************************/
+    /*** Evaluate weak form of the residual.****************************/
+    /*******************************************************************/
+    fst::integrate(res,gradU_eval,fe_->gradNdetJ(),false);
+    fst::integrate(res,kappaU,fe_->NdetJ(),true);
+
+    // APPLY BOUNDARY CONDITIONS
+    const int numSidesets = bdryCellLocIds_.size();
+    for (int s = 0; s < numSidesets; ++s) {
+      if (s != 8) { // s = 8 -> Speaker cabinents (0 impedance)
+        const int numLocalSideIds = bdryCellLocIds_[s].size();
+        const int numCubPerSide = bdryCub_->getNumPoints();
+        for (int j = 0; j < numLocalSideIds; ++j) {
+          const int numCellsSide = bdryCellLocIds_[s][j].size();
+          if (numCellsSide) {
+            scalar_view BCres("BCres", numCellsSide, f);
+            scalar_view BCcomp("BCcomp", numCellsSide, numCubPerSide);
+            // Compute control operator
+            computeNeumannControl(BCcomp,z_param,s,0);
+            // Integrate residual
+            fst::integrate(BCres,BCcomp,feBdry_[s][j]->NdetJ(),false);
+            // Add Robin and Neumann control residual to volume residual
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[s][j][k];
+              for (int l = 0; l < f; ++l)
+                res(cidx,l) -= density_ * freq_ * BCres(k,l);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  void Jacobian_1(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    int p = fe_->gradN().extent_int(2);
+    // Initialize Jacobians.
+    jac = scalar_view("jac", c, f, f);
+    // Initialize storage.
+    scalar_view kappa("kappa", c, p);
+    scalar_view kappaN("kappaN", c, f, p);
+    // Build wave number
+    computeRefractiveIndex(kappa);
+    fst::scalarMultiplyDataField(kappaN,kappa,fe_->N());
+    rst::scale(kappaN,static_cast<Real>(-1));
+
+    /*** Evaluate weak form of the Jacobian. ***/
+    fst::integrate(jac,kappaN,fe_->NdetJ(),false);
+    rst::add(jac,fe_->stiffMat());
+  }
+
+
+  void Jacobian_2(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Helmholtz_OCT::Jacobian_2): Jacobian is zero.");
+  }
+
+  void Jacobian_3(std::vector<scalar_view> & jac,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // GET DIMENSIONS
+    const int c = fe_->gradN().extent_int(0);
+    const int f = fe_->gradN().extent_int(1);
+    // ADD CONTROL TERM TO RESIDUAL
+    const int numSidesets = bdryCellLocIds_.size();
+    for (int s = 0; s < numSidesets; ++s) {
+      if (s != 8) { // s = 8 -> Speaker cabinent (zero impedance)
+        jac[s] = scalar_view("jac", c, f);
+        const int numLocalSideIds = bdryCellLocIds_[s].size();
+        const int numCubPerSide = bdryCub_->getNumPoints();
+        for (int j = 0; j < numLocalSideIds; ++j) {
+          const int numCellsSide = bdryCellLocIds_[s][j].size();
+          if (numCellsSide) {
+            // Compute control operator
+            scalar_view Bz("Bz", numCellsSide, numCubPerSide);
+            computeNeumannControl(Bz,z_param,s,1);
+            // Compute Neumann residual
+            scalar_view neumJac("neumJac", numCellsSide, f);
+            fst::integrate(neumJac,Bz,feBdry_[s][j]->NdetJ(),false);
+            // Add Robin and Neumann control residual to volume residual
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[s][j][k];
+              for (int l = 0; l < f; ++l) { 
+                (jac[s])(cidx,l) -= density_*freq_*neumJac(k,l);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  void Hessian_11(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Helmholtz_OCT::Hessian_11): Hessian is zero.");
+  }
+
+  void Hessian_12(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Helmholtz_OCT::Hessian_12): Hessian is zero.");
+  }
+
+  void Hessian_13(std::vector<scalar_view> & hess,
+                          const scalar_view l_coeff,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Helmholtz_OCT::Hessian_13): Hessian is zero.");
+  }
+
+  void Hessian_21(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Helmholtz_OCT::Hessian_21): Hessian is zero.");
+  }
+
+  void Hessian_22(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Helmholtz_OCT::Hessian_22): Hessian is zero.");
+  }
+
+  void Hessian_23(std::vector<scalar_view> & hess,
+                          const scalar_view l_coeff,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Helmholtz_OCT::Hessian_23): Hessian is zero.");
+  }
+
+  void Hessian_31(std::vector<scalar_view> & hess,
+                          const scalar_view l_coeff,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Helmholtz_OCT::Hessian_31): Hessian is zero.");
+  }
+
+  void Hessian_32(std::vector<scalar_view> & hess,
+                          const scalar_view l_coeff,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Helmholtz_OCT::Hessian_32): Hessian is zero.");
+  }
+
+  void Hessian_33(std::vector<std::vector<scalar_view>> & hess,
+                          const scalar_view l_coeff,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Helmholtz_OCT::Hessian_33): Hessian is zero.");
+  }
+
+  void RieszMap_1(scalar_view & riesz) override {
+    //throw Exception::NotImplemented(">>> (PDE_TopoOpt::RieszMap_1): Not implemented.");
+    // Retrieve dimensions.
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    // Initialize Riesz Map.
+    riesz = scalar_view("riesz1",c,f,f);
+    Kokkos::deep_copy(riesz,fe_->stiffMat());
+    rst::add(riesz,fe_->massMat());
+  }
+
+  void RieszMap_2(scalar_view & riesz) override {
+    //throw Exception::NotImplemented(">>> (PDE_TopoOpt::RieszMap_2): Not implemented.");
+    // Retrieve dimensions.
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    // Initialize Jacobians.
+    riesz = scalar_view("riesz2",c,f,f);
+    Kokkos::deep_copy(riesz,fe_->massMat());
+  }
+
+  std::vector<basis_ptr> getFields() override {
+    return basisPtrs_;
+  }
+
+  void setCellNodes(const scalar_view &volCellNodes,
+                    const std::vector<std::vector<scalar_view>> &bdryCellNodes,
+                    const std::vector<std::vector<std::vector<int>>> &bdryCellLocIds) override {
+    volCellNodes_ = volCellNodes;
+    bdryCellNodes_ = bdryCellNodes;
+    bdryCellLocIds_ = bdryCellLocIds;
+    // Finite element definition.
+    fe_ = ROL::makePtr<fe_type>(volCellNodes_,basisPtr_,cellCub_);
+    fidx_ = fe_->getBoundaryDofs();
+    // Construct boundary FE
+    const int numSidesets = bdryCellNodes.size();
+    feBdry_.resize(numSidesets);
+    for (int s = 0; s < numSidesets; ++s) {
+      const int numLocSides = bdryCellNodes[s].size();
+      feBdry_[s].resize(numLocSides);
+      for (int j = 0; j < numLocSides; ++j) {
+        if (bdryCellNodes[s][j] != scalar_view()) {
+          feBdry_[s][j] = ROL::makePtr<fe_type>(bdryCellNodes[s][j],basisPtr_,bdryCub_,j);
+        }
+      }
+    }
+  }
+
+  const ROL::Ptr<fe_type> getFE(void) const {
+    return fe_;
+  }
+
+  const std::vector<std::vector<ROL::Ptr<fe_type>>> getBdryFE(void) const {
+    return feBdry_;
+  }
+
+  const std::vector<std::vector<int>> getBdryCellLocIds(const int sideset = 0) const {
+    return bdryCellLocIds_[sideset];
+  }
+
+}; // PDE_Helmholtz_OCT
+
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/OED/example/poisson/singleObs/CMakeLists.txt b/packages/rol/example/PDE-OPT/OED/example/poisson/singleObs/CMakeLists.txt
index 48ec4aea62ad..cf4064eb3a3b 100644
--- a/packages/rol/example/PDE-OPT/OED/example/poisson/singleObs/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/OED/example/poisson/singleObs/CMakeLists.txt
@@ -11,7 +11,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/teuchos/src/sol)
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_01
     SOURCES example_01.cpp
     ADD_DIR_TO_NAME
@@ -26,7 +26,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     OEDPoissonSingleObsDataCopy
     SOURCE_FILES
       input.xml
@@ -34,5 +34,41 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
   )
 
+ENDIF()
+
+IF(${PROJECT_NAME}_ENABLE_Intrepid2 AND 
+   ${PROJECT_NAME}_ENABLE_Amesos2   AND
+   ${PROJECT_NAME}_ENABLE_Ifpack2   AND
+   ${PROJECT_NAME}_ENABLE_MueLu     AND
+   ${PROJECT_NAME}_ENABLE_Teuchos   AND
+   ${PROJECT_NAME}_ENABLE_Tpetra )
+
+  # Need ROL_TpetraMultiVector.hpp
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/teuchos/src/sol)
+
+  ROL_ADD_EXECUTABLE(
+    example_01_Kokkos
+    SOURCES example_01K.cpp
+    ADD_DIR_TO_NAME
+  )
+
+  TRIBITS_ADD_TEST(
+    example_01_Kokkos
+    ARGS PrintItAll
+    NUM_MPI_PROCS 4
+    NUM_TOTAL_CORES_USED 4
+    PASS_REGULAR_EXPRESSION "TEST PASSED"
+    ADD_DIR_TO_NAME
+  )
+
+  ROL_COPY_FILES_TO_BINARY_DIR(
+    OEDPoissonSingleObsDataCopyK
+    SOURCE_FILES
+      input.xml
+    SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}"
+    DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
+  )
 
 ENDIF()
diff --git a/packages/rol/example/PDE-OPT/OED/example/poisson/singleObs/example_01.cpp b/packages/rol/example/PDE-OPT/OED/example/poisson/singleObs/example_01.cpp
index 8579148f6230..ee1eda096428 100644
--- a/packages/rol/example/PDE-OPT/OED/example/poisson/singleObs/example_01.cpp
+++ b/packages/rol/example/PDE-OPT/OED/example/poisson/singleObs/example_01.cpp
@@ -13,8 +13,7 @@
 
 #include "Teuchos_Comm.hpp"
 #include "Teuchos_Time.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -109,7 +108,7 @@ int main(int argc, char *argv[]) {
   using RealT = double;
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
   ROL::Ptr<const Teuchos::Comm<int>> dcomm, ocomm;
 #ifdef HAVE_MPI
   OED_SplitComm<int,Teuchos::MpiComm<int>> splitcomm(1);
diff --git a/packages/rol/example/PDE-OPT/OED/example/poisson/singleObs/example_01K.cpp b/packages/rol/example/PDE-OPT/OED/example/poisson/singleObs/example_01K.cpp
new file mode 100644
index 000000000000..b2ee3cd30a27
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/OED/example/poisson/singleObs/example_01K.cpp
@@ -0,0 +1,323 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  example_01.cpp
+    \brief Shows how to solve the optimal control of Helmholtz problem.
+*/
+
+#include "Teuchos_Comm.hpp"
+#include "Teuchos_Time.hpp"
+#include "ROL_GlobalMPISession.hpp"
+
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Version.hpp"
+
+#include <iostream>
+#include <algorithm>
+
+#include "ROL_Stream.hpp"
+#include "ROL_Solver.hpp"
+#include "ROL_MonteCarloGenerator.hpp"
+#include "ROL_ScalarLinearConstraint.hpp"
+#include "ROL_Bounds.hpp"
+#include "ROL_StdTeuchosBatchManager.hpp"
+#include "ROL_Reduced_Objective_SimOpt.hpp"
+#include "ROL_PrimalDualRisk.hpp"
+
+#include "../../../../TOOLS/linearpdeconstraintK.hpp"
+#include "../../../../TOOLS/pdeobjectiveK.hpp"
+#include "../../../../TOOLS/pdevectorK.hpp"
+#include "../../../../TOOLS/meshreaderK.hpp"
+
+#include "ROL_OED_Factory.hpp"
+#include "ROL_OED_StdMomentOperator.hpp"
+#include "../../../utilities/OED_SplitComm.hpp"
+
+#include "../src/pde_poissonK.hpp"
+#include "../src/obj_poissonK.hpp"
+#include "../src/noise.hpp"
+#include "../src/meshK.hpp"
+
+template<typename Real>
+void solve(const ROL::Ptr<ROL::OED::Factory<Real>>    &factory,
+           const ROL::Ptr<ROL::SampleGenerator<Real>> &dsampler,
+           const ROL::Ptr<ROL::SampleGenerator<Real>> &osampler,
+           ROL::ParameterList                         &list,
+           const std::string                          &dtype,
+           std::ostream                               &stream     = std::cout,
+           const bool                                  useUIG     = true,
+           const bool                                  checkDeriv = false) {
+  int nsampd = dsampler->numGlobalSamples();
+  if (useUIG) {
+    factory->setDesign(1.0/static_cast<Real>(nsampd));
+  }
+  else {
+    std::stringstream dfile;
+    dfile << dtype << "_optimal_design.txt";
+    int err = factory->loadDesign(dfile.str(),2,nsampd);
+    stream << "  Factory::loadDesign exited with error " << err << std::endl;
+  }
+  stream << "  Solve " << dtype << "-optimal design problem." << std::endl;
+  std::clock_t timer = std::clock();
+  ROL::Ptr<ROL::Problem<Real>> problem = factory->get(list,osampler);
+  problem->setProjectionAlgorithm(list);
+  problem->finalize(false,true,stream);
+  std::string type = list.sublist("OED").get("Optimality Type","I");
+  bool usePD = list.sublist("OED").sublist("R-Optimality").get("Use Primal-Dual Algorithm",false);
+  if (type=="R" && usePD) {
+    ROL::PrimalDualRisk<Real> solver(problem,osampler,list);
+    // Commented out because check uses random vectors that do not respect bounds
+    //if (checkDeriv) {
+    //  factory->check(stream);
+    //  //solver.check(stream);
+    //  factory->reset();
+    //}
+    solver.run(stream);
+  }
+  else {
+    ROL::Solver<Real> solver(problem,list);
+    // Commented out because check uses random vectors that do not respect bounds
+    //if (checkDeriv) {
+    //  factory->check(stream);
+    //  problem->check(true,stream);
+    //  factory->reset();
+    //}
+    solver.solve(stream);
+  }
+  stream << "  " << dtype << "-optimal design time:      "
+    << static_cast<Real>(std::clock()-timer)/static_cast<Real>(CLOCKS_PER_SEC)
+    << " seconds" << std::endl;
+  factory->profile(stream);
+  factory->reset();
+  std::stringstream dname, pname;
+  dname << dtype << "_optimal_design";
+  factory->printDesign(dname.str());
+  pname << dtype << "_optimal_prediction_variance_samples";
+  factory->printPredictionVariance(osampler,pname.str());
+}
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+
+int main(int argc, char *argv[]) {
+//  feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
+
+  /*** Initialize communicator. ***/
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
+  Kokkos::ScopeGuard kokkosScope (argc, argv);
+  ROL::Ptr<const Teuchos::Comm<int>> dcomm, ocomm;
+#ifdef HAVE_MPI
+  OED_SplitComm<int,Teuchos::MpiComm<int>> splitcomm(1);
+  dcomm = splitcomm.getDesignComm();
+  ocomm = splitcomm.getSampleComm();
+#else
+  dcomm = ROL::makePtr<Teuchos::SerialComm<int>>();
+  ocomm = Tpetra::getDefaultComm();
+#endif
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  const int drank = dcomm->getRank();
+  const int orank = ocomm->getRank();
+  bool iprint = (argc > 1) && (drank == 0) && (orank == 0);
+  iprint = ((argc > 2) ? true : iprint);
+  ROL::Ptr<std::ostream> outStream
+    = ROL::makeStreamPtr( std::cout, iprint );
+
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+    RealT tol(1e-8);// one(1);
+
+    /*************************************************************************/
+    /******* BUILD LINEAR REGRESSION MODEL BASED ON HELMHOLTZ ****************/
+    /*************************************************************************/
+    /*** Read in XML input ***/
+    auto parlist = ROL::getParametersFromXmlFile("input.xml");
+    int      numSides = 4;
+    bool   checkDeriv = parlist->sublist("Problem").get("Check Derivatives",  false);
+    int     verbosity = parlist->sublist("General").get("Print Verbosity",0);
+    bool      alPrint = parlist->sublist("Step").sublist("Augmented Lagrangian").get("Print Intermediate Optimization History",false);
+    verbosity         = (iprint ? verbosity : 0);
+    alPrint           = (iprint ? alPrint : false);
+    parlist->sublist("General").set("Print Verbosity",verbosity);
+    parlist->sublist("Step").sublist("Augmented Lagrangian").set("Print Intermediate Optimization History",alPrint);
+
+    // Initialize PDE describing Poisson equation.
+    auto meshMgr = ROL::makePtr<MeshManager_OED_Poisson<RealT,DeviceT>>(*parlist);
+    auto pde = ROL::makePtr<PDE_OED_Poisson<RealT,DeviceT>>(*parlist);
+    auto con = ROL::makePtr<Linear_PDE_Constraint<RealT,DeviceT>>(pde,meshMgr,dcomm,*parlist,*outStream);
+    auto assembler = con->getAssembler();
+    con->printMeshData(*outStream);
+
+    // Create state vector.
+    auto u_ptr = assembler->createStateVector();
+    auto r_ptr = assembler->createResidualVector();
+    auto up = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(u_ptr,pde,assembler,*parlist);
+    auto rp = ROL::makePtr<PDE_DualSimVector<RealT,DeviceT>>(r_ptr,pde,assembler,*parlist);
+    auto zp = ROL::makePtr<ROL::StdVector<RealT>>(numSides);
+
+    // Create observation function
+    auto qoi = ROL::makePtr<QoI_Poisson_Observation<RealT,DeviceT>>(pde->getFE(),*parlist);
+    auto obs = ROL::makePtr<PDE_Objective<RealT,DeviceT>>(qoi,assembler);
+
+    // Run derivative checks
+    if ( checkDeriv ) {
+      up->randomize();  zp->randomize();
+      auto dup = up->clone(); dup->randomize();
+      auto dzp = zp->clone(); dzp->randomize();
+      ROL::Vector_SimOpt<RealT> uz(up,zp), duz(dup,dzp);
+      std::vector<RealT> param(2,0);
+      obs->setParameter(param);
+      *outStream << "\n\nCheck Jacobian_1 of PDE Constraint\n";
+      con->checkApplyJacobian_1(*up,*zp,*dup,*rp,true,*outStream);
+      *outStream << "\n\nCheck Jacobian_2 of PDE Constraint\n";
+      con->checkApplyJacobian_2(*up,*zp,*dzp,*rp,true,*outStream);
+      *outStream << "\n\nCheck Gradient of Full Objective Function\n";
+      obs->checkGradient(uz,duz,true,*outStream);
+      *outStream << "\n\nCheck Hessian of Full Objective Function\n";
+      obs->checkHessVec(uz,duz,true,*outStream);
+    }
+
+    // Compute states for factor decompostion and build regression model
+    *outStream << std::endl << "Begin: PDE Solves" << std::endl;
+    std::vector<ROL::Ptr<ROL::Vector<RealT>>> state(numSides);
+    for (int i = 0; i < numSides; ++i) {
+      std::clock_t time_r = std::clock();
+      con->solve(*rp,*up,*zp->basis(i),tol);
+      state[i] = up->clone(); state[i]->set(*up);
+      std::stringstream real;
+      real << "state_" << i << ".txt";
+      con->outputTpetraVector(u_ptr,real.str());
+      *outStream
+        << "  Solve " << i << " time:      "
+        << static_cast<RealT>(std::clock()-time_r)/static_cast<RealT>(CLOCKS_PER_SEC)
+        << std::endl;
+    }
+    *outStream << "End: PDE Solves" << std::endl << std::endl;
+
+    // Build nonlinear model and parameter vector
+    auto theta = ROL::makePtr<ROL::StdVector<RealT>>(numSides,1);
+    auto model = ROL::makePtr<ROL::Reduced_Objective_SimOpt<RealT>>(obs,con,up,theta,up,false);
+
+    /*************************************************************************/
+    /******* BUILD EXPERIMENTAL DESIGN PROBLEM AND SOLVE *********************/
+    /*************************************************************************/
+    // Build samplers for experiment space
+    int nsampd     = parlist->sublist("Problem").sublist("Design").get("Number of Samples",    5000);
+    int nsampo     = parlist->sublist("Problem").sublist("Objective").get("Number of Samples", 200);
+    std::vector<std::vector<RealT>> bounds(2);
+    bounds[0] = {RealT(0), RealT(1)};
+    bounds[1] = {RealT(0), RealT(1)};
+    auto dbman = ROL::makePtr<ROL::StdTeuchosBatchManager<RealT,int>>(dcomm);
+    auto obman = ROL::makePtr<ROL::StdTeuchosBatchManager<RealT,int>>(ocomm);
+    auto dsampler = ROL::makePtr<ROL::MonteCarloGenerator<RealT>>(nsampd,bounds,dbman,false,false,0,1);
+    auto osampler = ROL::makePtr<ROL::MonteCarloGenerator<RealT>>(nsampo,bounds,obman,false,false,0,nsampd+1);
+
+    // Build OED problem factory.
+    bool useUIG, homNoise, useBudget;
+    std::string regType, dtype;
+    homNoise  = parlist->sublist("Problem").get("Homoscedastic Noise",true);
+    regType   = parlist->sublist("Problem").get("Regression Type","Least Squares"); 
+    useBudget = parlist->sublist("Problem").get("Use Budget Constraint",false);
+    auto type = ROL::OED::StringToRegressionType(regType);
+    auto noise   = ROL::makePtr<Poisson_Noise<RealT>>();
+    auto cov     = ROL::makePtr<ROL::OED::StdMomentOperator<RealT>>(type,homNoise,noise);
+    auto factory = ROL::makePtr<ROL::OED::Factory<RealT>>(model,dsampler,theta,cov,*parlist);
+    ROL::Ptr<ROL::Vector<RealT>> cost;
+    if (useBudget) {
+      cost = factory->getDesign()->dual().clone();
+      const RealT half(0.5), c0(1), c1(8);
+      std::vector<RealT> pt;
+      for (int i = 0; i < dsampler->numMySamples(); ++i) {
+        pt = dsampler->getMyPoint(i);
+        RealT norm(0);
+        for (const auto x : pt) norm = std::max(norm,std::abs(x-half));
+        ROL::staticPtrCast<ROL::ProbabilityVector<RealT>>(cost)->setProbability(i,c0+c1*norm);
+      }
+      RealT budget(50);
+      factory->setBudgetConstraint(cost,budget);
+    }
+    
+    obman->barrier();
+
+    // Print factors
+    bool printFactors = parlist->sublist("Problem").get("Print Factors",false);
+    if (printFactors) {
+      std::stringstream factName_d, noisName_d, factName_o, noisName_o;
+      std::ofstream factFile_d, noisFile_d, factFile_o, noisFile_o;
+      ROL::Ptr<ROL::Vector<RealT>> Fp = zp->dual().clone();
+      std::vector<RealT> pt;
+
+      factName_d << "factors_des_" << dsampler->batchID() << ".txt";
+      noisName_d << "noise_des_" << dsampler->batchID() << ".txt";
+      factFile_d.open(factName_d.str());
+      noisFile_d.open(noisName_d.str());
+      factFile_d << std::scientific << std::setprecision(15);
+      noisFile_d << std::scientific << std::setprecision(15);
+      for (int i = 0; i < dsampler->numMySamples(); ++i) {
+        pt = dsampler->getMyPoint(i);
+        factory->getFactors()->evaluate(*Fp,pt);
+        for (int j = 0; j < numSides; ++j) {
+          factFile_d << std::right << std::setw(25)
+                     << (*ROL::dynamicPtrCast<ROL::StdVector<RealT>>(Fp)->getVector())[j];
+        }
+        factFile_d << std::endl;
+        for (int j = 0; j < 2; ++j) {
+          noisFile_d << std::right << std::setw(25) << pt[j];
+        }
+        noisFile_d << std::right << std::setw(25) << noise->evaluate(pt) << std::endl;
+      }
+      factFile_d.close();
+      noisFile_d.close();
+
+      factName_o << "factors_opt_" << osampler->batchID() << ".txt";
+      noisName_o << "noise_opt_" << osampler->batchID() << ".txt";
+      factFile_o.open(factName_o.str());
+      noisFile_o.open(noisName_o.str());
+      factFile_o << std::scientific << std::setprecision(15);
+      noisFile_o << std::scientific << std::setprecision(15);
+      for (int i = 0; i < osampler->numMySamples(); ++i) {
+        pt = osampler->getMyPoint(i);
+        factory->getFactors()->evaluate(*Fp,pt);
+        for (int j = 0; j < numSides; ++j) {
+          factFile_o << std::right << std::setw(25)
+                     << (*ROL::dynamicPtrCast<ROL::StdVector<RealT>>(Fp)->getVector())[j];
+        }
+        factFile_o << std::endl;
+        for (int j = 0; j < 2; ++j) {
+          noisFile_o << std::right << std::setw(25) << pt[j];
+        }
+        noisFile_o << std::right << std::setw(25) << noise->evaluate(pt) << std::endl;
+      }
+      factFile_o.close();
+      noisFile_o.close();
+    }
+
+    // Solve OED problem
+    dtype = parlist->sublist("OED").get("Optimality Type","A");
+    useUIG = parlist->sublist("Problem").sublist("D-Optimal").get("Uniform Initial Guess", false);
+    solve<RealT>(factory,dsampler,osampler,*parlist,dtype,*outStream,useUIG,checkDeriv);
+
+    // Get a summary from the time monitor.
+    Teuchos::TimeMonitor::summarize();
+  }
+  catch (std::logic_error &err) {
+    std::cout << "Design MPI Rank = " << drank
+              << "  Objective MPI Rank = " << orank
+              << std::endl << err.what() << std::endl; 
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/OED/example/poisson/src/meshK.hpp b/packages/rol/example/PDE-OPT/OED/example/poisson/src/meshK.hpp
new file mode 100644
index 000000000000..f1db50b0c093
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/OED/example/poisson/src/meshK.hpp
@@ -0,0 +1,87 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+#ifndef OED_POISSON_MESHK_HPP
+#define OED_POISSON_MESHK_HPP
+
+#include "../../../../TOOLS/meshmanagerK.hpp"
+
+template <class Real, class DeviceType>
+class MeshManager_OED_Poisson : public MeshManager_Rectangle<Real,DeviceType> {
+
+private:
+
+  int nx_;
+  int ny_;
+  ROL::Ptr<std::vector<std::vector<std::vector<int>>>>  meshSideSets_;
+
+public: 
+
+  MeshManager_OED_Poisson(ROL::ParameterList &parlist) : MeshManager_Rectangle<Real,DeviceType>(parlist)
+  {
+    nx_ = parlist.sublist("Geometry").get("NX", 3);
+    ny_ = parlist.sublist("Geometry").get("NY", 3);
+    computeSideSets();
+  }
+
+
+  void computeSideSets() {
+
+    int numSideSets = 4;
+    meshSideSets_ = ROL::makePtr<std::vector<std::vector<std::vector<int>>>>(numSideSets);
+
+    // Neumann 0
+    (*meshSideSets_)[0].resize(4);
+    (*meshSideSets_)[0][0].resize(nx_);
+    (*meshSideSets_)[0][1].resize(0);
+    (*meshSideSets_)[0][2].resize(0);
+    (*meshSideSets_)[0][3].resize(0);
+    // Neumann 1
+    (*meshSideSets_)[1].resize(4);
+    (*meshSideSets_)[1][0].resize(0);
+    (*meshSideSets_)[1][1].resize(ny_);
+    (*meshSideSets_)[1][2].resize(0);
+    (*meshSideSets_)[1][3].resize(0);
+    // Neumann 2
+    (*meshSideSets_)[2].resize(4);
+    (*meshSideSets_)[2][0].resize(0);
+    (*meshSideSets_)[2][1].resize(0);
+    (*meshSideSets_)[2][2].resize(nx_);
+    (*meshSideSets_)[2][3].resize(0);
+    // Neumann 3
+    (*meshSideSets_)[3].resize(4);
+    (*meshSideSets_)[3][0].resize(0);
+    (*meshSideSets_)[3][1].resize(0);
+    (*meshSideSets_)[3][2].resize(0);
+    (*meshSideSets_)[3][3].resize(ny_);
+    
+    for (int i=0; i<nx_; ++i)
+      (*meshSideSets_)[0][0][i] = i;
+    for (int i=0; i<ny_; ++i)
+      (*meshSideSets_)[1][1][i] = (i+1)*nx_-1;
+    for (int i=0; i<nx_; ++i)
+      (*meshSideSets_)[2][2][i] = i + nx_*(ny_-1);
+    for (int i=0; i<ny_; ++i)
+      (*meshSideSets_)[3][3][i] = i*nx_;
+
+  } // computeSideSets
+
+  ROL::Ptr<std::vector<std::vector<std::vector<int>>>> getSideSets(
+              const bool verbose = false,
+              std::ostream & outStream = std::cout) const { 
+    if (verbose) {
+      outStream << "Mesh_stoch_adv_diff: getSideSets called" << std::endl;
+      outStream << "Mesh_stoch_adv_diff: numSideSets = "     << meshSideSets_->size() << std::endl;
+    }
+    return meshSideSets_;
+  }
+  
+}; // MeshManager_OED_Poisson
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/OED/example/poisson/src/obj_poissonK.hpp b/packages/rol/example/PDE-OPT/OED/example/poisson/src/obj_poissonK.hpp
new file mode 100644
index 000000000000..776b785e6423
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/OED/example/poisson/src/obj_poissonK.hpp
@@ -0,0 +1,279 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  obj.hpp
+    \brief Provides the interface for local (cell-based) objective function computations.
+*/
+
+#ifndef PDEOPT_QOI_OED_POISSONK_HPP
+#define PDEOPT_QOI_OED_POISSONK_HPP
+
+#include "../../../../TOOLS/qoiK.hpp"
+#include "ROL_StdObjective.hpp"
+#include "pde_poissonK.hpp"
+
+template <class Real,class DeviceType>
+class QoI_Poisson_Observation : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type> fe_;
+  Real width_, coeff_;
+
+  Real observationFunc(const std::vector<Real> &x, const std::vector<Real> &loc, int deriv = 0, int dim1 = 0, int dim2 = 0) const {
+    const Real zero(0), half(0.5), one(1);
+    const int d = x.size();
+    Real dot(0), width2 = std::pow(width_,2);
+    for (int i = 0; i < d; ++i)
+      dot += std::pow((x[i]-loc[i]),2)/width2;
+    Real val = std::exp(-half*dot);
+    if (deriv == 1) {
+      val *= (x[dim1]-loc[dim1])/width2;
+    }
+    else if (deriv == 2) {
+      Real v0 = (dim1==dim2 ? -one/width2 : zero);
+      Real v1 = (x[dim1]-loc[dim1])/width2;
+      Real v2 = (x[dim2]-loc[dim2])/width2;
+      val *= (v0 + v1*v2);
+    }
+    return coeff_ * val;
+  }
+
+  void evaluateObservation(scalar_view &out, const std::vector<Real> &loc, int deriv = 0, int dim1 = 0, int dim2 = 0) const {
+    const int c = fe_->gradN().extent_int(0);
+    const int p = fe_->gradN().extent_int(2);
+    const int d = fe_->gradN().extent_int(3);
+    out = scalar_view("out",c,p);
+    std::vector<Real> x(d);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k)
+          x[k] = (fe_->cubPts())(i,j,k);
+        out(i,j) = observationFunc(x,loc,deriv,dim1,dim2);
+      }
+    }
+  }
+
+public:
+  QoI_Poisson_Observation(const ROL::Ptr<fe_type> &fe,ROL::ParameterList &parlist)
+    : fe_(fe) {
+    width_ = parlist.sublist("Problem").get("Microphone Width",5e-2);
+    coeff_ = static_cast<Real>(1)/(static_cast<Real>(2.0*M_PI)*std::pow(width_,2));
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c = fe_->gradN().extent_int(0);
+    const int p = fe_->gradN().extent_int(2);
+    // Initialize output val
+    val = scalar_view("val", c);
+    // Get components of the state
+    scalar_view u_real("u_real", c, p);
+    fe_->evaluateValue(u_real, u_coeff);
+    // Compute phi
+    scalar_view phi;
+    evaluateObservation(phi,QoI<Real,DeviceType>::getParameter());
+    // Integrate observation
+    fe_->computeIntegral(val,phi,u_real,false);
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c = fe_->gradN().extent_int(0);
+    const int f = fe_->gradN().extent_int(1);
+    // Initialize output grad
+    grad = scalar_view("grad", c, f);
+    // Compute phi
+    scalar_view phi;
+    evaluateObservation(phi,QoI<Real,DeviceType>::getParameter());
+    // Integrate observation derivative
+    fst::integrate(grad,phi,fe_->NdetJ(),false);
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Poisson_Observation::gradient_2 is zero.");
+  }
+
+  std::vector<Real> gradient_3(std::vector<scalar_view> & grad,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Poisson_Observation::gradient_3 is zero.");
+    // Get relevant dimensions
+    const int c = fe_->gradN().extent_int(0);
+    const int p = fe_->gradN().extent_int(2);
+    const int d = fe_->gradN().extent_int(3);
+    // Get components of the state
+    scalar_view u_real("u_real", c, p);
+    fe_->evaluateValue(u_real, u_coeff);
+    for (int i = 0; i < d; ++i) {
+      // Initialize output val
+      grad[i] = scalar_view("grad", c);
+      // Compute phi'
+      scalar_view phi;
+      evaluateObservation(phi,*z_param,1,i);
+      // Integrate observation gradient
+      fe_->computeIntegral(grad[i],phi,u_real,false);
+    }
+    std::vector<Real> empty(d);
+    return empty;
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Poisson::HessVec_11 is zero.");
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Poisson::HessVec_12 is zero.");
+  }
+
+  void HessVec_13(scalar_view & hess,
+                          const ROL::Ptr<const std::vector<Real>> & v_param,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Poisson::HessVec_13 is zero.");
+    // Get relevant dimensions
+    const int c = fe_->gradN().extent_int(0);
+    const int f = fe_->gradN().extent_int(1);
+    const int p = fe_->gradN().extent_int(2);
+    const int d = fe_->gradN().extent_int(3);
+    // Initialize hessian
+    hess = scalar_view("hess", c, f);
+    // Get components of the state
+    scalar_view vphi("vphi", c, p);
+    for (int i = 0; i < d; ++i) {
+      // Compute phi'
+      scalar_view phi;
+      evaluateObservation(phi,*z_param,1,i);
+      // Weight phi'
+      rst::scale(vphi, phi, (*v_param)[i]);
+      // Integrate observation gradient
+      fst::integrate(hess,vphi,fe_->NdetJ(),true);
+    }
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Poisson::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Poisson::HessVec_22 is zero.");
+  }
+
+  void HessVec_23(scalar_view & hess,
+                          const ROL::Ptr<const std::vector<Real>> & v_param,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QOI_Poisson::HessVec_23 is zero.");
+  }
+
+  std::vector<Real> HessVec_31(std::vector<scalar_view> & hess,
+                          const scalar_view v_coeff,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Poisson::HessVec_31 is zero.");
+    // Get relevant dimensions
+    const int c = fe_->gradN().extent_int(0);
+    const int p = fe_->gradN().extent_int(2);
+    const int d = fe_->gradN().extent_int(3);
+    // Get components of the state
+    scalar_view v_real("v_real", c, p);
+    fe_->evaluateValue(v_real, v_coeff);
+    for (int i = 0; i < d; ++i) {
+      // Initialize output hess
+      hess[i] = scalar_view("hess",c);
+      // Compute phi
+      scalar_view phi;
+      evaluateObservation(phi,*z_param,1,i);
+      // Integrate observation
+      fe_->computeIntegral(hess[i],phi,v_real,false);
+    }
+    std::vector<Real> empty(d);
+    return empty;
+  }
+
+  std::vector<Real> HessVec_32(std::vector<scalar_view> & hess,
+                          const scalar_view v_coeff,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Poisson::HessVec_32 is zero.");
+  }
+
+  std::vector<Real> HessVec_33(std::vector<scalar_view> & hess,
+                          const ROL::Ptr<const std::vector<Real>> & v_param,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Poisson::HessVec_33 is zero.");
+    // Get relevant dimensions
+    const int c = fe_->gradN().extent_int(0);
+    const int p = fe_->gradN().extent_int(2);
+    const int d = fe_->gradN().extent_int(3);
+    // Get components of the state
+    scalar_view u_real("u_real", c, p);
+    fe_->evaluateValue(u_real, u_coeff);
+
+    scalar_view vphi("vphi", c, p);
+    for (int i = 0; i < d; ++i) {
+      // Initialize output val
+      hess[i] = scalar_view("hess",c);
+      // Compute phi'
+      Kokkos::deep_copy(vphi,static_cast<Real>(0));
+      for (int j = 0; j < d; ++j) {
+        scalar_view phi;
+        evaluateObservation(phi,*z_param,2,i,j);
+        rst::scale(phi, (*v_param)[j]);
+        rst::add(vphi, phi);
+      }
+      // Integrate observation gradient
+      fe_->computeIntegral(hess[i],vphi,u_real,false);
+    }
+    std::vector<Real> empty(d);
+    return empty;
+  }
+
+}; // QoI_Poisson_Observation
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/OED/example/poisson/src/pde_poissonK.hpp b/packages/rol/example/PDE-OPT/OED/example/poisson/src/pde_poissonK.hpp
new file mode 100644
index 000000000000..b85bea9b09b6
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/OED/example/poisson/src/pde_poissonK.hpp
@@ -0,0 +1,384 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  pde_poisson.hpp
+    \brief Implements the local PDE interface for the optimal control of
+           Poisson.
+*/
+
+#ifndef PDE_OED_POISSONK_HPP
+#define PDE_OED_POISSONK_HPP
+
+#include "../../../../TOOLS/pdeK.hpp"
+#include "../../../../TOOLS/feK.hpp"
+#include "../../../../TOOLS/fieldhelperK.hpp"
+
+#include "Intrepid2_HGRAD_QUAD_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_TRI_C1_FEM.hpp"
+#include "Intrepid2_DefaultCubatureFactory.hpp"
+#include "Intrepid2_FunctionSpaceTools.hpp"
+#include "Intrepid2_CellTools.hpp"
+
+#include "ROL_Ptr.hpp"
+
+
+template <class Real, class DeviceType>
+class PDE_OED_Poisson : public PDE<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  // Finite element basis information
+  basis_ptr basisPtr_;
+  std::vector<basis_ptr> basisPtrs_;
+  // Cell cubature information
+  ROL::Ptr<Intrepid2::Cubature<DeviceType,Real,Real>> cellCub_, bdryCub_;
+  // Cell node information
+  scalar_view volCellNodes_;
+  std::vector<std::vector<scalar_view>> bdryCellNodes_;
+  std::vector<std::vector<std::vector<int>>> bdryCellLocIds_;
+  // Finite element definition
+  ROL::Ptr<fe_type> fe_;
+  std::vector<std::vector<ROL::Ptr<fe_type>>> feBdry_;
+  // Local degrees of freedom on boundary, for each side of the reference cell (first index).
+  std::vector<std::vector<int>> fidx_;
+  // Coordinates of degrees freedom on boundary cells.
+  // Indexing:  [sideset number][local side id](cell number, value at dof)
+  std::vector<std::vector<scalar_view>> bdryCellDofValues_;
+
+  Real diffusivity(const std::vector<Real> &x) const {
+    const Real half(0.5), rad(0.2), k0(10), k1(1e-1);
+    Real norm(0);
+    for (const auto xi : x) norm += (xi-half)*(xi-half);
+    norm = std::sqrt(norm);
+    return (norm <= rad ? k1 : k0);
+  }
+
+  void computeDiffusivity(scalar_view &kappa) const {
+    const int c = fe_->gradN().extent_int(0);
+    const int p = fe_->gradN().extent_int(2);
+    const int d = fe_->gradN().extent_int(3);
+    std::vector<Real> x(d);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k) x[k] = (fe_->cubPts())(i,j,k);
+        kappa(i,j) = diffusivity(x);
+      }
+    }
+  }
+
+  void computeNeumannControl(scalar_view &Bz,
+                             const ROL::Ptr<const std::vector<Real>> &zp,
+                             const int sideset,
+                             const int deriv = 0) const {
+    const int c = Bz.extent_int(0);
+    const int p = Bz.extent_int(1);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        Bz(i,j) = (deriv==0 ? (*zp)[sideset]
+                : (deriv==1 ? static_cast<Real>(1)
+                : static_cast<Real>(0)));
+      }
+    }
+  }
+
+  scalar_view getBoundaryCoeff(const scalar_view cell_coeff, int sideSet, int cell) const {
+    std::vector<int> bdryCellLocId = bdryCellLocIds_[sideSet][cell];
+    const int numCellsSide = bdryCellLocId.size();
+    const int f = basisPtr_->getCardinality();
+    
+    scalar_view bdry_coeff("bdry_coeff", numCellsSide, f);
+    for (int i = 0; i < numCellsSide; ++i) {
+      for (int j = 0; j < f; ++j)
+        bdry_coeff(i, j) = cell_coeff(bdryCellLocId[i], j);
+    }
+    return bdry_coeff;
+  }
+
+public:
+  PDE_OED_Poisson(ROL::ParameterList &parlist) {
+    // Finite element fields.
+    std::string elemtype = parlist.sublist("Geometry").get("Element Shape","quad");
+    if (elemtype == "quad")
+      basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C1_FEM<DeviceType,Real,Real>>();
+    else if (elemtype == "tri")
+      basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_TRI_C1_FEM<DeviceType,Real,Real>>();
+    else
+      throw ROL::Exception::NotImplemented(">>> Element type not implemented.");
+    basisPtrs_.clear(); basisPtrs_.push_back(basisPtr_);  // acoustic pressure
+    // Quadrature rules.
+    shards::CellTopology cellType = basisPtr_->getBaseCellTopology();        // get the cell type from the basis
+    Intrepid2::DefaultCubatureFactory cubFactory;                            // create cubature factory
+    int cubDegree = parlist.sublist("Problem").get("Cubature Degree", 2);    // set cubature degree, e.g., 2
+    cellCub_ = cubFactory.create<DeviceType,Real,Real>(cellType, cubDegree); // create default cubature
+    int d = cellType.getDimension();
+
+    shards::CellTopology bdryCellType = cellType.getCellTopologyData(d-1, 0);
+    int bdryCubDegree = parlist.sublist("Problem").get("Boundary Cubature Degree",2); // set cubature degree, e.g., 2
+    bdryCub_ = cubFactory.create<DeviceType,Real,Real>(bdryCellType, bdryCubDegree);
+  }
+
+  void residual(scalar_view & res,
+                const scalar_view u_coeff,
+                const scalar_view z_coeff = scalar_view(),
+                const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c = fe_->gradN().extent_int(0);
+    const int f = fe_->gradN().extent_int(1);
+    const int p = fe_->gradN().extent_int(2);
+    const int d = fe_->gradN().extent_int(3);
+    // Initialize residuals.
+    res = scalar_view("res",c,f);
+    // Initialize storage
+    scalar_view valU_eval("valU_eval", c, p);
+    scalar_view gradU_eval("gradU_eval", c, p, d);
+    scalar_view kappa("kappa", c, p);
+    scalar_view kappaU("kappaU", c, p, d);
+    // Evaluate/interpolate finite element fields on cells.
+    fe_->evaluateValue(valU_eval, u_coeff);
+    fe_->evaluateGradient(gradU_eval, u_coeff);
+    // Build wave number
+    computeDiffusivity(kappa);
+    fst::scalarMultiplyDataData(kappaU,kappa,gradU_eval);
+
+    /*******************************************************************/
+    /*** Evaluate weak form of the residual.****************************/
+    /*******************************************************************/
+    fst::integrate(res,kappaU,fe_->gradNdetJ(),false);
+
+    // APPLY BOUNDARY CONDITIONS
+    const int numSidesets = bdryCellLocIds_.size();
+    for (int s = 0; s < numSidesets; ++s) {
+      const int numLocalSideIds = bdryCellLocIds_[s].size();
+      const int numCubPerSide = bdryCub_->getNumPoints();
+      for (int j = 0; j < numLocalSideIds; ++j) {
+        const int numCellsSide = bdryCellLocIds_[s][j].size();
+        if (numCellsSide) {
+          scalar_view BCres("BCres", numCellsSide, f);
+          scalar_view BCcomp("BCcomp", numCellsSide, numCubPerSide);
+          // Compute control operator
+          computeNeumannControl(BCcomp,z_param,s,0);
+          // Integrate residual
+          fst::integrate(BCres,BCcomp,feBdry_[s][j]->NdetJ(),false);
+          // Add Robin and Neumann control residual to volume residual
+          for (int k = 0; k < numCellsSide; ++k) {
+            int cidx = bdryCellLocIds_[s][j][k];
+            for (int l = 0; l < f; ++l)
+              res(cidx,l) -= BCres(k,l);
+          }
+        }
+      }
+    }
+  }
+
+  void Jacobian_1(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    int p = fe_->gradN().extent_int(2);
+    int d = fe_->gradN().extent_int(3);
+    // Initialize Jacobians.
+    jac = scalar_view("jac", c, f, f);
+    // Initialize storage.
+    scalar_view kappa("kappa", c, p);
+    scalar_view kappaN("kappaN", c, f, p, d);
+    // Build wave number
+    computeDiffusivity(kappa);
+    fst::scalarMultiplyDataField(kappaN,kappa,fe_->gradN());
+
+    /*** Evaluate weak form of the Jacobian. ***/
+    fst::integrate(jac,kappaN,fe_->gradNdetJ(),false);
+  }
+
+
+  void Jacobian_2(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Helmholtz_OCT::Jacobian_2): Jacobian is zero.");
+  }
+
+  void Jacobian_3(std::vector<scalar_view> & jac,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // GET DIMENSIONS
+    const int c = fe_->gradN().extent_int(0);
+    const int f = fe_->gradN().extent_int(1);
+    // ADD CONTROL TERM TO RESIDUAL
+    const int numSidesets = bdryCellLocIds_.size();
+    for (int s = 0; s < numSidesets; ++s) {
+      jac[s] = scalar_view("jac", c, f);
+      const int numLocalSideIds = bdryCellLocIds_[s].size();
+      const int numCubPerSide = bdryCub_->getNumPoints();
+      for (int j = 0; j < numLocalSideIds; ++j) {
+        const int numCellsSide = bdryCellLocIds_[s][j].size();
+        if (numCellsSide) {
+          // Compute control operator
+          scalar_view Bz("Bz", numCellsSide, numCubPerSide);
+          computeNeumannControl(Bz,z_param,s,1);
+          // Compute Neumann residual
+          scalar_view neumJac("neumJac", numCellsSide, f);
+          fst::integrate(neumJac,Bz,feBdry_[s][j]->NdetJ(),false);
+          // Add Robin and Neumann control residual to volume residual
+          for (int k = 0; k < numCellsSide; ++k) {
+            int cidx = bdryCellLocIds_[s][j][k];
+            for (int l = 0; l < f; ++l) { 
+              (jac[s])(cidx,l) -= neumJac(k,l);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  void Hessian_11(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Helmholtz_OCT::Hessian_11): Hessian is zero.");
+  }
+
+  void Hessian_12(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Helmholtz_OCT::Hessian_12): Hessian is zero.");
+  }
+
+  void Hessian_13(std::vector<scalar_view> & hess,
+                          const scalar_view l_coeff,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Helmholtz_OCT::Hessian_13): Hessian is zero.");
+  }
+
+  void Hessian_21(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Helmholtz_OCT::Hessian_21): Hessian is zero.");
+  }
+
+  void Hessian_22(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Helmholtz_OCT::Hessian_22): Hessian is zero.");
+  }
+
+  void Hessian_23(std::vector<scalar_view> & hess,
+                          const scalar_view l_coeff,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Helmholtz_OCT::Hessian_23): Hessian is zero.");
+  }
+
+  void Hessian_31(std::vector<scalar_view> & hess,
+                          const scalar_view l_coeff,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Helmholtz_OCT::Hessian_31): Hessian is zero.");
+  }
+
+  void Hessian_32(std::vector<scalar_view> & hess,
+                          const scalar_view l_coeff,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Helmholtz_OCT::Hessian_32): Hessian is zero.");
+  }
+
+  void Hessian_33(std::vector<std::vector<scalar_view>> & hess,
+                          const scalar_view l_coeff,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Helmholtz_OCT::Hessian_33): Hessian is zero.");
+  }
+
+  void RieszMap_1(scalar_view & riesz) override {
+    //throw Exception::NotImplemented(">>> (PDE_TopoOpt::RieszMap_1): Not implemented.");
+    // Retrieve dimensions.
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    // Initialize Riesz Map.
+    riesz = scalar_view("riesz1",c,f,f);
+    Kokkos::deep_copy(riesz,fe_->stiffMat());
+    rst::add(riesz,fe_->massMat());
+  }
+
+  void RieszMap_2(scalar_view & riesz) override {
+    //throw Exception::NotImplemented(">>> (PDE_TopoOpt::RieszMap_2): Not implemented.");
+    // Retrieve dimensions.
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    // Initialize Jacobians.
+    riesz = scalar_view("riesz2",c,f,f);
+    Kokkos::deep_copy(riesz,fe_->massMat());
+  }
+
+  std::vector<basis_ptr> getFields() override {
+    return basisPtrs_;
+  }
+
+  void setCellNodes(const scalar_view &volCellNodes,
+                    const std::vector<std::vector<scalar_view>> &bdryCellNodes,
+                    const std::vector<std::vector<std::vector<int>>> &bdryCellLocIds) override {
+    volCellNodes_ = volCellNodes;
+    bdryCellNodes_ = bdryCellNodes;
+    bdryCellLocIds_ = bdryCellLocIds;
+    // Finite element definition.
+    fe_ = ROL::makePtr<fe_type>(volCellNodes_,basisPtr_,cellCub_);
+    fidx_ = fe_->getBoundaryDofs();
+    // Construct boundary FE
+    const int numSidesets = bdryCellNodes.size();
+    feBdry_.resize(numSidesets);
+    for (int s = 0; s < numSidesets; ++s) {
+      const int numLocSides = bdryCellNodes[s].size();
+      feBdry_[s].resize(numLocSides);
+      for (int j = 0; j < numLocSides; ++j) {
+        if (bdryCellNodes[s][j] != scalar_view()) {
+          feBdry_[s][j] = ROL::makePtr<fe_type>(bdryCellNodes[s][j],basisPtr_,bdryCub_,j);
+        }
+      }
+    }
+  }
+
+  const ROL::Ptr<fe_type> getFE(void) const {
+    return fe_;
+  }
+
+  const std::vector<std::vector<ROL::Ptr<fe_type>>> getBdryFE(void) const {
+    return feBdry_;
+  }
+
+  const std::vector<std::vector<int>> getBdryCellLocIds(const int sideset = 0) const {
+    return bdryCellLocIds_[sideset];
+  }
+
+}; // PDE_Helmholtz_OCT
+
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/TEST/CMakeLists.txt b/packages/rol/example/PDE-OPT/TEST/CMakeLists.txt
index 623bc95bb847..549809721dd9 100644
--- a/packages/rol/example/PDE-OPT/TEST/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/TEST/CMakeLists.txt
@@ -1,8 +1,7 @@
 
-IF(${PROJECT_NAME}_ENABLE_Intrepid AND 
-   ${PROJECT_NAME}_ENABLE_Epetra )
+IF(${PROJECT_NAME}_ENABLE_Intrepid)
 
-  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  ROL_ADD_EXECUTABLE_AND_TEST(
     test_01
     SOURCES test_01.cpp
     ARGS PrintItAll
@@ -12,7 +11,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  ROL_ADD_EXECUTABLE_AND_TEST(
     test_02
     SOURCES test_02.cpp
     ARGS PrintItAll
@@ -22,7 +21,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  ROL_ADD_EXECUTABLE_AND_TEST(
     test_03
     SOURCES test_03.cpp
     ARGS PrintItAll
@@ -32,7 +31,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  ROL_ADD_EXECUTABLE_AND_TEST(
     test_04
     SOURCES test_04.cpp
     ARGS PrintItAll
@@ -42,7 +41,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  ROL_ADD_EXECUTABLE_AND_TEST(
     test_05
     SOURCES test_05.cpp
     ARGS PrintItAll
@@ -52,7 +51,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  ROL_ADD_EXECUTABLE_AND_TEST(
     test_07
     SOURCES test_07.cpp
     ARGS PrintItAll
@@ -62,7 +61,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  ROL_ADD_EXECUTABLE_AND_TEST(
     test_08
     SOURCES test_08.cpp
     ARGS PrintItAll
@@ -72,7 +71,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  ROL_ADD_EXECUTABLE_AND_TEST(
     test_09
     SOURCES test_09.cpp
     ARGS PrintItAll
@@ -82,7 +81,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     TestDataCopy
     SOURCE_FILES
       input_01.xml input_02.xml input_03.xml input_04.xml input_05.xml input_07.xml input_08.xml input_09.xml cube.txt L-cubes.txt
@@ -92,9 +91,101 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
 
 ENDIF()
 
+IF(${PROJECT_NAME}_ENABLE_Intrepid2)
+
+  ROL_ADD_EXECUTABLE_AND_TEST(
+    test_01_Kokkos
+    SOURCES test_01K.cpp
+    ARGS PrintItAll
+    NUM_MPI_PROCS 1
+    NUM_TOTAL_CORES_USED 1
+    PASS_REGULAR_EXPRESSION "TEST PASSED"
+    ADD_DIR_TO_NAME
+  )
+
+  ROL_ADD_EXECUTABLE_AND_TEST(
+    test_02_Kokkos
+    SOURCES test_02K.cpp
+    ARGS PrintItAll
+    NUM_MPI_PROCS 1
+    NUM_TOTAL_CORES_USED 1
+    PASS_REGULAR_EXPRESSION "TEST PASSED"
+    ADD_DIR_TO_NAME
+  )
+
+  ROL_ADD_EXECUTABLE_AND_TEST(
+    test_03_Kokkos
+    SOURCES test_03K.cpp
+    ARGS PrintItAll
+    NUM_MPI_PROCS 1
+    NUM_TOTAL_CORES_USED 1
+    PASS_REGULAR_EXPRESSION "TEST PASSED"
+    ADD_DIR_TO_NAME
+  )
+
+  ROL_ADD_EXECUTABLE_AND_TEST(
+    test_04_Kokkos
+    SOURCES test_04K.cpp
+    ARGS PrintItAll
+    NUM_MPI_PROCS 1
+    NUM_TOTAL_CORES_USED 1
+    PASS_REGULAR_EXPRESSION "TEST PASSED"
+    ADD_DIR_TO_NAME
+  )
+
+  ROL_ADD_EXECUTABLE_AND_TEST(
+    test_05_Kokkos
+    SOURCES test_05K.cpp
+    ARGS PrintItAll
+    NUM_MPI_PROCS 1
+    NUM_TOTAL_CORES_USED 1
+    PASS_REGULAR_EXPRESSION "TEST PASSED"
+    ADD_DIR_TO_NAME
+  )
+
+  ROL_ADD_EXECUTABLE_AND_TEST(
+    test_07_Kokkos
+    SOURCES test_07K.cpp
+    ARGS PrintItAll
+    NUM_MPI_PROCS 1
+    NUM_TOTAL_CORES_USED 1
+    PASS_REGULAR_EXPRESSION "TEST PASSED"
+    ADD_DIR_TO_NAME
+  )
+
+  ROL_ADD_EXECUTABLE_AND_TEST(
+    test_08_Kokkos
+    SOURCES test_08K.cpp
+    ARGS PrintItAll
+    NUM_MPI_PROCS 1
+    NUM_TOTAL_CORES_USED 1
+    PASS_REGULAR_EXPRESSION "TEST PASSED"
+    ADD_DIR_TO_NAME
+  )
+
+  ROL_ADD_EXECUTABLE_AND_TEST(
+    test_09_Kokkos
+    SOURCES test_09K.cpp
+    ARGS PrintItAll
+    NUM_MPI_PROCS 1
+    NUM_TOTAL_CORES_USED 1
+    PASS_REGULAR_EXPRESSION "TEST PASSED"
+    ADD_DIR_TO_NAME
+  )
+
+  ROL_COPY_FILES_TO_BINARY_DIR(
+    TestDataCopyK
+    SOURCE_FILES
+      input_01.xml input_02.xml input_03.xml input_04.xml input_05.xml input_07.xml input_08.xml input_09.xml cube.txt L-cubes.txt
+    SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}"
+    DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
+  )
+
+ENDIF()
+
 
 #IF(${PROJECT_NAME}_ENABLE_Sacado)
-#  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+#  ROL_ADD_EXECUTABLE_AND_TEST(
 #    test_06
 #    SOURCES test_06.cpp
 #    ARGS PrintItAll
@@ -104,6 +195,3 @@ ENDIF()
 #    ADD_DIR_TO_NAME
 #  )
 #ENDIF()
-
-
-
diff --git a/packages/rol/example/PDE-OPT/TEST/test_01.cpp b/packages/rol/example/PDE-OPT/TEST/test_01.cpp
index de3b344d35d9..2532ec5821a1 100644
--- a/packages/rol/example/PDE-OPT/TEST/test_01.cpp
+++ b/packages/rol/example/PDE-OPT/TEST/test_01.cpp
@@ -17,7 +17,7 @@
 #include "ROL_Vector_SimOpt.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Teuchos_XMLParameterListHelpers.hpp"
 
 #include "Intrepid_HGRAD_QUAD_C1_FEM.hpp"
@@ -33,7 +33,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
   ROL::Ptr<std::ostream> outStream;
@@ -59,7 +59,7 @@ int main(int argc, char *argv[]) {
     ROL::Ptr<Intrepid::FieldContainer<RealT> > nodesPtr = meshmgr.getNodes();
     ROL::Ptr<Intrepid::FieldContainer<int> >   cellToNodeMapPtr = meshmgr.getCellToNodeMap();
     ROL::Ptr<Intrepid::FieldContainer<int> >   cellToEdgeMapPtr = meshmgr.getCellToEdgeMap();
-    ROL::Ptr<std::vector<std::vector<std::vector<int> > > > sideSetsPtr = meshmgr.getSideSets(); 
+    ROL::Ptr<std::vector<std::vector<std::vector<int> > > > sideSetsPtr = meshmgr.getSideSets();
 
     Intrepid::FieldContainer<RealT> &nodes = *nodesPtr;
     Intrepid::FieldContainer<int>   &cellToNodeMap = *cellToNodeMapPtr;
diff --git a/packages/rol/example/PDE-OPT/TEST/test_01K.cpp b/packages/rol/example/PDE-OPT/TEST/test_01K.cpp
new file mode 100644
index 000000000000..b347a1539c18
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/TEST/test_01K.cpp
@@ -0,0 +1,166 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  test_01.cpp
+    \brief Unit test for the mesh manager and the degree-of-freedom manager.
+           Mesh type: RECTANGLE with QUAD CELLS and HGRAD SPACE.
+*/
+
+#include "ROL_Algorithm.hpp"
+#include "ROL_BoundConstraint_SimOpt.hpp"
+#include "ROL_Vector_SimOpt.hpp"
+
+#include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
+#include "Teuchos_XMLParameterListHelpers.hpp"
+
+#include "Intrepid2_HGRAD_QUAD_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_QUAD_C2_FEM.hpp"
+
+#include <Kokkos_Core.hpp>
+
+#include <iostream>
+#include <algorithm>
+
+#include "../TOOLS/meshmanagerK.hpp"
+#include "../TOOLS/dofmanagerK.hpp"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+
+using scalar_view = typename MeshManager<RealT, DeviceT>::scalar_view;
+using int_view = typename MeshManager<RealT, DeviceT>::int_view;
+using basis_ptr = Intrepid2::BasisPtr<DeviceT, RealT, RealT>;
+
+int main(int argc, char *argv[]) {
+
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
+  Kokkos::ScopeGuard kokkosScope(argc, argv);
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+  if (iprint > 0)
+    outStream = ROL::makePtrFromRef(std::cout);
+  else
+    outStream = ROL::makePtrFromRef(bhs);
+
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+
+    /*** Read in XML input ***/
+    std::string filename = "input_01.xml";
+    Teuchos::RCP<Teuchos::ParameterList> parlist
+      = Teuchos::rcp( new Teuchos::ParameterList() );
+    Teuchos::updateParametersFromXmlFile( filename, parlist.ptr() );
+
+    /*** Initialize mesh / degree-of-freedom manager. ***/
+    MeshManager_Rectangle<RealT, DeviceT> meshmgr(*parlist);
+    scalar_view nodes = meshmgr.getNodes();
+    int_view cellToNodeMap = meshmgr.getCellToNodeMap();
+    int_view cellToEdgeMap = meshmgr.getCellToEdgeMap();
+    ROL::Ptr<std::vector<std::vector<std::vector<int> > > > sideSetsPtr = meshmgr.getSideSets();
+
+    std::vector<std::vector<std::vector<int> > >  &sideSets = *sideSetsPtr;
+    *outStream << "Number of nodes = " << meshmgr.getNumNodes() << std::endl; // << nodes;
+    *outStream << "Number of cells = " << meshmgr.getNumCells() << std::endl; // << cellToNodeMap;
+    *outStream << "Number of edges = " << meshmgr.getNumEdges() << std::endl; // << cellToEdgeMap;
+    // Print mesh info to file.
+    std::ofstream meshfile;
+    meshfile.open("cells.txt");
+    for (int i=0; i<cellToNodeMap.extent_int(0); ++i) {
+      meshfile << cellToNodeMap(i,0) << "  "
+               << cellToNodeMap(i,1) << "  "
+               << cellToNodeMap(i,2) << "  "
+               << cellToNodeMap(i,3) << std::endl;
+    }
+    meshfile.close();
+    meshfile.open("edges.txt");
+    for (int i=0; i<cellToEdgeMap.extent_int(0); ++i) {
+      meshfile << cellToEdgeMap(i,0) << "  "
+               << cellToEdgeMap(i,1) << "  "
+               << cellToEdgeMap(i,2) << "  "
+               << cellToEdgeMap(i,3) << std::endl;
+    }
+    meshfile.close();
+    meshfile.open("nodes.txt");
+    for (int i=0; i<nodes.extent_int(0); ++i) {
+      meshfile << nodes(i,0) << "  "
+               << nodes(i,1) << std::endl;
+    }
+    meshfile.close();
+    meshfile.open("sideset.txt");
+    for (int i=0; i<static_cast<int>(sideSets.size()); ++i) {
+      for (int j=0; j<static_cast<int>(sideSets[i].size()); ++j) {
+        if (sideSets[i][j].size() > 0) {
+          for (int k=0; k<static_cast<int>(sideSets[i][j].size()); ++k) {
+            meshfile << sideSets[i][j][k] << std::endl;
+          }
+        }
+        meshfile << std::endl << std::endl;
+      }
+    }
+    meshfile.close();
+
+    basis_ptr basisPtrQ1 = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C1_FEM<DeviceT, RealT, RealT>>();
+    basis_ptr basisPtrQ2 = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C2_FEM<DeviceT, RealT, RealT>>();
+
+    std::vector<basis_ptr> basisPtrs(3, ROL::nullPtr);
+    basisPtrs[0] = basisPtrQ2;
+    basisPtrs[1] = basisPtrQ1;
+    basisPtrs[2] = basisPtrQ2;
+
+    ROL::Ptr<MeshManager<RealT, DeviceT> > meshmgrPtr = ROL::makePtrFromRef(meshmgr);
+
+    DofManager<RealT, DeviceT> dofmgr(meshmgrPtr, basisPtrs);
+
+    *outStream << "Number of node dofs = " << dofmgr.getNumNodeDofs() << std::endl; // << dofmgr.getNodeDofs();
+    *outStream << "Number of edge dofs = " << dofmgr.getNumEdgeDofs() << std::endl; // << dofmgr.getEdgeDofs();
+    *outStream << "Number of face dofs = " << dofmgr.getNumFaceDofs() << std::endl; // << dofmgr.getFaceDofs();
+    *outStream << "Number of void dofs = " << dofmgr.getNumVoidDofs() << std::endl; // << dofmgr.getVoidDofs();
+    *outStream << "Total number of dofs = " << dofmgr.getNumDofs() << std::endl; // << dofmgr.getCellDofs();
+
+    std::vector<std::vector<int>> fieldPattern = dofmgr.getFieldPattern();
+    for (int i=0; i<dofmgr.getNumFields(); ++i) {
+      *outStream << "\nField " << i << " pattern:   ";
+      for (int j=0; j<dofmgr.getLocalFieldSize(i); ++j) {
+        *outStream << fieldPattern[i][j] << " ";
+      }
+      *outStream << std::endl;
+    }
+
+    for (int i=0; i<dofmgr.getNumFields(); ++i) {
+      *outStream << "\nField  " << i << std::endl;
+      // *outStream << dofmgr.getFieldDofs(i);
+    }
+
+    bool correct = true;
+    static const int checkDofs[] = {20, 23, 35, 32, 55, 63, 69, 61, 81};
+    for (int i=0; i<dofmgr.getLocalFieldSize(2); ++i) {
+      correct = correct && ( (dofmgr.getFieldDofs(2))(5,i) == checkDofs[i] );
+    }
+    if (!correct) {
+      errorFlag = -1;
+    }
+
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/TEST/test_02.cpp b/packages/rol/example/PDE-OPT/TEST/test_02.cpp
index 18045cc5d57f..d8f9a53520c7 100644
--- a/packages/rol/example/PDE-OPT/TEST/test_02.cpp
+++ b/packages/rol/example/PDE-OPT/TEST/test_02.cpp
@@ -17,7 +17,7 @@
 #include "ROL_Vector_SimOpt.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Teuchos_XMLParameterListHelpers.hpp"
 
 #include "Intrepid_HGRAD_QUAD_C1_FEM.hpp"
@@ -33,7 +33,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
   ROL::Ptr<std::ostream> outStream;
@@ -59,7 +59,7 @@ int main(int argc, char *argv[]) {
     ROL::Ptr<Intrepid::FieldContainer<RealT> > nodesPtr = meshmgr.getNodes();
     ROL::Ptr<Intrepid::FieldContainer<int> >   cellToNodeMapPtr = meshmgr.getCellToNodeMap();
     ROL::Ptr<Intrepid::FieldContainer<int> >   cellToEdgeMapPtr = meshmgr.getCellToEdgeMap();
-    ROL::Ptr<std::vector<std::vector<std::vector<int> > > > sideSetsPtr = meshmgr.getSideSets(); 
+    ROL::Ptr<std::vector<std::vector<std::vector<int> > > > sideSetsPtr = meshmgr.getSideSets();
 
     Intrepid::FieldContainer<RealT> &nodes = *nodesPtr;
     Intrepid::FieldContainer<int>   &cellToNodeMap = *cellToNodeMapPtr;
diff --git a/packages/rol/example/PDE-OPT/TEST/test_02K.cpp b/packages/rol/example/PDE-OPT/TEST/test_02K.cpp
new file mode 100644
index 000000000000..a387506d4e58
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/TEST/test_02K.cpp
@@ -0,0 +1,164 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  test_02.cpp
+    \brief Unit test for the mesh manager and the degree-of-freedom manager.
+           Mesh type: BACKWARD-FACING STEP with QUAD CELLS and HGRAD SPACE.
+*/
+
+#include "ROL_Algorithm.hpp"
+#include "ROL_BoundConstraint_SimOpt.hpp"
+#include "ROL_Vector_SimOpt.hpp"
+
+#include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
+#include "Teuchos_XMLParameterListHelpers.hpp"
+
+#include "Intrepid2_HGRAD_QUAD_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_QUAD_C2_FEM.hpp"
+
+#include <iostream>
+#include <algorithm>
+
+#include "../TOOLS/meshmanagerK.hpp"
+#include "../TOOLS/dofmanagerK.hpp"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+
+using scalar_view = typename MeshManager<RealT, DeviceT>::scalar_view;
+using int_view = typename MeshManager<RealT, DeviceT>::int_view;
+using basis_ptr = Intrepid2::BasisPtr<DeviceT, RealT, RealT>;
+
+int main(int argc, char *argv[]) {
+
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
+  Kokkos::ScopeGuard kokkosScope(argc, argv);
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+  if (iprint > 0)
+    outStream = ROL::makePtrFromRef(std::cout);
+  else
+    outStream = ROL::makePtrFromRef(bhs);
+
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+
+    /*** Read in XML input ***/
+    std::string filename = "input_02.xml";
+    Teuchos::RCP<Teuchos::ParameterList> parlist
+      = Teuchos::rcp( new Teuchos::ParameterList() );
+    Teuchos::updateParametersFromXmlFile( filename, parlist.ptr() );
+
+    /*** Initialize mesh / degree-of-freedom manager. ***/
+    MeshManager_BackwardFacingStepChannel<RealT, DeviceT> meshmgr(*parlist);
+    scalar_view nodes = meshmgr.getNodes();
+    int_view cellToNodeMap = meshmgr.getCellToNodeMap();
+    int_view cellToEdgeMap = meshmgr.getCellToEdgeMap();
+    ROL::Ptr<std::vector<std::vector<std::vector<int> > > > sideSetsPtr = meshmgr.getSideSets();
+
+    std::vector<std::vector<std::vector<int> > >  &sideSets = *sideSetsPtr;
+    *outStream << "Number of nodes = " << meshmgr.getNumNodes() << std::endl; // << nodes;
+    *outStream << "Number of cells = " << meshmgr.getNumCells() << std::endl; // << cellToNodeMap;
+    *outStream << "Number of edges = " << meshmgr.getNumEdges() << std::endl; // << cellToEdgeMap;
+    // Print mesh info to file.
+    std::ofstream meshfile;
+    meshfile.open("cells.txt");
+    for (int i=0; i<cellToNodeMap.extent_int(0); ++i) {
+      meshfile << cellToNodeMap(i,0) << "  "
+               << cellToNodeMap(i,1) << "  "
+               << cellToNodeMap(i,2) << "  "
+               << cellToNodeMap(i,3) << std::endl;
+    }
+    meshfile.close();
+    meshfile.open("edges.txt");
+    for (int i=0; i<cellToEdgeMap.extent_int(0); ++i) {
+      meshfile << cellToEdgeMap(i,0) << "  "
+               << cellToEdgeMap(i,1) << "  "
+               << cellToEdgeMap(i,2) << "  "
+               << cellToEdgeMap(i,3) << std::endl;
+    }
+    meshfile.close();
+    meshfile.open("nodes.txt");
+    for (int i=0; i<nodes.extent_int(0); ++i) {
+      meshfile << nodes(i,0) << "  "
+               << nodes(i,1) << std::endl;
+    }
+    meshfile.close();
+    meshfile.open("sideset.txt");
+    for (int i=0; i<static_cast<int>(sideSets.size()); ++i) {
+      for (int j=0; j<static_cast<int>(sideSets[i].size()); ++j) {
+        if (sideSets[i][j].size() > 0) {
+          for (int k=0; k<static_cast<int>(sideSets[i][j].size()); ++k) {
+            meshfile << sideSets[i][j][k] << std::endl;
+          }
+        }
+        meshfile << std::endl << std::endl;
+      }
+    }
+    meshfile.close();
+
+    basis_ptr basisPtrQ1 = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C1_FEM<DeviceT, RealT, RealT>>();;
+    basis_ptr basisPtrQ2 = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C2_FEM<DeviceT, RealT, RealT>>();;
+
+    std::vector<basis_ptr> basisPtrs(3, ROL::nullPtr);
+    basisPtrs[0] = basisPtrQ2;
+    basisPtrs[1] = basisPtrQ1;
+    basisPtrs[2] = basisPtrQ2;
+
+    ROL::Ptr<MeshManager<RealT, DeviceT>> meshmgrPtr = ROL::makePtrFromRef(meshmgr);
+
+    DofManager<RealT, DeviceT> dofmgr(meshmgrPtr, basisPtrs);
+
+    *outStream << "Number of node dofs = " << dofmgr.getNumNodeDofs() << std::endl; // << *(dofmgr.getNodeDofs());
+    *outStream << "Number of edge dofs = " << dofmgr.getNumEdgeDofs() << std::endl; // << *(dofmgr.getEdgeDofs());
+    *outStream << "Number of face dofs = " << dofmgr.getNumFaceDofs() << std::endl; // << *(dofmgr.getFaceDofs());
+    *outStream << "Number of void dofs = " << dofmgr.getNumVoidDofs() << std::endl; // << *(dofmgr.getVoidDofs());
+    *outStream << "Total number of dofs = " << dofmgr.getNumDofs() << std::endl;    // << *(dofmgr.getCellDofs());
+
+    std::vector<std::vector<int>> fieldPattern = dofmgr.getFieldPattern();
+    for (int i=0; i<dofmgr.getNumFields(); ++i) {
+      *outStream << "\nField " << i << " pattern:   ";
+      for (int j=0; j<dofmgr.getLocalFieldSize(i); ++j) {
+        *outStream << fieldPattern[i][j] << " ";
+      }
+      *outStream << std::endl;
+    }
+
+    for (int i=0; i<dofmgr.getNumFields(); ++i) {
+      *outStream << "\nField  " << i << std::endl;
+      // *outStream << *(dofmgr.getFieldDofs(i));
+    }
+
+    bool correct = true;
+    static const int checkDofs[] = {149, 152, 176, 173, 350, 366, 380, 364, 468};
+    for (int i=0; i<dofmgr.getLocalFieldSize(2); ++i) {
+      correct = correct && ( (dofmgr.getFieldDofs(2))(43,i) == checkDofs[i] );
+    }
+    if (!correct) {
+      errorFlag = -1;
+    }
+
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/TEST/test_03.cpp b/packages/rol/example/PDE-OPT/TEST/test_03.cpp
index fbfaf7f0f47d..d60155879066 100644
--- a/packages/rol/example/PDE-OPT/TEST/test_03.cpp
+++ b/packages/rol/example/PDE-OPT/TEST/test_03.cpp
@@ -17,7 +17,7 @@
 #include "ROL_Vector_SimOpt.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Teuchos_XMLParameterListHelpers.hpp"
 
 #include "Intrepid_HCURL_HEX_I1_FEM.hpp"
@@ -32,7 +32,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
   ROL::Ptr<std::ostream> outStream;
@@ -58,7 +58,7 @@ int main(int argc, char *argv[]) {
     ROL::Ptr<Intrepid::FieldContainer<RealT> > nodesPtr = meshmgr.getNodes();
     ROL::Ptr<Intrepid::FieldContainer<int> >   cellToNodeMapPtr = meshmgr.getCellToNodeMap();
     ROL::Ptr<Intrepid::FieldContainer<int> >   cellToEdgeMapPtr = meshmgr.getCellToEdgeMap();
-    ROL::Ptr<std::vector<std::vector<std::vector<int> > > > sideSetsPtr = meshmgr.getSideSets(); 
+    ROL::Ptr<std::vector<std::vector<std::vector<int> > > > sideSetsPtr = meshmgr.getSideSets();
 
     Intrepid::FieldContainer<RealT> &nodes = *nodesPtr;
     Intrepid::FieldContainer<int>   &cellToNodeMap = *cellToNodeMapPtr;
diff --git a/packages/rol/example/PDE-OPT/TEST/test_03K.cpp b/packages/rol/example/PDE-OPT/TEST/test_03K.cpp
new file mode 100644
index 000000000000..ff259287a5ae
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/TEST/test_03K.cpp
@@ -0,0 +1,172 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  test_03.cpp
+    \brief Unit test for the mesh manager and the degree-of-freedom manager.
+           Mesh type: BRICK with HEX CELLS and HCURL SPACE.
+*/
+
+#include "ROL_Algorithm.hpp"
+#include "ROL_BoundConstraint_SimOpt.hpp"
+#include "ROL_Vector_SimOpt.hpp"
+
+#include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
+#include "Teuchos_XMLParameterListHelpers.hpp"
+
+#include "Intrepid2_HCURL_HEX_I1_FEM.hpp"
+
+#include <iostream>
+#include <algorithm>
+
+#include "../TOOLS/meshmanagerK.hpp"
+#include "../TOOLS/dofmanagerK.hpp"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+using basis_ptr = Intrepid2::BasisPtr<DeviceT, RealT, RealT>;
+using scalar_view = typename MeshManager<RealT,DeviceT>::scalar_view;
+using int_view = typename MeshManager<RealT,DeviceT>::int_view;
+
+int main(int argc, char *argv[]) {
+
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
+  Kokkos::ScopeGuard kokkosScope(argc, argv);
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+  if (iprint > 0)
+    outStream = ROL::makePtrFromRef(std::cout);
+  else
+    outStream = ROL::makePtrFromRef(bhs);
+
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+
+    /*** Read in XML input ***/
+    std::string filename = "input_03.xml";
+    Teuchos::RCP<Teuchos::ParameterList> parlist
+      = Teuchos::rcp( new Teuchos::ParameterList() );
+    Teuchos::updateParametersFromXmlFile( filename, parlist.ptr() );
+
+    /*** Initialize mesh / degree-of-freedom manager. ***/
+    MeshManager_Brick<RealT, DeviceT> meshmgr(*parlist);
+    scalar_view nodes = meshmgr.getNodes();
+    int_view cellToNodeMap = meshmgr.getCellToNodeMap();
+    int_view cellToEdgeMap = meshmgr.getCellToEdgeMap();
+    ROL::Ptr<std::vector<std::vector<std::vector<int> > > > sideSetsPtr = meshmgr.getSideSets();
+
+    std::vector<std::vector<std::vector<int> > >  &sideSets = *sideSetsPtr;
+    *outStream << "Number of nodes = " << meshmgr.getNumNodes() << std::endl;  // << nodes;
+    *outStream << "Number of cells = " << meshmgr.getNumCells() << std::endl;  // << cellToNodeMap;
+    *outStream << "Number of edges = " << meshmgr.getNumEdges() << std::endl;  // << cellToEdgeMap;
+    // Print mesh info to file.
+    std::ofstream meshfile;
+    meshfile.open("cells.txt");
+    for (int i=0; i<cellToNodeMap.extent_int(0); ++i) {
+      meshfile << cellToNodeMap(i,0) << "  "
+               << cellToNodeMap(i,1) << "  "
+               << cellToNodeMap(i,2) << "  "
+               << cellToNodeMap(i,3) << "  "
+               << cellToNodeMap(i,4) << "  "
+               << cellToNodeMap(i,5) << "  "
+               << cellToNodeMap(i,6) << "  "
+               << cellToNodeMap(i,7) << std::endl;
+    }
+    meshfile.close();
+    meshfile.open("edges.txt");
+    for (int i=0; i<cellToEdgeMap.extent_int(0); ++i) {
+      meshfile << cellToEdgeMap(i,0)  << "  "
+               << cellToEdgeMap(i,1)  << "  "
+               << cellToEdgeMap(i,2)  << "  "
+               << cellToEdgeMap(i,3)  << "  "
+               << cellToEdgeMap(i,4)  << "  "
+               << cellToEdgeMap(i,5)  << "  "
+               << cellToEdgeMap(i,6)  << "  "
+               << cellToEdgeMap(i,7)  << "  "
+               << cellToEdgeMap(i,8)  << "  "
+               << cellToEdgeMap(i,9)  << "  "
+               << cellToEdgeMap(i,10) << "  "
+               << cellToEdgeMap(i,11) << std::endl;
+    }
+    meshfile.close();
+    meshfile.open("nodes.txt");
+    for (int i=0; i<nodes.extent_int(0); ++i) {
+      meshfile << nodes(i,0) << "  "
+               << nodes(i,1) << "  "
+               << nodes(i,2) << std::endl;
+    }
+    meshfile.close();
+    meshfile.open("sideset.txt");
+    for (int i=0; i<static_cast<int>(sideSets.size()); ++i) {
+      for (int j=0; j<static_cast<int>(sideSets[i].size()); ++j) {
+        if (sideSets[i][j].size() > 0) {
+          for (int k=0; k<static_cast<int>(sideSets[i][j].size()); ++k) {
+            meshfile << sideSets[i][j][k] << std::endl;
+          }
+        }
+        meshfile << std::endl << std::endl;
+      }
+    }
+    meshfile.close();
+
+    basis_ptr basisPtrNedelec1 = ROL::makePtr<Intrepid2::Basis_HCURL_HEX_I1_FEM<DeviceT, RealT, RealT>>();
+
+    std::vector<basis_ptr> basisPtrs(1, ROL::nullPtr);
+    basisPtrs[0] = basisPtrNedelec1;
+
+    ROL::Ptr<MeshManager<RealT, DeviceT>> meshmgrPtr = ROL::makePtrFromRef(meshmgr);
+
+    DofManager<RealT, DeviceT> dofmgr(meshmgrPtr, basisPtrs);
+
+    *outStream << "Number of node dofs = " << dofmgr.getNumNodeDofs() << std::endl;  // << *(dofmgr.getNodeDofs());
+    *outStream << "Number of edge dofs = " << dofmgr.getNumEdgeDofs() << std::endl;  // << *(dofmgr.getEdgeDofs());
+    *outStream << "Number of face dofs = " << dofmgr.getNumFaceDofs() << std::endl;  // << *(dofmgr.getFaceDofs());
+    *outStream << "Number of void dofs = " << dofmgr.getNumVoidDofs() << std::endl;  // << *(dofmgr.getVoidDofs());
+    *outStream << "Total number of dofs = " << dofmgr.getNumDofs() << std::endl;     // << *(dofmgr.getCellDofs());
+
+    std::vector<std::vector<int> > fieldPattern = dofmgr.getFieldPattern();
+    for (int i=0; i<dofmgr.getNumFields(); ++i) {
+      *outStream << "\nField " << i << " pattern:   ";
+      for (int j=0; j<dofmgr.getLocalFieldSize(i); ++j) {
+        *outStream << fieldPattern[i][j] << " ";
+      }
+      *outStream << std::endl;
+    }
+
+    for (int i=0; i<dofmgr.getNumFields(); ++i) {
+      *outStream << "\nField  " << i << std::endl;
+      // *outStream << dofmgr.getFieldDofs(i);
+    }
+
+    bool correct = true;
+    static const int checkDofs[] = {38, 42, 45, 41, 67, 71, 74, 70, 52, 53, 57, 56};
+    for (int i=0; i<dofmgr.getLocalFieldSize(0); ++i) {
+      correct = correct && ( (dofmgr.getFieldDofs(0))(11,i) == checkDofs[i] );
+    }
+    if (!correct) {
+      errorFlag = -1;
+    }
+
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/TEST/test_04.cpp b/packages/rol/example/PDE-OPT/TEST/test_04.cpp
index c2f55d13b57a..5d30da0f542a 100644
--- a/packages/rol/example/PDE-OPT/TEST/test_04.cpp
+++ b/packages/rol/example/PDE-OPT/TEST/test_04.cpp
@@ -14,7 +14,7 @@
 #include "ROL_Types.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Teuchos_XMLParameterListHelpers.hpp"
 
 #include "Intrepid_HGRAD_QUAD_C1_FEM.hpp"
@@ -29,7 +29,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
   ROL::Ptr<std::ostream> outStream;
diff --git a/packages/rol/example/PDE-OPT/TEST/test_04K.cpp b/packages/rol/example/PDE-OPT/TEST/test_04K.cpp
new file mode 100644
index 000000000000..aa02427a02b6
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/TEST/test_04K.cpp
@@ -0,0 +1,145 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  test_04.cpp
+    \brief Unit test for the FE class on quads.
+*/
+
+#include "ROL_Types.hpp"
+
+#include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
+#include "Teuchos_XMLParameterListHelpers.hpp"
+
+#include "Intrepid2_HGRAD_QUAD_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_QUAD_C2_FEM.hpp"
+#include "Intrepid2_DefaultCubatureFactory.hpp"
+
+#include "../TOOLS/meshmanagerK.hpp"
+#include "../TOOLS/dofmanagerK.hpp"
+#include "../TOOLS/feK.hpp"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+using basis_ptr = Intrepid2::BasisPtr<DeviceT, RealT, RealT>;
+using scalar_view = typename MeshManager<RealT,DeviceT>::scalar_view;
+using int_view = typename MeshManager<RealT,DeviceT>::int_view;
+
+int main(int argc, char *argv[]) {
+
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
+  Kokkos::ScopeGuard kokkosScope(argc, argv);
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+  if (iprint > 0)
+    outStream = ROL::makePtrFromRef(std::cout);
+  else
+    outStream = ROL::makePtrFromRef(bhs);
+
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+
+    /*** Read in XML input ***/
+    std::string filename = "input_04.xml";
+    Teuchos::RCP<Teuchos::ParameterList> parlist
+      = Teuchos::rcp( new Teuchos::ParameterList() );
+    Teuchos::updateParametersFromXmlFile( filename, parlist.ptr() );
+
+    /*** Initialize mesh / degree-of-freedom manager. ***/
+    MeshManager_Rectangle<RealT, DeviceT> meshmgr(*parlist);
+    scalar_view nodes = meshmgr.getNodes();
+    int_view cellToNodeMap = meshmgr.getCellToNodeMap();
+
+    *outStream << "Number of nodes = " << meshmgr.getNumNodes() << std::endl;
+    *outStream << "Number of cells = " << meshmgr.getNumCells() << std::endl;
+    *outStream << "Number of edges = " << meshmgr.getNumEdges() << std::endl;
+
+    ROL::Ptr<MeshManager<RealT, DeviceT>> meshmgrPtr = ROL::makePtrFromRef(meshmgr);
+
+    // Basis.
+    basis_ptr basis = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C2_FEM<DeviceT, RealT, RealT>>();
+    // Cubature.
+    ROL::Ptr<Intrepid2::Cubature<DeviceT, RealT, RealT>> cubature;
+    shards::CellTopology cellType = basis->getBaseCellTopology();                        // get the cell type from any basis
+    Intrepid2::DefaultCubatureFactory cubFactory;                                         // create cubature factory
+    int cubDegree = 4;                                                                   // set cubature degree, e.g., 2
+    cubature = cubFactory.create<DeviceT, RealT, RealT>(cellType, cubDegree);            // create default cubature
+    // Cell nodes.
+    scalar_view cellNodes("cellNodes", meshmgr.getNumCells(), cellType.getNodeCount(), cellType.getDimension());
+    for (int i=0; i<meshmgr.getNumCells(); ++i) {
+      for (unsigned j=0; j<cellType.getNodeCount(); ++j) {
+        for (unsigned k=0; k<cellType.getDimension(); ++k) {
+          cellNodes(i,j,k) = nodes(cellToNodeMap(i,j), k);
+        }
+      }
+    }
+    // FE object.
+    FE<RealT, DeviceT> fe(cellNodes, basis, cubature);
+
+    // Check integration.
+    scalar_view feVals1("feVals1", meshmgr.getNumCells(), cubature->getNumPoints());
+    scalar_view feVals2("feVals2", meshmgr.getNumCells(), cubature->getNumPoints());
+    scalar_view feGrads1("feGrads1", meshmgr.getNumCells(), cubature->getNumPoints(), cubature->getDimension());
+    scalar_view feGrads2("feGrads2", meshmgr.getNumCells(), cubature->getNumPoints(), cubature->getDimension());
+    scalar_view feCoeffs1("feCoeffs1", meshmgr.getNumCells(), basis->getCardinality());
+    scalar_view feCoeffs2("feCoeffs2", meshmgr.getNumCells(), basis->getCardinality());
+    scalar_view feIntegral("feIntegral", meshmgr.getNumCells());
+    // values
+    Kokkos::deep_copy(feCoeffs1, static_cast<RealT>(1));
+    Kokkos::deep_copy(feCoeffs2, static_cast<RealT>(2));
+    fe.evaluateValue(feVals1, feCoeffs1);
+    fe.evaluateValue(feVals2, feCoeffs2);
+    fe.computeIntegral(feIntegral, feVals1, feVals2);
+    RealT valval(0);
+    for (int i=0; i<meshmgr.getNumCells(); ++i) {
+      valval += feIntegral(i);
+    }
+    if (std::abs(valval - static_cast<RealT>(1000)) > std::sqrt(ROL::ROL_EPSILON<RealT>())) {
+      errorFlag = -1;
+    }
+    // gradients
+    scalar_view dofCoords("dofCoords", meshmgr.getNumCells(), basis->getCardinality(), cubature->getDimension());
+    fe.computeDofCoords(dofCoords, cellNodes);
+    for (int i=0; i<meshmgr.getNumCells(); ++i) {
+      for (int j=0; j<basis->getCardinality(); ++j) {
+        RealT x = dofCoords(i,j,0);
+        RealT y = dofCoords(i,j,1);
+        feCoeffs1(i,j) = x + y;
+        feCoeffs2(i,j) = static_cast<RealT>(2)*x + static_cast<RealT>(3)*y;
+      }
+    }
+    fe.evaluateGradient(feGrads1, feCoeffs1);
+    fe.evaluateGradient(feGrads2, feCoeffs2);
+    fe.computeIntegral(feIntegral, feGrads1, feGrads2);
+    RealT gradgrad(0);
+    for (int i=0; i<meshmgr.getNumCells(); ++i) {
+      gradgrad += feIntegral(i);
+    }
+    if (std::abs(gradgrad - static_cast<RealT>(2500)) > std::sqrt(ROL::ROL_EPSILON<RealT>())) {
+      errorFlag = -1;
+    }
+    *outStream << std::setprecision(14) << "\nvalvalIntegral=" << valval << "  " << "gradgradIntegral=" << gradgrad << std::endl;
+
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/TEST/test_05.cpp b/packages/rol/example/PDE-OPT/TEST/test_05.cpp
index 3ad81401a839..3d781e084bb3 100644
--- a/packages/rol/example/PDE-OPT/TEST/test_05.cpp
+++ b/packages/rol/example/PDE-OPT/TEST/test_05.cpp
@@ -14,7 +14,7 @@
 #include "ROL_Types.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Teuchos_XMLParameterListHelpers.hpp"
 
 #include "Intrepid_HCURL_HEX_I1_FEM.hpp"
@@ -28,7 +28,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
   ROL::Ptr<std::ostream> outStream;
diff --git a/packages/rol/example/PDE-OPT/TEST/test_05K.cpp b/packages/rol/example/PDE-OPT/TEST/test_05K.cpp
new file mode 100644
index 000000000000..6aee95b49963
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/TEST/test_05K.cpp
@@ -0,0 +1,139 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  test_04.cpp
+    \brief Unit test for the FE_CURL class.
+*/
+
+#include "ROL_Types.hpp"
+
+#include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
+#include "Teuchos_XMLParameterListHelpers.hpp"
+
+#include "Intrepid2_HCURL_HEX_I1_FEM.hpp"
+#include "Intrepid2_DefaultCubatureFactory.hpp"
+
+#include "../TOOLS/meshmanagerK.hpp"
+#include "../TOOLS/dofmanagerK.hpp"
+#include "../TOOLS/feK_curl.hpp"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+using basis_ptr = Intrepid2::BasisPtr<DeviceT, RealT, RealT>;
+using scalar_view = typename MeshManager<RealT,DeviceT>::scalar_view;
+using int_view = typename MeshManager<RealT,DeviceT>::int_view;
+
+int main(int argc, char *argv[]) {
+
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
+  Kokkos::ScopeGuard kokkosScope(argc, argv);
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+  if (iprint > 0)
+    outStream = ROL::makePtrFromRef(std::cout);
+  else
+    outStream = ROL::makePtrFromRef(bhs);
+
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+
+    /*** Read in XML input ***/
+    std::string filename = "input_05.xml";
+    Teuchos::RCP<Teuchos::ParameterList> parlist
+      = Teuchos::rcp( new Teuchos::ParameterList() );
+    Teuchos::updateParametersFromXmlFile( filename, parlist.ptr() );
+
+    /*** Initialize mesh / degree-of-freedom manager. ***/
+    MeshManager_Brick<RealT, DeviceT> meshmgr(*parlist);
+    scalar_view nodes = meshmgr.getNodes();
+    int_view cellToNodeMap = meshmgr.getCellToNodeMap();
+
+    *outStream << "Number of nodes = " << meshmgr.getNumNodes() << std::endl;
+    *outStream << "Number of cells = " << meshmgr.getNumCells() << std::endl;
+    *outStream << "Number of edges = " << meshmgr.getNumEdges() << std::endl;
+
+    ROL::Ptr<MeshManager<RealT, DeviceT>> meshmgrPtr = ROL::makePtrFromRef(meshmgr);
+
+    // Basis.
+    basis_ptr basis = ROL::makePtr<Intrepid2::Basis_HCURL_HEX_I1_FEM<DeviceT, RealT, RealT>>();
+    // Cubature.
+    ROL::Ptr<Intrepid2::Cubature<DeviceT, RealT, RealT>> cubature;
+    shards::CellTopology cellType = basis->getBaseCellTopology();                        // get the cell type from any basis
+    Intrepid2::DefaultCubatureFactory cubFactory;                                        // create cubature factory
+    int cubDegree = 4;                                                                   // set cubature degree, e.g., 2
+    cubature = cubFactory.create<DeviceT, RealT, RealT>(cellType, cubDegree);            // create default cubature
+    // Cell nodes.
+    scalar_view cellNodes("cellNodes", meshmgr.getNumCells(), cellType.getNodeCount(), cellType.getDimension());
+    for (int i=0; i<meshmgr.getNumCells(); ++i) {
+      for (unsigned j=0; j<cellType.getNodeCount(); ++j) {
+        for (unsigned k=0; k<cellType.getDimension(); ++k) {
+          cellNodes(i,j,k) = nodes(cellToNodeMap(i,j), k);
+        }
+      }
+    }
+    // FE object.
+    FE_CURL<RealT, DeviceT> fe(cellNodes, basis, cubature);
+
+    // Check integration.
+    scalar_view feVals1("feVals1", meshmgr.getNumCells(), cubature->getNumPoints(), cubature->getDimension());
+    scalar_view feVals2("feVals2", meshmgr.getNumCells(), cubature->getNumPoints(), cubature->getDimension());
+    scalar_view feCurls1("feCurls1", meshmgr.getNumCells(), cubature->getNumPoints(), cubature->getDimension());
+    scalar_view feCurls2("feCurls2", meshmgr.getNumCells(), cubature->getNumPoints(), cubature->getDimension());
+    scalar_view feCoeffs1("feCoeffs1", meshmgr.getNumCells(), basis->getCardinality());
+    scalar_view feCoeffs2("feCoeffs2", meshmgr.getNumCells(), basis->getCardinality());
+    scalar_view feIntegral("feIntegral", meshmgr.getNumCells());
+    // values
+    Kokkos::deep_copy(feCoeffs1,static_cast<RealT>(0));
+    Kokkos::deep_copy(feCoeffs2,static_cast<RealT>(0));
+    for (int i=0; i<meshmgr.getNumCells(); ++i) {
+      feCoeffs1(i, 0) = static_cast<RealT>(1);
+      feCoeffs2(i, 0) = static_cast<RealT>(1);
+    }
+    fe.evaluateValue(feVals1, feCoeffs1);
+    fe.evaluateValue(feVals2, feCoeffs2);
+    fe.computeIntegral(feIntegral, feVals1, feVals2);
+    RealT valval(0);
+    for (int i=0; i<meshmgr.getNumCells(); ++i) {
+      valval += feIntegral(i);
+    }
+    if (std::abs(valval - static_cast<RealT>(4)/static_cast<RealT>(9)) > std::sqrt(ROL::ROL_EPSILON<RealT>())) {
+      errorFlag = -1;
+    }
+    // curls
+    fe.evaluateCurl(feCurls1, feCoeffs1);
+    fe.evaluateCurl(feCurls2, feCoeffs2);
+    fe.computeIntegral(feIntegral, feCurls1, feCurls2);
+    RealT curlcurl(0);
+    for (int i=0; i<meshmgr.getNumCells(); ++i) {
+      curlcurl += feIntegral(i);
+    }
+    if (std::abs(curlcurl - static_cast<RealT>(8)/static_cast<RealT>(3)) > std::sqrt(ROL::ROL_EPSILON<RealT>())) {
+      errorFlag = -1;
+    }
+    *outStream << std::setprecision(14) << "\nvalvalIntegral=" << valval << "  "
+                                        << "curlcurlIntegral=" << curlcurl << std::endl;
+
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/TEST/test_06.cpp b/packages/rol/example/PDE-OPT/TEST/test_06.cpp
index e91bca3223ea..d0023abda7c4 100644
--- a/packages/rol/example/PDE-OPT/TEST/test_06.cpp
+++ b/packages/rol/example/PDE-OPT/TEST/test_06.cpp
@@ -12,14 +12,14 @@
 */
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Teuchos_XMLParameterListHelpers.hpp"
 
 
 #include "../TOOLS/template_tools.hpp"
 #include "ROL_Ptr.hpp"
 
-// Example of ScalarFunction 
+// Example of ScalarFunction
 // f(x,y) = <x,x> + 2*<y,y> - 2 y[0]*(x[0]+x[1]) + 3*x[0]*(y[1]-y[0])
 //
 // coeff = [1,2,-2,3]
@@ -31,19 +31,19 @@ struct ExampleQuadratic {
 
 
   template<class Real, class ScalarX, class ScalarY>
-  static AD::ResultType<Real,ScalarX,ScalarY> 
+  static AD::ResultType<Real,ScalarX,ScalarY>
   eval( const Param &p,
-        const Array<ScalarX> &x, 
+        const Array<ScalarX> &x,
         const Array<ScalarY> &y ) {
 
-    using Index = TemplateTools::index_t<Array<ScalarX>>;    
+    using Index = TemplateTools::index_t<Array<ScalarX>>;
     using TemplateTools::get;
 
-    ScalarX xdotx(0); 
+    ScalarX xdotx(0);
     ScalarY ydoty(0);
 
     auto nx = TemplateTools::size(x);
-    auto ny = TemplateTools::size(y);     
+    auto ny = TemplateTools::size(y);
 
     for( Index i=0; i<nx; ++i ) {
       xdotx += get(x, i);
@@ -52,8 +52,8 @@ struct ExampleQuadratic {
       ydoty = get(y,j);
     }
 
-    return get(p,0)*xdotx + 
-           get(p,1)*ydoty + 
+    return get(p,0)*xdotx +
+           get(p,1)*ydoty +
            get(p,2)*get(y,0)*(get(x,0)+get(x,1)) +
            get(p,3)*get(x,0)*(get(y,1)-get(y,0));
 
@@ -62,7 +62,7 @@ struct ExampleQuadratic {
 
 
 struct Foo {
-  int size(){return 0;} 
+  int size(){return 0;}
   void bar(int i) {}
 };
 
@@ -70,13 +70,13 @@ int main(int argc, char *argv[] ) {
 
   using RealT = double;
 
-    
+
   ROL::nullstream bhs;
 
   ROL::Ptr<std::ostream> os;
   if(argc>1)   os = ROL::makePtrFromRef(std::cout);
   else         os = ROL::makePtrFromRef(bhs);
-  
+
   using DFad = Sacado::Fad::DFad<RealT>;
 
   int errorFlag = 0;
@@ -91,10 +91,10 @@ int main(int argc, char *argv[] ) {
     errorFlag += static_cast<int>(AD::has_dx<RealT>::value);
     errorFlag += static_cast<int>(!AD::has_dx<DFad>::value);
 
-    *os << "Does type RealT = double have a method .dx(int)?... " 
+    *os << "Does type RealT = double have a method .dx(int)?... "
         << AD::has_dx<RealT>::value << std::endl;
 
-    *os << "Does type DFad<RealT> have a method .dx(int)?...... " 
+    *os << "Does type DFad<RealT> have a method .dx(int)?...... "
         << AD::has_dx<DFad>::value << std::endl;
 
     auto xval  = AD::value(x);
@@ -107,7 +107,7 @@ int main(int argc, char *argv[] ) {
 
     *os << "RealT x(2.0);"             << std::endl;
     *os << "DFad x_fad(1,x); "         << std::endl;
-    *os << "DFad f_fad = x_fad*x_fad " << std::endl;  
+    *os << "DFad f_fad = x_fad*x_fad " << std::endl;
 
     *os << "AD::value(x)     = " << xval  << std::endl;
     *os << "AD::value(x_fad) = " << xfval << std::endl;
@@ -136,7 +136,7 @@ int main(int argc, char *argv[] ) {
     std::vector<RealT> coeff({1.0,2.0,-2.0,3.0});
 
     std::vector<RealT> x(Nx);
-    std::vector<RealT> y(Ny); 
+    std::vector<RealT> y(Ny);
 
 //    std::cout << TemplateTools::size(coeff) << std::endl;
 //    std::cout << TemplateTools::max( coeff ) << std::endl;
diff --git a/packages/rol/example/PDE-OPT/TEST/test_07.cpp b/packages/rol/example/PDE-OPT/TEST/test_07.cpp
index 24d741612255..a5032a885ad5 100644
--- a/packages/rol/example/PDE-OPT/TEST/test_07.cpp
+++ b/packages/rol/example/PDE-OPT/TEST/test_07.cpp
@@ -12,7 +12,7 @@
 */
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Teuchos_XMLParameterListHelpers.hpp"
 
 #include "Intrepid_HGRAD_HEX_C1_FEM.hpp"
@@ -25,7 +25,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
   ROL::Ptr<std::ostream> outStream;
@@ -53,7 +53,7 @@ int main(int argc, char *argv[]) {
     ROL::Ptr<Intrepid::FieldContainer<int> >   cellToNodeMapPtr = meshmgr.getCellToNodeMap();
     ROL::Ptr<Intrepid::FieldContainer<int> >   cellToEdgeMapPtr = meshmgr.getCellToEdgeMap();
     ROL::Ptr<Intrepid::FieldContainer<int> >   cellToFaceMapPtr = meshmgr.getCellToFaceMap();
-    ROL::Ptr<std::vector<std::vector<std::vector<int> > > > sideSetsPtr = meshmgr.getSideSets(); 
+    ROL::Ptr<std::vector<std::vector<std::vector<int> > > > sideSetsPtr = meshmgr.getSideSets();
 
     Intrepid::FieldContainer<RealT> &nodes = *nodesPtr;
     Intrepid::FieldContainer<int>   &cellToNodeMap = *cellToNodeMapPtr;
diff --git a/packages/rol/example/PDE-OPT/TEST/test_07K.cpp b/packages/rol/example/PDE-OPT/TEST/test_07K.cpp
new file mode 100644
index 000000000000..c137c7cecf3f
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/TEST/test_07K.cpp
@@ -0,0 +1,137 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  test_07.cpp
+    \brief Unit test for the MeshReader mesh manager.
+*/
+
+#include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
+#include "Teuchos_XMLParameterListHelpers.hpp"
+
+#include "Intrepid2_HGRAD_HEX_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_HEX_C2_FEM.hpp"
+
+#include "../TOOLS/meshreaderK.hpp"
+#include "../TOOLS/dofmanagerK.hpp"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+using basis_ptr = Intrepid2::BasisPtr<DeviceT, RealT, RealT>;
+using scalar_view = typename MeshManager<RealT,DeviceT>::scalar_view;
+using int_view = typename MeshManager<RealT,DeviceT>::int_view;
+
+int main(int argc, char *argv[]) {
+
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
+  Kokkos::ScopeGuard kokkosScope(argc, argv);
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+  if (iprint > 0)
+    outStream = ROL::makePtrFromRef(std::cout);
+  else
+    outStream = ROL::makePtrFromRef(bhs);
+
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+
+    /*** Read in XML input ***/
+    std::string filename = "input_07.xml";
+    Teuchos::RCP<Teuchos::ParameterList> parlist
+      = Teuchos::rcp( new Teuchos::ParameterList() );
+    Teuchos::updateParametersFromXmlFile( filename, parlist.ptr() );
+
+    /*** Initialize mesh / degree-of-freedom manager. ***/
+    MeshReader<RealT, DeviceT> meshmgr(*parlist);
+
+    scalar_view nodes = meshmgr.getNodes();
+    int_view cellToNodeMap = meshmgr.getCellToNodeMap();
+    int_view cellToEdgeMap = meshmgr.getCellToEdgeMap();
+    int_view cellToFaceMap = meshmgr.getCellToFaceMap();
+    ROL::Ptr<std::vector<std::vector<std::vector<int> > > > sideSetsPtr = meshmgr.getSideSets();
+
+    std::vector<std::vector<std::vector<int> > >  &sideSets = *sideSetsPtr;
+    *outStream << "Number of nodes = " << meshmgr.getNumNodes() << std::endl;  // << nodes;
+    *outStream << "Number of cells = " << meshmgr.getNumCells() << std::endl;  // << cellToNodeMap;
+    *outStream << "Number of edges = " << meshmgr.getNumEdges() << std::endl;  // << cellToEdgeMap;
+    *outStream << "Number of faces = " << meshmgr.getNumFaces() << std::endl;  // << cellToFaceMap;
+    // Print sideset info to file.
+    std::ofstream meshfile;
+    meshfile.open("sideset.txt");
+    for (int i=0; i<static_cast<int>(sideSets.size()); ++i) {
+      for (int j=0; j<static_cast<int>(sideSets[i].size()); ++j) {
+        if (sideSets[i][j].size() > 0) {
+          for (int k=0; k<static_cast<int>(sideSets[i][j].size()); ++k) {
+            meshfile << i << " " << j << " " << k << " " << sideSets[i][j][k] << std::endl;
+          }
+        }
+        meshfile << std::endl;
+      }
+      meshfile << std::endl << std::endl;
+    }
+    meshfile.close();
+
+    basis_ptr basisPtrQ1 = ROL::makePtr<Intrepid2::Basis_HGRAD_HEX_C1_FEM<DeviceT, RealT, RealT>>();
+    basis_ptr basisPtrQ2 = ROL::makePtr<Intrepid2::Basis_HGRAD_HEX_C2_FEM<DeviceT, RealT, RealT>>();
+
+    std::vector<basis_ptr> basisPtrs(3, ROL::nullPtr);
+    basisPtrs[0] = basisPtrQ2;
+    basisPtrs[1] = basisPtrQ1;
+    basisPtrs[2] = basisPtrQ2;
+
+    ROL::Ptr<MeshManager<RealT, DeviceT>> meshmgrPtr = ROL::makePtrFromRef(meshmgr);
+
+    DofManager<RealT, DeviceT> dofmgr(meshmgrPtr, basisPtrs);
+
+    *outStream << "Number of node dofs = " << dofmgr.getNumNodeDofs() << std::endl;  // << *(dofmgr.getNodeDofs());
+    *outStream << "Number of edge dofs = " << dofmgr.getNumEdgeDofs() << std::endl;  // << *(dofmgr.getEdgeDofs());
+    *outStream << "Number of face dofs = " << dofmgr.getNumFaceDofs() << std::endl;  // << *(dofmgr.getFaceDofs());
+    *outStream << "Number of void dofs = " << dofmgr.getNumVoidDofs() << std::endl;  // << *(dofmgr.getVoidDofs());
+    *outStream << "Total number of dofs = " << dofmgr.getNumDofs() << std::endl;     // << *(dofmgr.getCellDofs());
+
+    std::vector<std::vector<int> > fieldPattern = dofmgr.getFieldPattern();
+    for (int i=0; i<dofmgr.getNumFields(); ++i) {
+      *outStream << "\nField " << i << " pattern:   ";
+      for (int j=0; j<dofmgr.getLocalFieldSize(i); ++j) {
+        *outStream << fieldPattern[i][j] << " ";
+      }
+      *outStream << std::endl;
+    }
+
+    for (int i=0; i<dofmgr.getNumFields(); ++i) {
+      *outStream << "\nField  " << i << std::endl;
+      // *outStream << *(dofmgr.getFieldDofs(i));
+    }
+
+    bool correct = true;
+    static const int checkDofs[] = {11, 8, 38, 41, 23, 20, 44, 47, 53, 89, 91, 93, 61, 95, 97, 99, 71, 69, 101, 103, 109, 127, 129, 131, 133, 135, 141};
+    for (int i=0; i<dofmgr.getLocalFieldSize(2); ++i) {
+      correct = correct && ( (dofmgr.getFieldDofs(2))(2,i) == checkDofs[i] );
+    }
+    if (!correct) {
+      errorFlag = -1;
+    }
+
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/TEST/test_08.cpp b/packages/rol/example/PDE-OPT/TEST/test_08.cpp
index 975ef1352f61..ef38b93cfdee 100644
--- a/packages/rol/example/PDE-OPT/TEST/test_08.cpp
+++ b/packages/rol/example/PDE-OPT/TEST/test_08.cpp
@@ -14,7 +14,7 @@
 #include "ROL_Types.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Teuchos_XMLParameterListHelpers.hpp"
 
 #include "Intrepid_HGRAD_HEX_C1_FEM.hpp"
@@ -29,7 +29,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
   ROL::Ptr<std::ostream> outStream;
diff --git a/packages/rol/example/PDE-OPT/TEST/test_08K.cpp b/packages/rol/example/PDE-OPT/TEST/test_08K.cpp
new file mode 100644
index 000000000000..4ad6e8861a37
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/TEST/test_08K.cpp
@@ -0,0 +1,147 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  test_08.cpp
+    \brief Unit test for the FE class on hexes.
+*/
+
+#include "ROL_Types.hpp"
+
+#include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
+#include "Teuchos_XMLParameterListHelpers.hpp"
+
+#include "Intrepid2_HGRAD_HEX_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_HEX_C2_FEM.hpp"
+#include "Intrepid2_DefaultCubatureFactory.hpp"
+
+#include "../TOOLS/meshreaderK.hpp"
+#include "../TOOLS/dofmanagerK.hpp"
+#include "../TOOLS/feK.hpp"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+using basis_ptr = Intrepid2::BasisPtr<DeviceT, RealT, RealT>;
+using scalar_view = typename MeshManager<RealT,DeviceT>::scalar_view;
+using int_view = typename MeshManager<RealT,DeviceT>::int_view;
+
+int main(int argc, char *argv[]) {
+
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
+  Kokkos::ScopeGuard kokkosScope(argc, argv);
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+  if (iprint > 0)
+    outStream = ROL::makePtrFromRef(std::cout);
+  else
+    outStream = ROL::makePtrFromRef(bhs);
+
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+
+    /*** Read in XML input ***/
+    std::string filename = "input_08.xml";
+    Teuchos::RCP<Teuchos::ParameterList> parlist
+      = Teuchos::rcp( new Teuchos::ParameterList() );
+    Teuchos::updateParametersFromXmlFile( filename, parlist.ptr() );
+
+    /*** Initialize mesh / degree-of-freedom manager. ***/
+    MeshReader<RealT, DeviceT> meshmgr(*parlist);
+    scalar_view nodes = meshmgr.getNodes();
+    int_view  cellToNodeMap = meshmgr.getCellToNodeMap();
+
+    *outStream << "Number of nodes = " << meshmgr.getNumNodes() << std::endl;
+    *outStream << "Number of cells = " << meshmgr.getNumCells() << std::endl;
+    *outStream << "Number of edges = " << meshmgr.getNumEdges() << std::endl;
+    *outStream << "Number of edges = " << meshmgr.getNumFaces() << std::endl;
+
+    ROL::Ptr<MeshManager<RealT, DeviceT>> meshmgrPtr = ROL::makePtrFromRef(meshmgr);
+
+    // Basis.
+    basis_ptr basis = ROL::makePtr<Intrepid2::Basis_HGRAD_HEX_C2_FEM<DeviceT, RealT, RealT >>();
+    // Cubature.
+    ROL::Ptr<Intrepid2::Cubature<DeviceT, RealT, RealT>> cubature;
+    shards::CellTopology cellType = basis->getBaseCellTopology();              // get the cell type from any basis
+    Intrepid2::DefaultCubatureFactory cubFactory;                              // create cubature factory
+    int cubDegree = 6;                                                         // set cubature degree, e.g., 2
+    cubature = cubFactory.create<DeviceT, RealT, RealT>(cellType, cubDegree);  // create default cubature
+    // Cell nodes.
+    scalar_view cellNodes("cellNodes", meshmgr.getNumCells(), cellType.getNodeCount(), cellType.getDimension());
+    for (int i=0; i<meshmgr.getNumCells(); ++i) {
+      for (unsigned j=0; j<cellType.getNodeCount(); ++j) {
+        for (unsigned k=0; k<cellType.getDimension(); ++k) {
+          cellNodes(i,j,k) = nodes(cellToNodeMap(i,j), k);
+        }
+      }
+    }
+    // FE object.
+    FE<RealT, DeviceT> fe(cellNodes, basis, cubature);
+
+    // Check integration.
+    scalar_view feVals1("feVals1", meshmgr.getNumCells(), cubature->getNumPoints());
+    scalar_view feVals2("feVals2", meshmgr.getNumCells(), cubature->getNumPoints());
+    scalar_view feGrads1("feGrads1", meshmgr.getNumCells(), cubature->getNumPoints(), cubature->getDimension());
+    scalar_view feGrads2("feGrads2", meshmgr.getNumCells(), cubature->getNumPoints(), cubature->getDimension());
+    scalar_view feCoeffs1("feCoeffs1", meshmgr.getNumCells(), basis->getCardinality());
+    scalar_view feCoeffs2("feCoeffs2", meshmgr.getNumCells(), basis->getCardinality());
+    scalar_view feIntegral("feIntegral", meshmgr.getNumCells());
+    // values
+    Kokkos::deep_copy(feCoeffs1,static_cast<RealT>(1));
+    Kokkos::deep_copy(feCoeffs2,static_cast<RealT>(2));
+    fe.evaluateValue(feVals1, feCoeffs1);
+    fe.evaluateValue(feVals2, feCoeffs2);
+    fe.computeIntegral(feIntegral, feVals1, feVals2);
+    RealT valval(0);
+    for (int i=0; i<meshmgr.getNumCells(); ++i) {
+      valval += feIntegral(i);
+    }
+    if (std::abs(valval - static_cast<RealT>(6)) > std::sqrt(ROL::ROL_EPSILON<RealT>())) {
+      errorFlag = -1;
+    }
+    // gradients
+    scalar_view dofCoords("dofCoords", meshmgr.getNumCells(), basis->getCardinality(), cubature->getDimension());
+    fe.computeDofCoords(dofCoords, cellNodes);
+    for (int i=0; i<meshmgr.getNumCells(); ++i) {
+      for (int j=0; j<basis->getCardinality(); ++j) {
+        RealT x = dofCoords(i,j,0);
+        RealT y = dofCoords(i,j,1);
+        RealT z = dofCoords(i,j,2);
+        feCoeffs1(i,j) = x + y + z;
+        feCoeffs2(i,j) = static_cast<RealT>(2)*x + static_cast<RealT>(3)*y + static_cast<RealT>(4)*z;
+      }
+    }
+    fe.evaluateGradient(feGrads1, feCoeffs1);
+    fe.evaluateGradient(feGrads2, feCoeffs2);
+    fe.computeIntegral(feIntegral, feGrads1, feGrads2);
+    RealT gradgrad(0);
+    for (int i=0; i<meshmgr.getNumCells(); ++i) {
+      gradgrad += feIntegral(i);
+    }
+    if (std::abs(gradgrad - static_cast<RealT>(27)) > std::sqrt(ROL::ROL_EPSILON<RealT>())) {
+      errorFlag = -1;
+    }
+    *outStream << std::setprecision(14) << "\nvalvalIntegral=" << valval << "  " << "gradgradIntegral=" << gradgrad << std::endl;
+
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/TEST/test_09.cpp b/packages/rol/example/PDE-OPT/TEST/test_09.cpp
index a11f91abface..6814696cb353 100644
--- a/packages/rol/example/PDE-OPT/TEST/test_09.cpp
+++ b/packages/rol/example/PDE-OPT/TEST/test_09.cpp
@@ -17,7 +17,7 @@
 #include "ROL_Vector_SimOpt.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Teuchos_XMLParameterListHelpers.hpp"
 
 #include "Intrepid_HGRAD_LINE_Cn_FEM.hpp"
@@ -32,7 +32,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
   ROL::Ptr<std::ostream> outStream;
@@ -57,7 +57,7 @@ int main(int argc, char *argv[]) {
     MeshManager_Interval<RealT> meshmgr(*parlist);
     ROL::Ptr<Intrepid::FieldContainer<RealT> > nodesPtr = meshmgr.getNodes();
     ROL::Ptr<Intrepid::FieldContainer<int> >   cellToNodeMapPtr = meshmgr.getCellToNodeMap();
-    ROL::Ptr<std::vector<std::vector<std::vector<int> > > > sideSetsPtr = meshmgr.getSideSets(); 
+    ROL::Ptr<std::vector<std::vector<std::vector<int> > > > sideSetsPtr = meshmgr.getSideSets();
 
     Intrepid::FieldContainer<RealT> &nodes = *nodesPtr;
     Intrepid::FieldContainer<int>   &cellToNodeMap = *cellToNodeMapPtr;
diff --git a/packages/rol/example/PDE-OPT/TEST/test_09K.cpp b/packages/rol/example/PDE-OPT/TEST/test_09K.cpp
new file mode 100644
index 000000000000..745c2f9f1c96
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/TEST/test_09K.cpp
@@ -0,0 +1,139 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  test_09.cpp
+    \brief Unit test for the mesh manager and the degree-of-freedom manager.
+           Mesh type: INTERVAL with LINE CELLS and HGRAD SPACE.
+*/
+
+#include "ROL_Algorithm.hpp"
+#include "ROL_BoundConstraint_SimOpt.hpp"
+#include "ROL_Vector_SimOpt.hpp"
+
+#include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
+#include "Teuchos_XMLParameterListHelpers.hpp"
+
+#include "Intrepid2_HGRAD_LINE_Cn_FEM.hpp"
+
+#include <iostream>
+#include <algorithm>
+
+#include "../TOOLS/meshmanagerK.hpp"
+#include "../TOOLS/dofmanagerK.hpp"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+using basis_ptr = Intrepid2::BasisPtr<DeviceT, RealT, RealT>;
+using scalar_view = typename MeshManager<RealT,DeviceT>::scalar_view;
+using int_view = typename MeshManager<RealT,DeviceT>::int_view;
+
+int main(int argc, char *argv[]) {
+
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
+  Kokkos::ScopeGuard kokkosScope(argc, argv);
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+  if (iprint > 0)
+    outStream = ROL::makePtrFromRef(std::cout);
+  else
+    outStream = ROL::makePtrFromRef(bhs);
+
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+
+    /*** Read in XML input ***/
+    std::string filename = "input_09.xml";
+    Teuchos::RCP<Teuchos::ParameterList> parlist
+      = Teuchos::rcp( new Teuchos::ParameterList() );
+    Teuchos::updateParametersFromXmlFile( filename, parlist.ptr() );
+
+    /*** Initialize mesh / degree-of-freedom manager. ***/
+    MeshManager_Interval<RealT, DeviceT> meshmgr(*parlist);
+    scalar_view nodes = meshmgr.getNodes();
+    int_view cellToNodeMap = meshmgr.getCellToNodeMap();
+    ROL::Ptr<std::vector<std::vector<std::vector<int> > > > sideSetsPtr = meshmgr.getSideSets();
+
+    std::vector<std::vector<std::vector<int> > >  &sideSets = *sideSetsPtr;
+    // *outStream << "Number of nodes = " << meshmgr.getNumNodes() << std::endl << nodes;
+    // *outStream << "Number of cells = " << meshmgr.getNumCells() << std::endl << cellToNodeMap;
+    // Print mesh info to file.
+    std::ofstream meshfile;
+    meshfile.open("sideset.txt");
+    for (int i=0; i<static_cast<int>(sideSets.size()); ++i) {
+      for (int j=0; j<static_cast<int>(sideSets[i].size()); ++j) {
+        if (sideSets[i][j].size() > 0) {
+          for (int k=0; k<static_cast<int>(sideSets[i][j].size()); ++k) {
+            meshfile << sideSets[i][j][k] << std::endl;
+          }
+        }
+        meshfile << std::endl << std::endl;
+      }
+    }
+    meshfile.close();
+
+    basis_ptr basisPtrL1 = ROL::makePtr<Intrepid2::Basis_HGRAD_LINE_Cn_FEM<DeviceT, RealT, RealT >>(1, Intrepid2::POINTTYPE_EQUISPACED);
+
+    basis_ptr basisPtrL2 = ROL::makePtr<Intrepid2::Basis_HGRAD_LINE_Cn_FEM<DeviceT, RealT, RealT >>(2, Intrepid2::POINTTYPE_EQUISPACED);
+
+    std::vector<basis_ptr> basisPtrs(3, ROL::nullPtr);
+    basisPtrs[0] = basisPtrL2;
+    basisPtrs[1] = basisPtrL1;
+    basisPtrs[2] = basisPtrL2;
+
+    ROL::Ptr<MeshManager<RealT,DeviceT> > meshmgrPtr = ROL::makePtrFromRef(meshmgr);
+
+    DofManager<RealT,DeviceT> dofmgr(meshmgrPtr, basisPtrs);
+
+    *outStream << "Number of node dofs = " << dofmgr.getNumNodeDofs() << std::endl; // << *(dofmgr.getNodeDofs());
+    *outStream << "Number of edge dofs = " << dofmgr.getNumEdgeDofs() << std::endl; // << *(dofmgr.getEdgeDofs());
+    *outStream << "Number of face dofs = " << dofmgr.getNumFaceDofs() << std::endl; // << *(dofmgr.getFaceDofs());
+    *outStream << "Number of void dofs = " << dofmgr.getNumVoidDofs() << std::endl; // << *(dofmgr.getVoidDofs());
+    *outStream << "Total number of dofs = " << dofmgr.getNumDofs() << std::endl;    // << *(dofmgr.getCellDofs());
+
+    std::vector<std::vector<int> > fieldPattern = dofmgr.getFieldPattern();
+    for (int i=0; i<dofmgr.getNumFields(); ++i) {
+      *outStream << "\nField " << i << " pattern:   ";
+      for (int j=0; j<dofmgr.getLocalFieldSize(i); ++j) {
+        *outStream << fieldPattern[i][j] << " ";
+      }
+      *outStream << std::endl;
+    }
+
+    for (int i=0; i<dofmgr.getNumFields(); ++i) {
+      *outStream << "\nField  " << i << std::endl;
+      // *outStream << *(dofmgr.getFieldDofs(i));
+    }
+
+    /*bool correct = true;
+    static const int checkDofs[] = {20, 23, 35, 32, 55, 63, 69, 61, 81};
+    for (int i=0; i<dofmgr.getLocalFieldSize(2); ++i) {
+      correct = correct && ( (*(dofmgr.getFieldDofs(2)))(5,i) == checkDofs[i] );
+    }
+    if (!correct) {
+      errorFlag = -1;
+    }*/
+
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/TOOLS/Intrepid2_CubatureNodal.hpp b/packages/rol/example/PDE-OPT/TOOLS/Intrepid2_CubatureNodal.hpp
new file mode 100644
index 000000000000..7346d4c2169e
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/TOOLS/Intrepid2_CubatureNodal.hpp
@@ -0,0 +1,106 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+//
+//                           Intrepid Package
+// Copyright 2007 NTESS and the Intrepid contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/** \file   Intrepid_CubatureNodal.hpp
+    \brief  Header file for the Intrepid::CubatureNodal class.
+    \author Created by D. Ridzal.
+*/
+
+#ifndef INTREPID2_CUBATURE_NODAL_HPP
+#define INTREPID2_CUBATURE_NODAL_HPP
+
+#include "Intrepid2_Basis.hpp"
+#include "Intrepid2_Cubature.hpp"
+#include "Teuchos_Assert.hpp"
+
+
+namespace Intrepid2 {
+
+/** \class Intrepid::CubatureNodal
+    \brief Defines nodal cubature on select cell topologies, often used for
+           lower-order integration or underintegration.  The use of this
+           cubature with P1 elements yields a diagonal mass matrix.
+*/
+template<typename DeviceType=void, typename PointValueType=double, typename WeightValueType=double>
+class CubatureNodal : public Cubature<DeviceType,PointValueType,WeightValueType> {
+  public:
+    using ExecSpaceType = typename DeviceType::execution_space;
+    using PointViewType             = Kokkos::DynRankView<PointValueType,Kokkos::LayoutStride,DeviceType>;
+    using WeightViewType            = Kokkos::DynRankView<WeightValueType,Kokkos::LayoutStride,DeviceType>;
+
+    using PointViewTypeAllocatable  = Kokkos::DynRankView<PointValueType,DeviceType>;  // uses default layout; allows us to allocate (in contrast to LayoutStride)
+    using WeightViewTypeAllocatable = Kokkos::DynRankView<WeightValueType,DeviceType>; // uses default layout; allows us to allocate (in contrast to LayoutStride)
+    using TensorPointDataType       = TensorPoints<PointValueType,DeviceType>;
+    using TensorWeightDataType      = TensorData<WeightValueType,DeviceType>;
+  private:
+  /** \brief  Base topology of the cells for which the cubature is defined. See  the Shards package
+              http://trilinos.sandia.gov/packages/shards for definition of base cell topology.
+  */
+  shards::CellTopology cellTopo_;
+
+  unsigned numPoints_;
+  unsigned cellDim_;
+
+  WeightValueType weightVal_;
+
+  public:
+
+  ~CubatureNodal() {}
+
+  /** \brief Constructor.
+       \param cellTopo             [in]        - Cell topology.
+  */
+  CubatureNodal(const shards::CellTopology & cellTopo);
+
+  /** \brief Returns cubature points and weights
+             (return arrays must be pre-sized/pre-allocated).
+
+      \param cubPoints       [out]     - Vector containing the cubature points.
+      \param cubWeights      [out]     - Vector of corresponding cubature weights.
+  */
+  virtual void getCubature(PointViewType cubPoints,
+                           WeightViewType cubWeights) const override;
+
+  /** \brief Returns cubature points and weights.
+              Method for physical space cubature, throws an exception.
+
+       \param cubPoints             [out]        - Array containing the cubature points.
+       \param cubWeights            [out]        - Array of corresponding cubature weights.
+       \param cellCoords             [in]        - Array of cell coordinates
+  */
+  virtual void getCubature(PointViewType cubPoints,
+                           WeightViewType cubWeights,
+                           PointViewType cellCoords) const override;
+
+  /** \brief Returns the number of cubature points.
+  */
+  virtual ordinal_type getNumPoints() const override;
+
+  /** \brief Returns dimension of integration domain.
+  */
+  virtual ordinal_type getDimension() const override;
+
+  /** \brief Returns max. degree of polynomials that are integrated exactly.
+             The return vector has the size of the degree_ vector.
+  */
+  virtual ordinal_type getAccuracy() const override;
+
+}; // end class CubatureNodal
+
+
+} // end namespace Intrepid
+
+
+// include templated definitions
+#include "Intrepid2_CubatureNodalDef.hpp"
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/TOOLS/Intrepid2_CubatureNodalDef.hpp b/packages/rol/example/PDE-OPT/TOOLS/Intrepid2_CubatureNodalDef.hpp
new file mode 100644
index 000000000000..72678954c2d9
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/TOOLS/Intrepid2_CubatureNodalDef.hpp
@@ -0,0 +1,93 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+//
+//                           Intrepid Package
+// Copyright 2007 NTESS and the Intrepid contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/** \file   Intrepid_CubatureNodalDef.hpp
+    \brief  Definition file for the Intrepid::CubatureNodal class.
+    \author Created by D. Ridzal.
+*/
+
+namespace Intrepid2 {
+
+template <class DeviceType, class PointValueType, class WeightValueType>
+CubatureNodal<DeviceType,PointValueType,WeightValueType>::CubatureNodal(const shards::CellTopology & cellTopo) :
+  cellTopo_(cellTopo.getBaseCellTopologyData()) {
+
+  numPoints_ = cellTopo_.getVertexCount();
+  cellDim_ = cellTopo_.getDimension();
+
+  WeightValueType cellVolume(0);
+  switch( cellTopo_.getKey() ){
+    case shards::Line<2>::key:
+      cellVolume = 2.0;  break;
+    case shards::Triangle<3>::key:
+      cellVolume = 0.5;  break;
+    case shards::Quadrilateral<4>::key:
+      cellVolume = 4.0;  break;
+    case shards::Tetrahedron<4>::key:
+      cellVolume = 1.0/6.0;  break;
+    case shards::Hexahedron<8>::key:
+      cellVolume = 8.0;  break;
+    default:
+      TEUCHOS_TEST_FOR_EXCEPTION( (true), std::invalid_argument,
+                                 ">>> ERROR (Intrepid::CubatureNodal): Cell topology not supported.");
+  } // switch key
+
+  weightVal_ = cellVolume/numPoints_;
+}
+
+template <class DeviceType, class PointValueType, class WeightValueType>
+void CubatureNodal<DeviceType,PointValueType,WeightValueType>::getCubature(PointViewType  cubPoints,
+                                                                           WeightViewType cubWeights) const {
+  // check size of cubPoints and cubWeights
+  TEUCHOS_TEST_FOR_EXCEPTION( ( ( (unsigned)cubPoints.size() < numPoints_*cellDim_ ) || ( (unsigned)cubWeights.size() < numPoints_ ) ), std::out_of_range,
+                              ">>> ERROR (CubatureNodal): Insufficient space allocated for cubature points or weights.");
+
+  CellTools<DeviceType>::getReferenceSubcellVertices(cubPoints, cellDim_, 0, cellTopo_);
+
+  for (unsigned pointId = 0; pointId < numPoints_; pointId++) {
+    cubWeights(pointId) = weightVal_;
+  }
+
+} // end getCubature
+
+template <class DeviceType, class PointValueType, class WeightValueType>
+void CubatureNodal<DeviceType,PointValueType,WeightValueType>::getCubature(PointViewType  cubPoints,
+                                                                           WeightViewType cubWeights,
+                                                                           PointViewType  cellCoords) const
+{
+    TEUCHOS_TEST_FOR_EXCEPTION( (true), std::logic_error,
+                      ">>> ERROR (CubatureNodal): Cubature defined in reference space calling method for physical space cubature.");
+}
+
+
+
+template <class DeviceType, class PointValueType, class WeightValueType>
+ordinal_type CubatureNodal<DeviceType,PointValueType,WeightValueType>::getNumPoints() const {
+  return numPoints_;
+} // end getNumPoints
+
+
+
+template <class DeviceType, class PointValueType, class WeightValueType>
+ordinal_type CubatureNodal<DeviceType,PointValueType,WeightValueType>::getDimension() const {
+  return cellDim_;
+} // end dimension
+
+
+
+template <class DeviceType, class PointValueType, class WeightValueType>
+ordinal_type CubatureNodal<DeviceType,PointValueType,WeightValueType>::getAccuracy() const {
+  return 2;
+} // end getAccuracy
+
+
+} // end namespace Intrepid
diff --git a/packages/rol/example/PDE-OPT/TOOLS/assembler.hpp b/packages/rol/example/PDE-OPT/TOOLS/assembler.hpp
index 8972aea5fe0f..2caa264ff2f4 100644
--- a/packages/rol/example/PDE-OPT/TOOLS/assembler.hpp
+++ b/packages/rol/example/PDE-OPT/TOOLS/assembler.hpp
@@ -14,7 +14,7 @@
 #ifndef ROL_PDEOPT_ASSEMBLER_H
 #define ROL_PDEOPT_ASSEMBLER_H
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Teuchos_TimeMonitor.hpp"
 
 #include "Tpetra_MultiVector.hpp"
diff --git a/packages/rol/example/PDE-OPT/TOOLS/assemblerK.cpp b/packages/rol/example/PDE-OPT/TOOLS/assemblerK.cpp
new file mode 100644
index 000000000000..c119c1a63630
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/TOOLS/assemblerK.cpp
@@ -0,0 +1,16 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  assembler.cpp
+    \brief Template specialization for the Assembler class for PDE-OPT.
+*/
+
+#include "assemblerK.hpp"
+
+template class Assembler<double,Kokkos::HostSpace>;
diff --git a/packages/rol/example/PDE-OPT/TOOLS/assemblerK.hpp b/packages/rol/example/PDE-OPT/TOOLS/assemblerK.hpp
new file mode 100644
index 000000000000..e5519a9672c2
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/TOOLS/assemblerK.hpp
@@ -0,0 +1,768 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  assembler.hpp
+    \brief Finite element assembly class.
+*/
+
+#ifndef ROL_PDEOPT_ASSEMBLERK_H
+#define ROL_PDEOPT_ASSEMBLERK_H
+
+#include "Teuchos_GlobalMPISession.hpp"
+#include "Teuchos_TimeMonitor.hpp"
+
+#include "Tpetra_MultiVector.hpp"
+#include "Tpetra_Vector.hpp"
+#include "Tpetra_CrsGraph.hpp"
+#include "Tpetra_CrsMatrix.hpp"
+#include "Tpetra_Version.hpp"
+#include "Tpetra_RowMatrixTransposer.hpp"
+#include "MatrixMarket_Tpetra.hpp"
+
+#include "Intrepid2_DefaultCubatureFactory.hpp"
+
+#include "feK.hpp"
+#include "feK_curl.hpp"
+#include "pdeK.hpp"
+#include "dynpdeK.hpp"
+#include "qoiK.hpp"
+#include "dofmanagerK.hpp"
+#include "meshmanagerK.hpp"
+#include "fieldhelperK.hpp"
+
+//// Global Timers.
+#ifdef ROL_TIMERS
+namespace ROL {
+  namespace PDEOPT {
+    ROL::Ptr<Teuchos::Time> AssemblePDEResidual         = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble PDE Residual");
+    ROL::Ptr<Teuchos::Time> AssemblePDEJacobian1        = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble PDE Jacobian1");
+    ROL::Ptr<Teuchos::Time> AssemblePDEJacobian2        = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble PDE Jacobian2");
+    ROL::Ptr<Teuchos::Time> AssemblePDEJacobian3        = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble PDE Jacobian3");
+    ROL::Ptr<Teuchos::Time> AssemblePDEHessian11        = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble PDE Hessian11");
+    ROL::Ptr<Teuchos::Time> AssemblePDEHessian12        = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble PDE Hessian12");
+    ROL::Ptr<Teuchos::Time> AssemblePDEHessian13        = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble PDE Hessian13");
+    ROL::Ptr<Teuchos::Time> AssemblePDEHessian21        = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble PDE Hessian21");
+    ROL::Ptr<Teuchos::Time> AssemblePDEHessian22        = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble PDE Hessian22");
+    ROL::Ptr<Teuchos::Time> AssemblePDEHessian23        = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble PDE Hessian23");
+    ROL::Ptr<Teuchos::Time> AssemblePDEHessian31        = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble PDE Hessian31");
+    ROL::Ptr<Teuchos::Time> AssemblePDEHessian32        = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble PDE Hessian32");
+    ROL::Ptr<Teuchos::Time> AssemblePDEHessian33        = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble PDE Hessian33");
+    ROL::Ptr<Teuchos::Time> AssembleDynPDEResidual      = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble DynamicPDE Residual");
+    ROL::Ptr<Teuchos::Time> AssembleDynPDEJacobian_uo   = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble DynamicPDE Jacobian_uo");
+    ROL::Ptr<Teuchos::Time> AssembleDynPDEJacobian_un   = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble DynamicPDE Jacobian_un");
+    ROL::Ptr<Teuchos::Time> AssembleDynPDEJacobian_zf   = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble DynamicPDE Jacobian_zf");
+    ROL::Ptr<Teuchos::Time> AssembleDynPDEJacobian_zp   = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble DynamicPDE Jacobian_zp");
+    ROL::Ptr<Teuchos::Time> AssembleDynPDEHessian_uo_uo = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble DynamicPDE Hessian_uo_uo");
+    ROL::Ptr<Teuchos::Time> AssembleDynPDEHessian_uo_un = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble DynamicPDE Hessian_uo_un");
+    ROL::Ptr<Teuchos::Time> AssembleDynPDEHessian_uo_zf = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble DynamicPDE Hessian_uo_zf");
+    ROL::Ptr<Teuchos::Time> AssembleDynPDEHessian_uo_zp = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble DynamicPDE Hessian_uo_zp");
+    ROL::Ptr<Teuchos::Time> AssembleDynPDEHessian_un_uo = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble DynamicPDE Hessian_un_uo");
+    ROL::Ptr<Teuchos::Time> AssembleDynPDEHessian_un_un = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble DynamicPDE Hessian_un_un");
+    ROL::Ptr<Teuchos::Time> AssembleDynPDEHessian_un_zf = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble DynamicPDE Hessian_un_zf");
+    ROL::Ptr<Teuchos::Time> AssembleDynPDEHessian_un_zp = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble DynamicPDE Hessian_un_zp");
+    ROL::Ptr<Teuchos::Time> AssembleDynPDEHessian_zf_uo = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble DynamicPDE Hessian_zf_uo");
+    ROL::Ptr<Teuchos::Time> AssembleDynPDEHessian_zf_un = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble DynamicPDE Hessian_zf_un");
+    ROL::Ptr<Teuchos::Time> AssembleDynPDEHessian_zf_zf = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble DynamicPDE Hessian_zf_zf");
+    ROL::Ptr<Teuchos::Time> AssembleDynPDEHessian_zf_zp = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble DynamicPDE Hessian_zf_zp");
+    ROL::Ptr<Teuchos::Time> AssembleDynPDEHessian_zp_uo = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble DynamicPDE Hessian_zp_uo");
+    ROL::Ptr<Teuchos::Time> AssembleDynPDEHessian_zp_un = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble DynamicPDE Hessian_zp_un");
+    ROL::Ptr<Teuchos::Time> AssembleDynPDEHessian_zp_zf = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble DynamicPDE Hessian_zp_zf");
+    ROL::Ptr<Teuchos::Time> AssembleDynPDEHessian_zp_zp = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble DynamicPDE Hessian_zp_zp");
+    ROL::Ptr<Teuchos::Time> AssembleQOIValue            = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble QOI Value");
+    ROL::Ptr<Teuchos::Time> AssembleQOIGradient1        = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble QOI Gradient1");
+    ROL::Ptr<Teuchos::Time> AssembleQOIGradient2        = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble QOI Gradient2");
+    ROL::Ptr<Teuchos::Time> AssembleQOIGradient3        = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble QOI Gradient3");
+    ROL::Ptr<Teuchos::Time> AssembleQOIHessVec11        = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble QOI HessVec11");
+    ROL::Ptr<Teuchos::Time> AssembleQOIHessVec12        = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble QOI HessVec12");
+    ROL::Ptr<Teuchos::Time> AssembleQOIHessVec13        = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble QOI HessVec13");
+    ROL::Ptr<Teuchos::Time> AssembleQOIHessVec21        = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble QOI HessVec21");
+    ROL::Ptr<Teuchos::Time> AssembleQOIHessVec22        = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble QOI HessVec22");
+    ROL::Ptr<Teuchos::Time> AssembleQOIHessVec23        = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble QOI HessVec23");
+    ROL::Ptr<Teuchos::Time> AssembleQOIHessVec31        = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble QOI HessVec31");
+    ROL::Ptr<Teuchos::Time> AssembleQOIHessVec32        = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble QOI HessVec32");
+    ROL::Ptr<Teuchos::Time> AssembleQOIHessVec33        = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble QOI HessVec33");
+    ROL::Ptr<Teuchos::Time> AssembleQOIHessian11        = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble QOI Hessian11");
+    ROL::Ptr<Teuchos::Time> AssembleQOIHessian12        = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble QOI Hessian12");
+    ROL::Ptr<Teuchos::Time> AssembleQOIHessian21        = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble QOI Hessian21");
+    ROL::Ptr<Teuchos::Time> AssembleQOIHessian22        = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Assemble QOI Hessian22");
+  }
+}
+#endif
+
+template<class Real>
+class Solution {
+public:
+
+  virtual ~Solution() {}
+  Solution(void) {}
+  virtual Real evaluate(const std::vector<Real> &x, const int fieldNumber) const = 0;
+};
+
+template<class Real, class DeviceType>
+class Assembler {
+
+public:
+
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using int_view = Kokkos::DynRankView<int, DeviceType>;
+  using basis_ptr = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using MeshManager_type = MeshManager<Real, DeviceType>;
+  using DofManager_type = DofManager<Real, DeviceType>;
+  using QoI_type = QoI<Real, DeviceType>;
+  using PDE_type = PDE<Real, DeviceType>;
+  using DynPDE_type = DynamicPDE<Real, DeviceType>;
+  using FE_type = FE<Real, DeviceType>;
+  using FE_CURL_type = FE_CURL<Real, DeviceType>;
+  using FieldHelper_type = FieldHelper<Real, DeviceType>;
+
+  using fst = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct = Intrepid2::CellTools<DeviceType>;
+
+
+  typedef Tpetra::Map<>::local_ordinal_type LO;
+  typedef Tpetra::Map<>::global_ordinal_type GO;
+  typedef Tpetra::Map<>::node_type NO;
+  typedef Tpetra::MultiVector<Real,LO,GO,NO> MV;
+  typedef Tpetra::Operator<Real,LO,GO,NO> OP;
+
+private:
+  // Timers
+//  ROL::Ptr<Teuchos::Time::Time> timerSolverFactorization_;
+//  ROL::Ptr<Teuchos::Time::Time> timerSolverSubstitution_;
+//  ROL::Ptr<Teuchos::Time::Time> timerAssemblyNonlinear_;
+//  ROL::Ptr<Teuchos::Time::Time> timerSolverUpdate_;
+
+  // Set in Constructor.
+  bool verbose_;
+  bool isJ1Transposed_, isJ2Transposed_, isJuoTransposed_, isJunTransposed_, isJzfTransposed_;
+
+  // Set in SetCommunicator.
+  ROL::Ptr<const Teuchos::Comm<int>> comm_;
+  int myRank_, numProcs_;
+
+  // Set in SetBasis.
+  std::vector<basis_ptr> basisPtrs1_, basisPtrs2_;
+
+  // Set in SetDiscretization.
+  ROL::Ptr<MeshManager_type> meshMgr_;
+  ROL::Ptr<DofManager_type>  dofMgr1_, dofMgr2_;
+
+  // Set in SetParallelStructure.
+  int numCells_;
+  Teuchos::Array<GO> myCellIds_;
+  Teuchos::Array<GO> cellOffsets_;
+  ROL::Ptr<const Tpetra::Map<>> myOverlapStateMap_;
+  ROL::Ptr<const Tpetra::Map<>> myUniqueStateMap_;
+  ROL::Ptr<const Tpetra::Map<>> myOverlapControlMap_;
+  ROL::Ptr<const Tpetra::Map<>> myUniqueControlMap_;
+  ROL::Ptr<const Tpetra::Map<>> myOverlapResidualMap_;
+  ROL::Ptr<const Tpetra::Map<>> myUniqueResidualMap_;
+  ROL::Ptr<Tpetra::CrsGraph<>>  matJ1Graph_;
+  ROL::Ptr<Tpetra::CrsGraph<>>  matJ2Graph_;
+  ROL::Ptr<Tpetra::CrsGraph<>>  matR1Graph_;
+  ROL::Ptr<Tpetra::CrsGraph<>>  matR2Graph_;
+  ROL::Ptr<Tpetra::CrsGraph<>>  matH11Graph_;
+  ROL::Ptr<Tpetra::CrsGraph<>>  matH12Graph_;
+  ROL::Ptr<Tpetra::CrsGraph<>>  matH21Graph_;
+  ROL::Ptr<Tpetra::CrsGraph<>>  matH22Graph_;
+
+  // Set in SetCellNodes.
+  scalar_view volCellNodes_;
+  ROL::Ptr<std::vector<std::vector<std::vector<int>>>>  bdryCellIds_;
+  std::vector<std::vector<std::vector<int>>>  bdryCellLocIds_;
+  std::vector<std::vector<scalar_view>> bdryCellNodes_;
+
+  // Finite element vectors and matrices for PDE.
+  ROL::Ptr<Tpetra::MultiVector<>> pde_vecR_overlap_;
+  ROL::Ptr<Tpetra::MultiVector<>> pde_vecJ2_overlap_;
+  ROL::Ptr<Tpetra::MultiVector<>> pde_vecJ3_overlap_;
+  ROL::Ptr<Tpetra::MultiVector<>> pde_vecH13_overlap_;
+  ROL::Ptr<Tpetra::MultiVector<>> pde_vecH23_overlap_;
+
+  // Finite element vectors and matrices for QoI.
+  ROL::Ptr<Tpetra::MultiVector<>> qoi_vecG1_overlap_;
+  ROL::Ptr<Tpetra::MultiVector<>> qoi_vecG2_overlap_;
+  ROL::Ptr<Tpetra::MultiVector<>> qoi_vecH11_overlap_;
+  ROL::Ptr<Tpetra::MultiVector<>> qoi_vecH12_overlap_;
+  ROL::Ptr<Tpetra::MultiVector<>> qoi_vecH13_overlap_;
+  ROL::Ptr<Tpetra::MultiVector<>> qoi_vecH21_overlap_;
+  ROL::Ptr<Tpetra::MultiVector<>> qoi_vecH22_overlap_;
+  ROL::Ptr<Tpetra::MultiVector<>> qoi_vecH23_overlap_;
+
+  bool store_;
+
+private:
+
+  void setCommunicator(const ROL::Ptr<const Teuchos::Comm<int>> &comm,
+                       Teuchos::ParameterList &parlist,
+                       std::ostream &outStream = std::cout);
+  void setBasis(
+         const std::vector<basis_ptr>& basisPtrs1,
+         const std::vector<basis_ptr>& basisPtrs2,
+         Teuchos::ParameterList &parlist,
+         std::ostream &outStream = std::cout);
+  void setDiscretization(Teuchos::ParameterList &parlist,
+                         const ROL::Ptr<MeshManager_type> &meshMgr = ROL::nullPtr,
+                         std::ostream &outStream = std::cout);
+  void setParallelStructure(Teuchos::ParameterList &parlist,
+                            std::ostream &outStream = std::cout);
+  void setCellNodes(std::ostream &outStream = std::cout);
+  int mapGlobalToLocalCellId(const int & gid);
+  void getCoeffFromStateVector(scalar_view& xcoeff,
+                               const ROL::Ptr<const Tpetra::MultiVector<>> &x) const;
+  void getCoeffFromControlVector(scalar_view& xcoeff,
+                                 const ROL::Ptr<const Tpetra::MultiVector<>> &x) const;
+
+  /***************************************************************************/
+  /****** GENERIC ASSEMBLY ROUTINES ******************************************/
+  /***************************************************************************/
+  Real assembleScalar(const scalar_view val);
+  void assembleFieldVector(ROL::Ptr<Tpetra::MultiVector<>> &v,
+                           scalar_view val,
+                           ROL::Ptr<Tpetra::MultiVector<>> &vecOverlap,
+                           const ROL::Ptr<DofManager_type> &dofMgr);
+  void assembleParamVector(ROL::Ptr<std::vector<Real>> &v,
+                           std::vector<scalar_view> &val);
+  void assembleFieldMatrix(ROL::Ptr<Tpetra::CrsMatrix<>> &M,
+                           scalar_view val,
+                           const ROL::Ptr<DofManager_type> &dofMgr1,
+                           const ROL::Ptr<DofManager_type> &dofMgr2);
+  void assembleParamFieldMatrix(ROL::Ptr<Tpetra::MultiVector<>> &M,
+                                std::vector<scalar_view> &val,
+                                ROL::Ptr<Tpetra::MultiVector<>> &matOverlap,
+                                const ROL::Ptr<DofManager_type> &dofMgr);
+  void assembleParamMatrix(ROL::Ptr<std::vector<std::vector<Real>>> &M,
+                           std::vector<std::vector<scalar_view>> &val,
+                           const ROL::Ptr<DofManager_type> &dofMgr);
+  void transformToFieldPattern(scalar_view array,
+                               const ROL::Ptr<DofManager_type> &dofMgr1,
+                               const ROL::Ptr<DofManager_type> &dofMgr2 = ROL::nullPtr) const;
+
+public:
+  // destructor
+  virtual ~Assembler() {}
+  // Constuctor: Discretization set from ParameterList
+  Assembler(const std::vector<basis_ptr> &basisPtrs,
+          const ROL::Ptr<const Teuchos::Comm<int>> &comm,
+          Teuchos::ParameterList &parlist,
+          std::ostream &outStream = std::cout);
+  // Constructor: Discretization set from MeshManager input
+  Assembler(const std::vector<basis_ptr> &basisPtrs,
+          const ROL::Ptr<MeshManager_type> &meshMgr,
+          const ROL::Ptr<const Teuchos::Comm<int>> &comm,
+          Teuchos::ParameterList &parlist,
+          std::ostream &outStream = std::cout);
+  // Constuctor: Discretization set from ParameterList
+  Assembler(const std::vector<basis_ptr> &basisPtrs1,
+          const std::vector<basis_ptr> &basisPtrs2,
+          const ROL::Ptr<const Teuchos::Comm<int>> &comm,
+          Teuchos::ParameterList &parlist,
+          std::ostream &outStream = std::cout);
+  // Constructor: Discretization set from MeshManager input
+  Assembler(const std::vector<basis_ptr> &basisPtrs1,
+          const std::vector<basis_ptr> &basisPtrs2,
+          const ROL::Ptr<MeshManager_type> &meshMgr,
+          const ROL::Ptr<const Teuchos::Comm<int>> &comm,
+          Teuchos::ParameterList &parlist,
+          std::ostream &outStream = std::cout);
+  void setCellNodes(PDE_type &pde) const;
+  void setCellNodes(DynPDE_type &pde) const;
+
+  /***************************************************************************/
+  /* PDE assembly routines                                                   */
+  /***************************************************************************/
+  void assemblePDEResidual(ROL::Ptr<Tpetra::MultiVector<>> &r,
+                           const ROL::Ptr<PDE_type> &pde,
+                           const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                           const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                           const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr);
+  void assemblePDEJacobian1(ROL::Ptr<Tpetra::CrsMatrix<>> &J1,
+                            const ROL::Ptr<PDE_type> &pde,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                            const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr);
+  void assemblePDEJacobian2(ROL::Ptr<Tpetra::CrsMatrix<>> &J2,
+                            const ROL::Ptr<PDE_type> &pde,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                            const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr);
+  void assemblePDEJacobian3(ROL::Ptr<Tpetra::MultiVector<>> &J3,
+                            const ROL::Ptr<PDE_type> &pde,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                            const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr);
+  void assemblePDEapplyJacobian1(ROL::Ptr<Tpetra::MultiVector<>> &Jv1,
+                                 const ROL::Ptr<PDE_type> &pde,
+                                 const ROL::Ptr<const Tpetra::MultiVector<>> &v,
+                                 const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                 const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                                 const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr);
+  void assemblePDEapplyAdjointJacobian1(ROL::Ptr<Tpetra::MultiVector<>> &Jv1,
+                                        const ROL::Ptr<PDE_type> &pde,
+                                        const ROL::Ptr<const Tpetra::MultiVector<>> &v,
+                                        const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                        const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                                        const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr);
+  void assemblePDEapplyJacobian2(ROL::Ptr<Tpetra::MultiVector<>> &Jv2,
+                                 const ROL::Ptr<PDE_type> &pde,
+                                 const ROL::Ptr<const Tpetra::MultiVector<>> &v,
+                                 const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                 const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                                 const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr);
+  void assemblePDEapplyAdjointJacobian2(ROL::Ptr<Tpetra::MultiVector<>> &Jv2,
+                                        const ROL::Ptr<PDE_type> &pde,
+                                        const ROL::Ptr<const Tpetra::MultiVector<>> &v,
+                                        const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                        const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                                        const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr);
+  void assemblePDEHessian11(ROL::Ptr<Tpetra::CrsMatrix<>> &H11,
+                            const ROL::Ptr<PDE_type> &pde,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                            const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr);
+  void assemblePDEHessian12(ROL::Ptr<Tpetra::CrsMatrix<>> &H12,
+                            const ROL::Ptr<PDE_type> &pde,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                            const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr);
+  void assemblePDEHessian13(ROL::Ptr<Tpetra::MultiVector<>> &H13,
+                            const ROL::Ptr<PDE_type> &pde,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                            const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr);
+  void assemblePDEHessian21(ROL::Ptr<Tpetra::CrsMatrix<>> &H21,
+                            const ROL::Ptr<PDE_type> &pde,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                            const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr);
+  void assemblePDEHessian22(ROL::Ptr<Tpetra::CrsMatrix<>> &H22,
+                            const ROL::Ptr<PDE_type> &pde,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                            const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr);
+  void assemblePDEHessian23(ROL::Ptr<Tpetra::MultiVector<>> &H23,
+                            const ROL::Ptr<PDE_type> &pde,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                            const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr);
+  void assemblePDEHessian31(ROL::Ptr<Tpetra::MultiVector<>> &H31,
+                            const ROL::Ptr<PDE_type> &pde,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                            const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr);
+  void assemblePDEHessian32(ROL::Ptr<Tpetra::MultiVector<>> &H32,
+                            const ROL::Ptr<PDE_type> &pde,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                            const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr);
+  void assemblePDEHessian33(ROL::Ptr<std::vector<std::vector<Real>>> &H33,
+                            const ROL::Ptr<PDE_type> &pde,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                            const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr);
+  void assemblePDEapplyHessian11(ROL::Ptr<Tpetra::MultiVector<>> &Hv,
+                                 const ROL::Ptr<PDE_type> &pde,
+                                 const ROL::Ptr<const Tpetra::MultiVector<>> &v,
+                                 const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                 const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                 const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                                 const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr);
+  void assemblePDEapplyHessian12(ROL::Ptr<Tpetra::MultiVector<>> &Hv,
+                                 const ROL::Ptr<PDE_type> &pde,
+                                 const ROL::Ptr<const Tpetra::MultiVector<>> &v,
+                                 const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                 const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                 const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                                 const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr);
+  void assemblePDEapplyHessian21(ROL::Ptr<Tpetra::MultiVector<>> &Hv,
+                                 const ROL::Ptr<PDE_type> &pde,
+                                 const ROL::Ptr<const Tpetra::MultiVector<>> &v,
+                                 const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                 const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                 const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                                 const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr);
+  void assemblePDEapplyHessian22(ROL::Ptr<Tpetra::MultiVector<>> &Hv,
+                                 const ROL::Ptr<PDE_type> &pde,
+                                 const ROL::Ptr<const Tpetra::MultiVector<>> &v,
+                                 const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                 const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                 const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                                 const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr);
+  /***************************************************************************/
+  /* End of PDE assembly routines.                                           */
+  /***************************************************************************/
+
+  /***************************************************************************/
+  /* Dynamic PDE assembly routines                                           */
+  /***************************************************************************/
+  void assembleDynPDEResidual(ROL::Ptr<Tpetra::MultiVector<>> &r,
+                              const ROL::Ptr<DynPDE_type> &pde,
+                              const ROL::TimeStamp<Real> &ts,
+                              const ROL::Ptr<const Tpetra::MultiVector<>> &uo,
+                              const ROL::Ptr<const Tpetra::MultiVector<>> &un,
+                              const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                              const ROL::Ptr<const std::vector<Real>> &z_param = ROL::nullPtr);
+  void assembleDynPDEJacobian_uo(ROL::Ptr<Tpetra::CrsMatrix<>> &J,
+                                 const ROL::Ptr<DynPDE_type> &pde,
+                                 const ROL::TimeStamp<Real> &ts,
+                                 const ROL::Ptr<const Tpetra::MultiVector<>> &uo,
+                                 const ROL::Ptr<const Tpetra::MultiVector<>> &un,
+                                 const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                                 const ROL::Ptr<const std::vector<Real>> &z_param = ROL::nullPtr);
+  void assembleDynPDEJacobian_un(ROL::Ptr<Tpetra::CrsMatrix<>> &J,
+                                 const ROL::Ptr<DynPDE_type> &pde,
+                                 const ROL::TimeStamp<Real> &ts,
+                                 const ROL::Ptr<const Tpetra::MultiVector<>> &uo,
+                                 const ROL::Ptr<const Tpetra::MultiVector<>> &un,
+                                 const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                                 const ROL::Ptr<const std::vector<Real>> &z_param = ROL::nullPtr);
+  void assembleDynPDEJacobian_zf(ROL::Ptr<Tpetra::CrsMatrix<>> &J,
+                                 const ROL::Ptr<DynPDE_type> &pde,
+                                 const ROL::TimeStamp<Real> &ts,
+                                 const ROL::Ptr<const Tpetra::MultiVector<>> &uo,
+                                 const ROL::Ptr<const Tpetra::MultiVector<>> &un,
+                                 const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                                 const ROL::Ptr<const std::vector<Real>> &z_param = ROL::nullPtr);
+  void assembleDynPDEJacobian_zp(ROL::Ptr<Tpetra::MultiVector<>> &J,
+                                 const ROL::Ptr<DynPDE_type> &pde,
+                                 const ROL::TimeStamp<Real> &ts,
+                                 const ROL::Ptr<const Tpetra::MultiVector<>> &uo,
+                                 const ROL::Ptr<const Tpetra::MultiVector<>> &un,
+                                 const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                                 const ROL::Ptr<const std::vector<Real>> &z_param = ROL::nullPtr);
+  void assembleDynPDEHessian_uo_uo(ROL::Ptr<Tpetra::CrsMatrix<>> &H,
+                                   const ROL::Ptr<DynPDE_type> &pde,
+                                   const ROL::TimeStamp<Real> &ts,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &uo,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &un,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                                   const ROL::Ptr<const std::vector<Real>> &z_param = ROL::nullPtr);
+  void assembleDynPDEHessian_uo_un(ROL::Ptr<Tpetra::CrsMatrix<>> &H,
+                                   const ROL::Ptr<DynPDE_type> &pde,
+                                   const ROL::TimeStamp<Real> &ts,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &uo,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &un,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                                   const ROL::Ptr<const std::vector<Real>> &z_param = ROL::nullPtr);
+  void assembleDynPDEHessian_uo_zf(ROL::Ptr<Tpetra::CrsMatrix<>> &H,
+                                   const ROL::Ptr<DynPDE_type> &pde,
+                                   const ROL::TimeStamp<Real> &ts,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &uo,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &un,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                                   const ROL::Ptr<const std::vector<Real>> &z_param = ROL::nullPtr);
+  void assembleDynPDEHessian_uo_zp(ROL::Ptr<Tpetra::MultiVector<>> &H,
+                                   const ROL::Ptr<DynPDE_type> &pde,
+                                   const ROL::TimeStamp<Real> &ts,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &uo,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &un,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                                   const ROL::Ptr<const std::vector<Real>> &z_param = ROL::nullPtr);
+  void assembleDynPDEHessian_un_uo(ROL::Ptr<Tpetra::CrsMatrix<>> &H,
+                                   const ROL::Ptr<DynPDE_type> &pde,
+                                   const ROL::TimeStamp<Real> &ts,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &uo,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &un,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                                   const ROL::Ptr<const std::vector<Real>> &z_param = ROL::nullPtr);
+  void assembleDynPDEHessian_un_un(ROL::Ptr<Tpetra::CrsMatrix<>> &H,
+                                   const ROL::Ptr<DynPDE_type> &pde,
+                                   const ROL::TimeStamp<Real> &ts,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &uo,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &un,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                                   const ROL::Ptr<const std::vector<Real>> &z_param = ROL::nullPtr);
+  void assembleDynPDEHessian_un_zf(ROL::Ptr<Tpetra::CrsMatrix<>> &H,
+                                   const ROL::Ptr<DynPDE_type> &pde,
+                                   const ROL::TimeStamp<Real> &ts,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &uo,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &un,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                                   const ROL::Ptr<const std::vector<Real>> &z_param = ROL::nullPtr);
+  void assembleDynPDEHessian_un_zp(ROL::Ptr<Tpetra::MultiVector<>> &H,
+                                   const ROL::Ptr<DynPDE_type> &pde,
+                                   const ROL::TimeStamp<Real> &ts,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &uo,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &un,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                                   const ROL::Ptr<const std::vector<Real>> &z_param = ROL::nullPtr);
+  void assembleDynPDEHessian_zf_uo(ROL::Ptr<Tpetra::CrsMatrix<>> &H,
+                                   const ROL::Ptr<DynPDE_type> &pde,
+                                   const ROL::TimeStamp<Real> &ts,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &uo,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &un,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                                   const ROL::Ptr<const std::vector<Real>> &z_param = ROL::nullPtr);
+  void assembleDynPDEHessian_zf_un(ROL::Ptr<Tpetra::CrsMatrix<>> &H,
+                                   const ROL::Ptr<DynPDE_type> &pde,
+                                   const ROL::TimeStamp<Real> &ts,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &uo,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &un,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                                   const ROL::Ptr<const std::vector<Real>> &z_param = ROL::nullPtr);
+  void assembleDynPDEHessian_zf_zf(ROL::Ptr<Tpetra::CrsMatrix<>> &H,
+                                   const ROL::Ptr<DynPDE_type> &pde,
+                                   const ROL::TimeStamp<Real> &ts,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &uo,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &un,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                                   const ROL::Ptr<const std::vector<Real>> &z_param = ROL::nullPtr);
+  void assembleDynPDEHessian_zf_zp(ROL::Ptr<Tpetra::MultiVector<>> &H,
+                                   const ROL::Ptr<DynPDE_type> &pde,
+                                   const ROL::TimeStamp<Real> &ts,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &uo,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &un,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                                   const ROL::Ptr<const std::vector<Real>> &z_param = ROL::nullPtr);
+  void assembleDynPDEHessian_zp_uo(ROL::Ptr<Tpetra::MultiVector<>> &H,
+                                   const ROL::Ptr<DynPDE_type> &pde,
+                                   const ROL::TimeStamp<Real> &ts,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &uo,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &un,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                                   const ROL::Ptr<const std::vector<Real>> &z_param = ROL::nullPtr);
+  void assembleDynPDEHessian_zp_un(ROL::Ptr<Tpetra::MultiVector<>> &H,
+                                   const ROL::Ptr<DynPDE_type> &pde,
+                                   const ROL::TimeStamp<Real> &ts,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &uo,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &un,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                                   const ROL::Ptr<const std::vector<Real>> &z_param = ROL::nullPtr);
+  void assembleDynPDEHessian_zp_zf(ROL::Ptr<Tpetra::MultiVector<>> &H,
+                                   const ROL::Ptr<DynPDE_type> &pde,
+                                   const ROL::TimeStamp<Real> &ts,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &uo,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &un,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                                   const ROL::Ptr<const std::vector<Real>> &z_param = ROL::nullPtr);
+  void assembleDynPDEHessian_zp_zp(ROL::Ptr<std::vector<std::vector<Real>>> &H,
+                                   const ROL::Ptr<DynPDE_type> &pde,
+                                   const ROL::TimeStamp<Real> &ts,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &uo,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &un,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                                   const ROL::Ptr<const std::vector<Real>> &z_param = ROL::nullPtr);
+  /***************************************************************************/
+  /* End of DynamicPDE assembly routines.                                    */
+  /***************************************************************************/
+
+  /***************************************************************************/
+  /* QoI assembly routines                                                   */
+  /***************************************************************************/
+  Real assembleQoIValue(const ROL::Ptr<QoI_type> &qoi,
+                        const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                        const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                        const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr);
+  void assembleQoIGradient1(ROL::Ptr<Tpetra::MultiVector<>> &g1,
+                            const ROL::Ptr<QoI_type> &qoi,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                            const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr);
+  void assembleQoIGradient2(ROL::Ptr<Tpetra::MultiVector<>> &g2,
+                            const ROL::Ptr<QoI_type> &qoi,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                            const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr);
+  void assembleQoIGradient3(ROL::Ptr<std::vector<Real>> &g3,
+                            const ROL::Ptr<QoI_type> &qoi,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                            const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr);
+  void assembleQoIHessVec11(ROL::Ptr<Tpetra::MultiVector<>> &H11,
+                            const ROL::Ptr<QoI_type> &qoi,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &v,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                            const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr);
+  void assembleQoIHessVec12(ROL::Ptr<Tpetra::MultiVector<>> &H12,
+                            const ROL::Ptr<QoI_type> &qoi,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &v,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                            const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr);
+  void assembleQoIHessVec13(ROL::Ptr<Tpetra::MultiVector<>> &H13,
+                            const ROL::Ptr<QoI_type> &qoi,
+                            const ROL::Ptr<const std::vector<Real>> &v,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                            const ROL::Ptr<const std::vector<Real>> &z_param = ROL::nullPtr);
+  void assembleQoIHessVec21(ROL::Ptr<Tpetra::MultiVector<>> &H21,
+                            const ROL::Ptr<QoI_type> &qoi,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &v,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                            const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr);
+  void assembleQoIHessVec22(ROL::Ptr<Tpetra::MultiVector<>> &H22,
+                            const ROL::Ptr<QoI_type> &qoi,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &v,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                            const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr);
+  void assembleQoIHessVec23(ROL::Ptr<Tpetra::MultiVector<>> &H23,
+                            const ROL::Ptr<QoI_type> &qoi,
+                            const ROL::Ptr<const std::vector<Real>> &v,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                            const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr);
+  void assembleQoIHessVec31(ROL::Ptr<std::vector<Real>> &H31,
+                            const ROL::Ptr<QoI_type> &qoi,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &v,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                            const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr);
+  void assembleQoIHessVec32(ROL::Ptr<std::vector<Real>> &H32,
+                            const ROL::Ptr<QoI_type> &qoi,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &v,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                            const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr);
+  void assembleQoIHessVec33(ROL::Ptr<std::vector<Real>> &H33,
+                            const ROL::Ptr<QoI_type> &qoi,
+                            const ROL::Ptr<const std::vector<Real>> &v,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                            const ROL::Ptr<const std::vector<Real>> &z_param = ROL::nullPtr);
+  void assembleQoIHessian11(ROL::Ptr<Tpetra::CrsMatrix<>> &H11,
+                            const ROL::Ptr<QoI_type> &qoi,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                            const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr);
+  void assembleQoIHessian12(ROL::Ptr<Tpetra::CrsMatrix<>> &H12,
+                            const ROL::Ptr<QoI_type> &qoi,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                            const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr);
+  void assembleQoIHessian21(ROL::Ptr<Tpetra::CrsMatrix<>> &H21,
+                            const ROL::Ptr<QoI_type> &qoi,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                            const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr);
+  void assembleQoIHessian22(ROL::Ptr<Tpetra::CrsMatrix<>> &H22,
+                            const ROL::Ptr<QoI_type> &qoi,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                            const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr);
+  /***************************************************************************/
+  /* End QoI assembly routines                                               */
+  /***************************************************************************/
+
+  /***************************************************************************/
+  /* Assemble and apply Riesz operator corresponding to simulation variables */
+  /***************************************************************************/
+  void assemblePDERieszMap1(ROL::Ptr<Tpetra::CrsMatrix<>> &R1,
+                            const ROL::Ptr<PDE_type> &pde);
+  void assembleDynPDERieszMap1(ROL::Ptr<Tpetra::CrsMatrix<>> &R1,
+                               const ROL::Ptr<DynPDE_type> &pde);
+  /***************************************************************************/
+  /* End of functions for Riesz operator of simulation variables.            */
+  /***************************************************************************/
+
+  /***************************************************************************/
+  /* Assemble and apply Riesz operator corresponding to optimization         */
+  /* variables                                                               */
+  /***************************************************************************/
+  void assemblePDERieszMap2(ROL::Ptr<Tpetra::CrsMatrix<>> &R2,
+                            const ROL::Ptr<PDE_type> &pde);
+  void assembleDynPDERieszMap2(ROL::Ptr<Tpetra::CrsMatrix<>> &R2,
+                               const ROL::Ptr<DynPDE_type> &pde);
+  /***************************************************************************/
+  /* End of functions for Riesz operator of optimization variables.          */
+  /***************************************************************************/
+
+  /***************************************************************************/
+  /* Compute error routines.                                                 */
+  /***************************************************************************/
+  Real computeStateError(const ROL::Ptr<const Tpetra::MultiVector<>> &soln,
+                         const ROL::Ptr<Solution<Real>> &trueSoln,
+                         const int cubDeg = 6,
+                         const ROL::Ptr<FieldHelper_type> &fieldHelper = ROL::nullPtr) const;
+  Real computeControlError(const ROL::Ptr<const Tpetra::MultiVector<>> &soln,
+                           const ROL::Ptr<Solution<Real>> &trueSoln,
+                           const int cubDeg = 6,
+                           const ROL::Ptr<FieldHelper_type> &fieldHelper = ROL::nullPtr) const;
+  /***************************************************************************/
+  /* End of compute solution routines.                                       */
+  /***************************************************************************/
+
+  /***************************************************************************/
+  /* Output routines.                                                        */
+  /***************************************************************************/
+  void printMeshData(std::ostream &outStream) const;
+  void outputTpetraVector(const ROL::Ptr<const Tpetra::MultiVector<>> &vec,
+                          const std::string &filename) const;
+  void inputTpetraVector(ROL::Ptr<Tpetra::MultiVector<>> &vec,
+                         const std::string &filename) const;
+  void printDataPDE(const ROL::Ptr<PDE_type> &pde,
+                    const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                    const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                    const ROL::Ptr<const std::vector<Real>> &z_param = ROL::nullPtr) const;
+  void printCellAveragesPDE(const ROL::Ptr<PDE_type> &pde,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                            const ROL::Ptr<const Tpetra::MultiVector<>> &z = ROL::nullPtr,
+                            const ROL::Ptr<const std::vector<Real>> &z_param = ROL::nullPtr) const;
+  void serialPrintStateEdgeField(const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                 const ROL::Ptr<FieldHelper_type> &fieldHelper,
+                                 const std::string &filename,
+                                 const ROL::Ptr<FE_CURL_type> &fe) const;
+  /***************************************************************************/
+  /* End of output routines.                                                 */
+  /***************************************************************************/
+
+  /***************************************************************************/
+  /* Vector generation routines.                                             */
+  /***************************************************************************/
+  const ROL::Ptr<const Tpetra::Map<>> getStateMap(void) const;
+  const ROL::Ptr<const Tpetra::Map<>> getControlMap(void) const;
+  const ROL::Ptr<const Tpetra::Map<>> getResidualMap(void) const;
+  ROL::Ptr<Tpetra::MultiVector<>> createStateVector(void) const;
+  ROL::Ptr<Tpetra::MultiVector<>> createControlVector(void) const;
+  ROL::Ptr<Tpetra::MultiVector<>> createResidualVector(void) const;
+  /***************************************************************************/
+  /* End of vector generation routines.                                      */
+  /***************************************************************************/
+
+  /***************************************************************************/
+  /* Accessor routines.                                                      */
+  /***************************************************************************/
+  const ROL::Ptr<DofManager_type> getDofManager(void) const;
+  const ROL::Ptr<DofManager_type> getDofManager2(void) const;
+  Teuchos::Array<GO> getCellIds(void) const;
+  /***************************************************************************/
+  /* End of accessor routines.                                               */
+  /***************************************************************************/
+}; // class Assembler
+
+#include "assemblerK_def.hpp"
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/TOOLS/assemblerK_def.hpp b/packages/rol/example/PDE-OPT/TOOLS/assemblerK_def.hpp
new file mode 100644
index 000000000000..c2d25494db53
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/TOOLS/assemblerK_def.hpp
@@ -0,0 +1,3249 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  assemblerK_def.hpp
+    \brief Finite element assembly class.
+*/
+
+#ifndef ROL_PDEOPT_ASSEMBLERK_DEF_H
+#define ROL_PDEOPT_ASSEMBLERK_DEF_H
+
+#include "assemblerK.hpp"
+
+/*****************************************************************************/
+/******************* PUBLIC MEMBER FUNCTIONS *********************************/
+/*****************************************************************************/
+// Constuctor: Discretization set from ParameterList
+template<class Real, class DeviceType>
+Assembler<Real, DeviceType>::Assembler(
+        const std::vector<basis_ptr> &basisPtrs,
+        const ROL::Ptr<const Teuchos::Comm<int>> &comm,
+        Teuchos::ParameterList &parlist,
+        std::ostream &outStream)
+  : isJ1Transposed_(false), isJ2Transposed_(false),
+    isJuoTransposed_(false), isJunTransposed_(false), isJzfTransposed_(false) {
+  setCommunicator(comm,parlist,outStream);
+  setBasis(basisPtrs,basisPtrs,parlist,outStream);
+  setDiscretization(parlist,ROL::nullPtr,outStream);
+  setParallelStructure(parlist,outStream);
+  setCellNodes(outStream);
+  store_ = parlist.sublist("Assembler").get("Store Overlap Vectors",true);
+}
+
+// Constructor: Discretization set from MeshManager input
+template<class Real, class DeviceType>
+Assembler<Real,DeviceType>::Assembler(
+        const std::vector<basis_ptr> &basisPtrs,
+        const ROL::Ptr<MeshManager_type> &meshMgr,
+        const ROL::Ptr<const Teuchos::Comm<int>> &comm,
+        Teuchos::ParameterList &parlist,
+        std::ostream &outStream)
+  : isJ1Transposed_(false), isJ2Transposed_(false),
+    isJuoTransposed_(false), isJunTransposed_(false), isJzfTransposed_(false) {
+  setCommunicator(comm,parlist,outStream);
+  setBasis(basisPtrs,basisPtrs,parlist,outStream);
+  setDiscretization(parlist,meshMgr,outStream);
+  setParallelStructure(parlist,outStream);
+  setCellNodes(outStream);
+  store_ = parlist.sublist("Assembler").get("Store Overlap Vectors",true);
+}
+// Constuctor: Discretization set from ParameterList
+template<class Real, class DeviceType>
+Assembler<Real,DeviceType>::Assembler(
+        const std::vector<basis_ptr> &basisPtrs1,
+        const std::vector<basis_ptr> &basisPtrs2,
+        const ROL::Ptr<const Teuchos::Comm<int>> &comm,
+        Teuchos::ParameterList &parlist,
+        std::ostream &outStream)
+  : isJ1Transposed_(false), isJ2Transposed_(false),
+    isJuoTransposed_(false), isJunTransposed_(false), isJzfTransposed_(false) {
+  setCommunicator(comm,parlist,outStream);
+  setBasis(basisPtrs1,basisPtrs2,parlist,outStream);
+  setDiscretization(parlist,ROL::nullPtr,outStream);
+  setParallelStructure(parlist,outStream);
+  setCellNodes(outStream);
+  store_ = parlist.sublist("Assembler").get("Store Overlap Vectors",true);
+}
+
+// Constructor: Discretization set from MeshManager input
+template<class Real, class DeviceType>
+Assembler<Real,DeviceType>::Assembler(
+        const std::vector<basis_ptr> &basisPtrs1,
+        const std::vector<basis_ptr> &basisPtrs2,
+        const ROL::Ptr<MeshManager_type> &meshMgr,
+        const ROL::Ptr<const Teuchos::Comm<int>> &comm,
+        Teuchos::ParameterList &parlist,
+        std::ostream &outStream)
+  : isJ1Transposed_(false), isJ2Transposed_(false),
+    isJuoTransposed_(false), isJunTransposed_(false), isJzfTransposed_(false) {
+  setCommunicator(comm,parlist,outStream);
+  setBasis(basisPtrs1,basisPtrs2,parlist,outStream);
+  setDiscretization(parlist,meshMgr,outStream);
+  setParallelStructure(parlist,outStream);
+  setCellNodes(outStream);
+  store_ = parlist.sublist("Assembler").get("Store Overlap Vectors",true);
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::setCellNodes(PDE_type &pde) const {
+  // Set PDE cell nodes
+  pde.setFieldPattern(dofMgr1_->getFieldPattern(),dofMgr2_->getFieldPattern());
+  pde.setCellNodes(volCellNodes_, bdryCellNodes_, bdryCellLocIds_);
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::setCellNodes(DynPDE_type &pde) const {
+  // Set PDE cell nodes
+  pde.setFieldPattern(dofMgr1_->getFieldPattern(),dofMgr2_->getFieldPattern());
+  pde.setCellNodes(volCellNodes_, bdryCellNodes_, bdryCellLocIds_);
+}
+
+/***************************************************************************/
+/* PDE assembly routines                                                   */
+/***************************************************************************/
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assemblePDEResidual(ROL::Ptr<Tpetra::MultiVector<>> &r,
+                                    const ROL::Ptr<PDE_type> &pde,
+                                    const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                    const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                    const ROL::Ptr<const std::vector<Real>> & z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssemblePDEResidual);
+  #endif
+  // Initialize residual vectors if not done so
+  if ( r == ROL::nullPtr ) // Unique components of residual vector
+    r = ROL::makePtr<Tpetra::MultiVector<>>(myUniqueResidualMap_, 1, true);
+  ROL::Ptr<Tpetra::MultiVector<>> overlapVec;
+  if ( store_ ) {
+    if ( pde_vecR_overlap_ == ROL::nullPtr ) // Overlapping components of residual vector
+      pde_vecR_overlap_ = ROL::makePtr<Tpetra::MultiVector<>>(myOverlapResidualMap_, 1, true);
+    overlapVec = pde_vecR_overlap_;
+  }
+  else {
+    overlapVec = ROL::makePtr<Tpetra::MultiVector<>>(myOverlapResidualMap_, 1, true);
+  }
+  // Get u_coeff from u and z_coeff from z
+  scalar_view u_coeff, z_coeff;
+  getCoeffFromStateVector(u_coeff,u);
+  getCoeffFromControlVector(z_coeff,z);
+  // Assemble
+  scalar_view localVal;
+  pde->residual(localVal,u_coeff,z_coeff,z_param);
+  assembleFieldVector( r, localVal, overlapVec, dofMgr1_ );
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assemblePDEJacobian1(ROL::Ptr<Tpetra::CrsMatrix<>> &J1,
+                                     const ROL::Ptr<PDE_type> &pde,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                     const ROL::Ptr<const std::vector<Real>> & z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssemblePDEJacobian1);
+  #endif
+  try {
+    // Get u_coeff from u and z_coeff from z
+    scalar_view u_coeff, z_coeff;
+    getCoeffFromStateVector(u_coeff,u);
+    getCoeffFromControlVector(z_coeff,z);
+    // Initialize Jacobian matrices
+    if ( J1 == ROL::nullPtr ) {
+      J1 = ROL::makePtr<Tpetra::CrsMatrix<>>(matJ1Graph_);
+    }
+    // Assemble
+    scalar_view localVal;
+    pde->Jacobian_1(localVal,u_coeff,z_coeff,z_param);
+    assembleFieldMatrix( J1, localVal, dofMgr1_, dofMgr1_ );
+    isJ1Transposed_ = false;
+    // Output
+    //Tpetra::MatrixMarket::Writer< Tpetra::CrsMatrix<>> matWriter;
+    //matWriter.writeSparseFile("jacobian_1.txt", J1);
+  }
+  catch ( Exception::Zero & ez ) {
+    throw Exception::Zero(">>> (Assembler::assemblePDEJacobian1): Jacobian is zero.");
+  }
+  catch ( Exception::NotImplemented & eni ) {
+    throw Exception::NotImplemented(">>> (Assembler::assemblePDEJacobian1): Jacobian not implemented.");
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assemblePDEJacobian2(ROL::Ptr<Tpetra::CrsMatrix<>> &J2,
+                                     const ROL::Ptr<PDE_type> &pde,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                     const ROL::Ptr<const std::vector<Real>> & z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssemblePDEJacobian2);
+  #endif
+  try {
+    // Get u_coeff from u and z_coeff from z
+    scalar_view u_coeff, z_coeff;
+    getCoeffFromStateVector(u_coeff,u);
+    getCoeffFromControlVector(z_coeff,z);
+    // Initialize Jacobian matrices
+    if ( J2 == ROL::nullPtr ) {
+      J2 = ROL::makePtr<Tpetra::CrsMatrix<>>(matJ2Graph_);
+    }
+    // Assemble
+    scalar_view localVal;
+    pde->Jacobian_2(localVal,u_coeff,z_coeff,z_param);
+    assembleFieldMatrix( J2, localVal, dofMgr1_, dofMgr2_ );
+    isJ2Transposed_ = false;
+  }
+  catch ( Exception::Zero & ez ) {
+    throw Exception::Zero(">>> (Assembler::assemblePDEJacobian2): Jacobian is zero.");
+  }
+  catch ( Exception::NotImplemented & eni ) {
+    throw Exception::NotImplemented(">>> (Assembler::assemblePDEJacobian2): Jacobian not implemented.");
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assemblePDEJacobian3(ROL::Ptr<Tpetra::MultiVector<>> &J3,
+                                     const ROL::Ptr<PDE_type> &pde,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                     const ROL::Ptr<const std::vector<Real>> & z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssemblePDEJacobian3);
+  #endif
+  if ( z_param != ROL::nullPtr ) {
+    try {
+      const int size = static_cast<int>(z_param->size());
+      // Initialize Jacobian storage if not done so already
+      if (J3 == ROL::nullPtr)
+        J3 = ROL::makePtr<Tpetra::MultiVector<>>(myUniqueResidualMap_, size, true);
+      ROL::Ptr<Tpetra::MultiVector<>> overlapVec;
+      if ( store_ ) {
+        if ( pde_vecJ3_overlap_ == ROL::nullPtr)
+          pde_vecJ3_overlap_ = ROL::makePtr<Tpetra::MultiVector<>>(myOverlapResidualMap_, size, true);
+        overlapVec = pde_vecJ3_overlap_;
+      }
+      else {
+        overlapVec = ROL::makePtr<Tpetra::MultiVector<>>(myOverlapResidualMap_, size, true);
+      }
+      // Get u_coeff from u and z_coeff from z
+      scalar_view u_coeff, z_coeff;
+      getCoeffFromStateVector(u_coeff,u);
+      getCoeffFromControlVector(z_coeff,z);
+      // Assemble
+      std::vector<scalar_view> localVal(size);
+      pde->Jacobian_3(localVal,u_coeff,z_coeff,z_param);
+      assembleParamFieldMatrix( J3, localVal, overlapVec, dofMgr1_ );
+    }
+    catch ( Exception::Zero & ez ) {
+      throw Exception::Zero(">>> (Assembler::assemblePDEJacobian3): Jacobian is zero.");
+    }
+    catch ( Exception::NotImplemented & eni ) {
+      throw Exception::NotImplemented(">>> (Assembler::assemblePDEJacobian3): Jacobian not implemented.");
+    }
+  }
+  else {
+    throw Exception::NotImplemented(">>> (Assembler::assemblePDEJacobian3): Jacobian not implemented.");
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assemblePDEapplyJacobian1(ROL::Ptr<Tpetra::MultiVector<>> &Jv1,
+                                                const ROL::Ptr<PDE_type> &pde,
+                                                const ROL::Ptr<const Tpetra::MultiVector<>> &v,
+                                                const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                                const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                                const ROL::Ptr<const std::vector<Real>> & z_param) {
+//  #ifdef ROL_TIMERS
+//    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssemblePDEapplyJacobian1);
+//  #endif
+  try {
+    // Initialize residual vectors if not done so
+    if ( Jv1 == ROL::nullPtr ) // Unique components of residual vector
+      Jv1 = ROL::makePtr<Tpetra::MultiVector<>>(myUniqueResidualMap_, 1, true);
+    ROL::Ptr<Tpetra::MultiVector<>> overlapVec;
+    if ( store_ ) {
+      if ( pde_vecR_overlap_ == ROL::nullPtr ) // Overlapping components of residual vector
+        pde_vecR_overlap_ = ROL::makePtr<Tpetra::MultiVector<>>(myOverlapResidualMap_, 1, true);
+      overlapVec = pde_vecR_overlap_;
+    }
+    else {
+      overlapVec = ROL::makePtr<Tpetra::MultiVector<>>(myOverlapResidualMap_, 1, true);
+    }
+    // Get u_coeff from u and z_coeff from z
+    scalar_view v_coeff, u_coeff, z_coeff;
+    getCoeffFromStateVector(v_coeff,v);
+    getCoeffFromStateVector(u_coeff,u);
+    getCoeffFromControlVector(z_coeff,z);
+    // Assemble
+    scalar_view localVal;
+    pde->applyJacobian_1(localVal,v_coeff,u_coeff,z_coeff,z_param);
+    assembleFieldVector( Jv1, localVal, pde_vecR_overlap_, dofMgr1_ );
+  }
+  catch ( Exception::Zero & ez ) {
+    throw Exception::Zero(">>> (Assembler::assemblePDEapplyJacobian1): Jacobian is zero.");
+  }
+  catch ( Exception::NotImplemented & eni ) {
+    throw Exception::NotImplemented(">>> (Assembler::assemblePDEapplyJacobian1): Jacobian not implemented.");
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assemblePDEapplyAdjointJacobian1(ROL::Ptr<Tpetra::MultiVector<>> &Jv1,
+                                                       const ROL::Ptr<PDE_type> &pde,
+                                                       const ROL::Ptr<const Tpetra::MultiVector<>> &v,
+                                                       const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                                       const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                                       const ROL::Ptr<const std::vector<Real>> & z_param) {
+//  #ifdef ROL_TIMERS
+//    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssemblePDEapplyAdjointJacobian1);
+//  #endif
+  try {
+    // Initialize residual vectors if not done so
+    if ( Jv1 == ROL::nullPtr ) // Unique components of residual vector
+      Jv1 = ROL::makePtr<Tpetra::MultiVector<>>(myUniqueResidualMap_, 1, true);
+    ROL::Ptr<Tpetra::MultiVector<>> overlapVec;
+    if ( store_ ) {
+      if ( pde_vecR_overlap_ == ROL::nullPtr ) // Overlapping components of residual vector
+        pde_vecR_overlap_ = ROL::makePtr<Tpetra::MultiVector<>>(myOverlapResidualMap_, 1, true);
+      overlapVec = pde_vecR_overlap_;
+    }
+    else {
+      overlapVec = ROL::makePtr<Tpetra::MultiVector<>>(myOverlapResidualMap_, 1, true);
+    }
+    // Get u_coeff from u and z_coeff from z
+    scalar_view v_coeff, u_coeff, z_coeff;
+    getCoeffFromStateVector(v_coeff,v);
+    getCoeffFromStateVector(u_coeff,u);
+    getCoeffFromControlVector(z_coeff,z);
+    // Assemble
+    scalar_view localVal;
+    pde->applyAdjointJacobian_1(localVal,v_coeff,u_coeff,z_coeff,z_param);
+    assembleFieldVector( Jv1, localVal, overlapVec, dofMgr1_ );
+  }
+  catch ( Exception::Zero & ez ) {
+    throw Exception::Zero(">>> (Assembler::assemblePDEapplyAdjointJacobian1): Jacobian is zero.");
+  }
+  catch ( Exception::NotImplemented & eni ) {
+    throw Exception::NotImplemented(">>> (Assembler::assemblePDEapplyAdjointJacobian1): Jacobian not implemented.");
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assemblePDEapplyJacobian2(ROL::Ptr<Tpetra::MultiVector<>> &Jv2,
+                                                const ROL::Ptr<PDE_type> &pde,
+                                                const ROL::Ptr<const Tpetra::MultiVector<>> &v,
+                                                const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                                const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                                const ROL::Ptr<const std::vector<Real>> & z_param) {
+//  #ifdef ROL_TIMERS
+//    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssemblePDEapplyJacobian2);
+//  #endif
+  try {
+    // Initialize residual vectors if not done so
+    if ( Jv2 == ROL::nullPtr ) // Unique components of residual vector
+      Jv2 = ROL::makePtr<Tpetra::MultiVector<>>(myUniqueResidualMap_, 1, true);
+    ROL::Ptr<Tpetra::MultiVector<>> overlapVec;
+    if ( store_ ) {
+      if ( pde_vecR_overlap_ == ROL::nullPtr ) // Overlapping components of residual vector
+        pde_vecR_overlap_ = ROL::makePtr<Tpetra::MultiVector<>>(myOverlapResidualMap_, 1, true);
+      overlapVec = pde_vecR_overlap_;
+    }
+    else {
+      overlapVec = ROL::makePtr<Tpetra::MultiVector<>>(myOverlapResidualMap_, 1, true);
+    }
+    // Get u_coeff from u and z_coeff from z
+    scalar_view v_coeff, u_coeff, z_coeff;
+    getCoeffFromControlVector(v_coeff,v);
+    getCoeffFromStateVector(u_coeff,u);
+    getCoeffFromControlVector(z_coeff,z);
+    // Assemble
+    scalar_view localVal;
+    pde->applyJacobian_2(localVal,v_coeff,u_coeff,z_coeff,z_param);
+    assembleFieldVector( Jv2, localVal, overlapVec, dofMgr1_ );
+  }
+  catch ( Exception::Zero & ez ) {
+    throw Exception::Zero(">>> (Assembler::assemblePDEapplyJacobian2): Jacobian is zero.");
+  }
+  catch ( Exception::NotImplemented & eni ) {
+    throw Exception::NotImplemented(">>> (Assembler::assemblePDEapplyJacobian2): Jacobian not implemented.");
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assemblePDEapplyAdjointJacobian2(ROL::Ptr<Tpetra::MultiVector<>> &Jv2,
+                                                       const ROL::Ptr<PDE_type> &pde,
+                                                       const ROL::Ptr<const Tpetra::MultiVector<>> &v,
+                                                       const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                                       const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                                       const ROL::Ptr<const std::vector<Real>> & z_param) {
+//  #ifdef ROL_TIMERS
+//    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssemblePDEapplyAdjointJacobian2);
+//  #endif
+  try {
+    // Initialize residual vectors if not done so
+    if ( Jv2 == ROL::nullPtr ) // Unique components of residual vector
+      Jv2 = ROL::makePtr<Tpetra::MultiVector<>>(myUniqueControlMap_, 1, true);
+    ROL::Ptr<Tpetra::MultiVector<>> overlapVec;
+    if ( store_ ) {
+      if ( pde_vecJ2_overlap_ == ROL::nullPtr ) // Overlapping components of residual vector
+        pde_vecJ2_overlap_ = ROL::makePtr<Tpetra::MultiVector<>>(myOverlapControlMap_, 1, true);
+      overlapVec = pde_vecJ2_overlap_;
+    }
+    else {
+      overlapVec = ROL::makePtr<Tpetra::MultiVector<>>(myOverlapControlMap_, 1, true);
+    }
+    // Get u_coeff from u and z_coeff from z
+    scalar_view v_coeff, u_coeff, z_coeff;
+    getCoeffFromStateVector(v_coeff,v);
+    getCoeffFromStateVector(u_coeff,u);
+    getCoeffFromControlVector(z_coeff,z);
+    // Assemble
+    scalar_view localVal;
+    pde->applyAdjointJacobian_2(localVal,v_coeff,u_coeff,z_coeff,z_param);
+    assembleFieldVector( Jv2, localVal, overlapVec, dofMgr2_ );
+  }
+  catch ( Exception::Zero & ez ) {
+    throw Exception::Zero(">>> (Assembler::assemblePDEapplyAdjointJacobian2): Jacobian is zero.");
+  }
+  catch ( Exception::NotImplemented & eni ) {
+    throw Exception::NotImplemented(">>> (Assembler::assemblePDEapplyAdjointJacobian2): Jacobian not implemented.");
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assemblePDEHessian11(ROL::Ptr<Tpetra::CrsMatrix<>> &H11,
+                                     const ROL::Ptr<PDE_type> &pde,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                     const ROL::Ptr<const std::vector<Real>> & z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssemblePDEHessian11);
+  #endif
+  try {
+    // Get u_coeff from u and z_coeff from z
+    scalar_view u_coeff, z_coeff, l_coeff;
+    getCoeffFromStateVector(u_coeff,u);
+    getCoeffFromControlVector(z_coeff,z);
+    getCoeffFromStateVector(l_coeff,l);
+    // Initialize Hessian storage if not done so already
+    if ( H11 == ROL::nullPtr ) {
+      H11 = ROL::makePtr<Tpetra::CrsMatrix<>>(matH11Graph_);
+    }
+    // Assemble
+    scalar_view localVal;
+    pde->Hessian_11(localVal,l_coeff,u_coeff,z_coeff,z_param);
+    assembleFieldMatrix( H11, localVal, dofMgr1_, dofMgr1_ );
+  }
+  catch (Exception::Zero &ez) {
+    throw Exception::Zero(">>> (Assembler::assemblePDEHessian11): Hessian is zero.");
+  }
+  catch (Exception::NotImplemented &eni) {
+    throw Exception::NotImplemented(">>> (Assembler::assemblePDEHessian11): Hessian not implemented.");
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assemblePDEHessian12(ROL::Ptr<Tpetra::CrsMatrix<>> &H12,
+                                     const ROL::Ptr<PDE_type> &pde,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                     const ROL::Ptr<const std::vector<Real>> & z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssemblePDEHessian12);
+  #endif
+  try {
+    // Get u_coeff from u and z_coeff from z
+    scalar_view u_coeff, z_coeff, l_coeff;
+    getCoeffFromStateVector(u_coeff,u);
+    getCoeffFromControlVector(z_coeff,z);
+    getCoeffFromStateVector(l_coeff,l);
+    // Initialize Hessian storage if not done so already
+    if ( H12 == ROL::nullPtr ) {
+      H12 = ROL::makePtr<Tpetra::CrsMatrix<>>(matH12Graph_);
+    }
+    // Assemble
+    scalar_view localVal;
+    pde->Hessian_12(localVal,l_coeff,u_coeff,z_coeff,z_param);
+    assembleFieldMatrix( H12, localVal, dofMgr2_, dofMgr1_ );
+  }
+  catch (Exception::Zero &ez) {
+    throw Exception::Zero(">>> (Assembler::assemblePDEHessian12): Hessian is zero.");
+  }
+  catch (Exception::NotImplemented &eni) {
+    throw Exception::NotImplemented(">>> (Assembler::assemblePDEHessian12): Hessian not implemented.");
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assemblePDEHessian13(ROL::Ptr<Tpetra::MultiVector<>> &H13,
+                                     const ROL::Ptr<PDE_type> &pde,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                     const ROL::Ptr<const std::vector<Real>> & z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssemblePDEHessian13);
+  #endif
+  if ( z_param != ROL::nullPtr ) {
+    try {
+      int size = static_cast<int>(z_param->size());
+      // Get u_coeff from u, z_coeff from z and l_coeff from l
+      scalar_view u_coeff, z_coeff, l_coeff;
+      getCoeffFromStateVector(u_coeff,u);
+      getCoeffFromControlVector(z_coeff,z);
+      getCoeffFromStateVector(l_coeff,l);
+      // Initialize Jacobian storage if not done so already
+      if (H13 == ROL::nullPtr) {
+        H13 = ROL::makePtr<Tpetra::MultiVector<>>(myUniqueStateMap_, size, true);
+      }
+      if ( pde_vecH13_overlap_ == ROL::nullPtr) {
+        pde_vecH13_overlap_ = ROL::makePtr<Tpetra::MultiVector<>>(myOverlapStateMap_, size, true);
+      }
+      // Assemble
+      std::vector<scalar_view> localVal(size);
+      pde->Hessian_13(localVal,l_coeff,u_coeff,z_coeff,z_param);
+      assembleParamFieldMatrix( H13, localVal, pde_vecH13_overlap_, dofMgr1_ );
+    }
+    catch ( Exception::Zero & ez ) {
+      throw Exception::Zero(">>> (Assembler::assemblePDEHessian13): Hessian is zero.");
+    }
+    catch ( Exception::NotImplemented & eni ) {
+      throw Exception::NotImplemented(">>> (Assembler::assemblePDEHessian13): Hessian not implemented.");
+    }
+  }
+  else {
+    throw Exception::NotImplemented(">>> (Assembler::assemblePDEHessian13): Hessian not implemented.");
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assemblePDEHessian21(ROL::Ptr<Tpetra::CrsMatrix<>> &H21,
+                                     const ROL::Ptr<PDE_type> &pde,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                     const ROL::Ptr<const std::vector<Real>> & z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssemblePDEHessian21);
+  #endif
+  try {
+    // Get u_coeff from u and z_coeff from z
+    scalar_view u_coeff, z_coeff, l_coeff;
+    getCoeffFromStateVector(u_coeff,u);
+    getCoeffFromControlVector(z_coeff,z);
+    getCoeffFromStateVector(l_coeff,l);
+    // Initialize Hessian storage if not done so already
+    if ( H21 == ROL::nullPtr ) {
+      H21 = ROL::makePtr<Tpetra::CrsMatrix<>>(matH21Graph_);
+    }
+    // Assemble
+    scalar_view localVal;
+    pde->Hessian_21(localVal,l_coeff,u_coeff,z_coeff,z_param);
+    assembleFieldMatrix( H21, localVal, dofMgr1_, dofMgr2_ );
+  }
+  catch (Exception::Zero &ez) {
+    throw Exception::Zero(">>> (Assembler::assemblePDEHessian21): Hessian is zero.");
+  }
+  catch (Exception::NotImplemented &eni) {
+    throw Exception::NotImplemented(">>> (Assembler::assemblePDEHessian21): Hessian not implemented.");
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assemblePDEHessian22(ROL::Ptr<Tpetra::CrsMatrix<>> &H22,
+                                     const ROL::Ptr<PDE_type> &pde,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                     const ROL::Ptr<const std::vector<Real>> & z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssemblePDEHessian22);
+  #endif
+  try {
+    // Get u_coeff from u and z_coeff from z
+    scalar_view u_coeff, z_coeff, l_coeff;
+    getCoeffFromStateVector(u_coeff,u);
+    getCoeffFromControlVector(z_coeff,z);
+    getCoeffFromStateVector(l_coeff,l);
+    // Initialize Hessian storage if not done so already
+    if ( H22 == ROL::nullPtr ) {
+      H22 = ROL::makePtr<Tpetra::CrsMatrix<>>(matH22Graph_);
+    }
+    // Assemble
+    scalar_view localVal;
+    pde->Hessian_22(localVal,l_coeff,u_coeff,z_coeff,z_param);
+    assembleFieldMatrix( H22, localVal, dofMgr2_, dofMgr2_ );
+  }
+  catch (Exception::Zero &ez) {
+    throw Exception::Zero(">>> (Assembler::assemblePDEHessian22): Hessian is zero.");
+  }
+  catch (Exception::NotImplemented &eni) {
+    throw Exception::NotImplemented(">>> (Assembler::assemblePDEHessian22): Hessian not implemented.");
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assemblePDEHessian23(ROL::Ptr<Tpetra::MultiVector<>> &H23,
+                                     const ROL::Ptr<PDE_type> &pde,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                     const ROL::Ptr<const std::vector<Real>> & z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssemblePDEHessian23);
+  #endif
+  if ( z_param != ROL::nullPtr ) {
+    try {
+      int size = static_cast<int>(z_param->size());
+      // Get u_coeff from u, z_coeff from z and l_coeff from l
+      scalar_view u_coeff, z_coeff, l_coeff;
+      getCoeffFromStateVector(u_coeff,u);
+      getCoeffFromControlVector(z_coeff,z);
+      getCoeffFromStateVector(l_coeff,l);
+      // Initialize Jacobian storage if not done so already
+      if (H23 == ROL::nullPtr) {
+        H23 = ROL::makePtr<Tpetra::MultiVector<>>(myUniqueControlMap_, size, true);
+      }
+      if ( pde_vecH23_overlap_ == ROL::nullPtr) {
+        pde_vecH23_overlap_ = ROL::makePtr<Tpetra::MultiVector<>>(myOverlapControlMap_, size, true);
+      }
+      // Assemble
+      std::vector<scalar_view> localVal(size);
+      pde->Hessian_23(localVal,l_coeff,u_coeff,z_coeff,z_param);
+      assembleParamFieldMatrix( H23, localVal, pde_vecH23_overlap_, dofMgr2_ );
+    }
+    catch ( Exception::Zero & ez ) {
+      throw Exception::Zero(">>> (Assembler::assemblePDEHessian23): Hessian is zero.");
+    }
+    catch ( Exception::NotImplemented & eni ) {
+      throw Exception::NotImplemented(">>> (Assembler::assemblePDEHessian23): Hessian not implemented.");
+    }
+  }
+  else {
+    throw Exception::NotImplemented(">>> (Assembler::assemblePDEHessian23): Hessian not implemented.");
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assemblePDEHessian31(ROL::Ptr<Tpetra::MultiVector<>> &H31,
+                                     const ROL::Ptr<PDE_type> &pde,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                     const ROL::Ptr<const std::vector<Real>> & z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssemblePDEHessian31);
+  #endif
+  assemblePDEHessian13(H31,pde,l,u,z,z_param);
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assemblePDEHessian32(ROL::Ptr<Tpetra::MultiVector<>> &H32,
+                                     const ROL::Ptr<PDE_type> &pde,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                     const ROL::Ptr<const std::vector<Real>> & z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssemblePDEHessian32);
+  #endif
+  assemblePDEHessian23(H32,pde,l,u,z,z_param);
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assemblePDEHessian33(ROL::Ptr<std::vector<std::vector<Real>>> &H33,
+                                     const ROL::Ptr<PDE_type> &pde,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                     const ROL::Ptr<const std::vector<Real>> & z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssemblePDEHessian33);
+  #endif
+  if ( z_param != ROL::nullPtr ) {
+    try {
+      int size = static_cast<int>(z_param->size());
+      // Get u_coeff from u, z_coeff from z and l_coeff from l
+      scalar_view u_coeff, z_coeff, l_coeff;
+      getCoeffFromStateVector(u_coeff,u);
+      getCoeffFromControlVector(z_coeff,z);
+      getCoeffFromStateVector(l_coeff,l);
+      // Initialize Jacobian storage if not done so already
+      if (H33 == ROL::nullPtr) {
+        std::vector<Real> col(size,static_cast<Real>(0));
+        H33 = ROL::makePtr<std::vector<std::vector<Real>>>(size,col);
+      }
+      // Assemble
+      std::vector<std::vector<scalar_view>> localVal(size);
+      for (int i=0; i< size; ++i) {
+        localVal[i].resize(size);
+      }
+      pde->Hessian_33(localVal,l_coeff,u_coeff,z_coeff,z_param);
+      assembleParamMatrix( H33, localVal, dofMgr1_ );
+    }
+    catch ( Exception::Zero & ez ) {
+      throw Exception::Zero(">>> (Assembler::assemblePDEHessian33): Hessian is zero.");
+    }
+    catch ( Exception::NotImplemented & eni ) {
+      throw Exception::NotImplemented(">>> (Assembler::assemblePDEHessian33): Hessian not implemented.");
+    }
+  }
+  else {
+    throw Exception::NotImplemented(">>> (Assembler::assemblePDEHessian33): Hessian not implemented.");
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assemblePDEapplyHessian11(ROL::Ptr<Tpetra::MultiVector<>> &Hv,
+                                                const ROL::Ptr<PDE_type> &pde,
+                                                const ROL::Ptr<const Tpetra::MultiVector<>> &v,
+                                                const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                                const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                                const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                                const ROL::Ptr<const std::vector<Real>> & z_param) {
+//  #ifdef ROL_TIMERS
+//    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssemblePDEapplyJacobian2);
+//  #endif
+  try {
+    // Initialize residual vectors if not done so
+    if ( Hv == ROL::nullPtr ) // Unique components of residual vector
+      Hv = ROL::makePtr<Tpetra::MultiVector<>>(myUniqueResidualMap_, 1, true);
+    ROL::Ptr<Tpetra::MultiVector<>> overlapVec;
+    if ( store_ ) {
+      if ( pde_vecR_overlap_ == ROL::nullPtr ) // Overlapping components of residual vector
+        pde_vecR_overlap_ = ROL::makePtr<Tpetra::MultiVector<>>(myOverlapResidualMap_, 1, true);
+      overlapVec = pde_vecR_overlap_;
+    }
+    else {
+      overlapVec = ROL::makePtr<Tpetra::MultiVector<>>(myOverlapResidualMap_, 1, true);
+    }
+    // Get u_coeff from u and z_coeff from z
+    scalar_view v_coeff, l_coeff, u_coeff, z_coeff;
+    getCoeffFromStateVector(v_coeff,v);
+    getCoeffFromStateVector(l_coeff,l);
+    getCoeffFromStateVector(u_coeff,u);
+    getCoeffFromControlVector(z_coeff,z);
+    // Assemble
+    scalar_view localVal;
+    pde->applyHessian_11(localVal,v_coeff,l_coeff,u_coeff,z_coeff,z_param);
+    assembleFieldVector( Hv, localVal, overlapVec, dofMgr1_ );
+  }
+  catch ( Exception::Zero & ez ) {
+    throw Exception::Zero(">>> (Assembler::assemblePDEapplyHessian11): Hessian is zero.");
+  }
+  catch ( Exception::NotImplemented & eni ) {
+    throw Exception::NotImplemented(">>> (Assembler::assemblePDEapplyHessian11): Hessian not implemented.");
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assemblePDEapplyHessian12(ROL::Ptr<Tpetra::MultiVector<>> &Hv,
+                                                const ROL::Ptr<PDE_type> &pde,
+                                                const ROL::Ptr<const Tpetra::MultiVector<>> &v,
+                                                const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                                const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                                const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                                const ROL::Ptr<const std::vector<Real>> & z_param) {
+//  #ifdef ROL_TIMERS
+//    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssemblePDEapplyJacobian2);
+//  #endif
+  try {
+    // Initialize residual vectors if not done so
+    if ( Hv == ROL::nullPtr ) // Unique components of residual vector
+      Hv = ROL::makePtr<Tpetra::MultiVector<>>(myUniqueControlMap_, 1, true);
+    ROL::Ptr<Tpetra::MultiVector<>> overlapVec;
+    if ( store_ ) {
+      if ( pde_vecJ2_overlap_ == ROL::nullPtr ) // Overlapping components of residual vector
+        pde_vecJ2_overlap_ = ROL::makePtr<Tpetra::MultiVector<>>(myOverlapControlMap_, 1, true);
+      overlapVec = pde_vecJ2_overlap_;
+    }
+    else {
+      overlapVec = ROL::makePtr<Tpetra::MultiVector<>>(myOverlapControlMap_, 1, true);
+    }
+    // Get u_coeff from u and z_coeff from z
+    scalar_view v_coeff, l_coeff, u_coeff, z_coeff;
+    getCoeffFromStateVector(v_coeff,v);
+    getCoeffFromStateVector(l_coeff,l);
+    getCoeffFromStateVector(u_coeff,u);
+    getCoeffFromControlVector(z_coeff,z);
+    // Assemble
+    scalar_view localVal;
+    pde->applyHessian_12(localVal,v_coeff,l_coeff,u_coeff,z_coeff,z_param);
+    assembleFieldVector( Hv, localVal, overlapVec, dofMgr2_ );
+  }
+  catch ( Exception::Zero & ez ) {
+    throw Exception::Zero(">>> (Assembler::assemblePDEapplyHessian12): Hessian is zero.");
+  }
+  catch ( Exception::NotImplemented & eni ) {
+    throw Exception::NotImplemented(">>> (Assembler::assemblePDEapplyHessian12): Hessian not implemented.");
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assemblePDEapplyHessian21(ROL::Ptr<Tpetra::MultiVector<>> &Hv,
+                                                const ROL::Ptr<PDE_type> &pde,
+                                                const ROL::Ptr<const Tpetra::MultiVector<>> &v,
+                                                const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                                const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                                const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                                const ROL::Ptr<const std::vector<Real>> & z_param) {
+//  #ifdef ROL_TIMERS
+//    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssemblePDEapplyJacobian2);
+//  #endif
+  try {
+    // Initialize residual vectors if not done so
+    if ( Hv == ROL::nullPtr ) // Unique components of residual vector
+      Hv = ROL::makePtr<Tpetra::MultiVector<>>(myUniqueResidualMap_, 1, true);
+    ROL::Ptr<Tpetra::MultiVector<>> overlapVec;
+    if ( store_ ) {
+      if ( pde_vecR_overlap_ == ROL::nullPtr ) // Overlapping components of residual vector
+        pde_vecR_overlap_ = ROL::makePtr<Tpetra::MultiVector<>>(myOverlapResidualMap_, 1, true);
+      overlapVec = pde_vecR_overlap_;
+    }
+    else {
+      overlapVec = ROL::makePtr<Tpetra::MultiVector<>>(myOverlapResidualMap_, 1, true);
+    }
+    // Get u_coeff from u and z_coeff from z
+    scalar_view v_coeff, l_coeff, u_coeff, z_coeff;
+    getCoeffFromControlVector(v_coeff,v);
+    getCoeffFromStateVector(l_coeff,l);
+    getCoeffFromStateVector(u_coeff,u);
+    getCoeffFromControlVector(z_coeff,z);
+    // Assemble
+    scalar_view localVal;
+    pde->applyHessian_21(localVal,v_coeff,l_coeff,u_coeff,z_coeff,z_param);
+    assembleFieldVector( Hv, localVal, overlapVec, dofMgr2_ );
+  }
+  catch ( Exception::Zero & ez ) {
+    throw Exception::Zero(">>> (Assembler::assemblePDEapplyHessian21): Hessian is zero.");
+  }
+  catch ( Exception::NotImplemented & eni ) {
+    throw Exception::NotImplemented(">>> (Assembler::assemblePDEapplyHessian21): Hessian not implemented.");
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assemblePDEapplyHessian22(ROL::Ptr<Tpetra::MultiVector<>> &Hv,
+                                                const ROL::Ptr<PDE_type> &pde,
+                                                const ROL::Ptr<const Tpetra::MultiVector<>> &v,
+                                                const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                                const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                                const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                                const ROL::Ptr<const std::vector<Real>> & z_param) {
+//  #ifdef ROL_TIMERS
+//    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssemblePDEapplyJacobian2);
+//  #endif
+  try {
+    // Initialize residual vectors if not done so
+    if ( Hv == ROL::nullPtr ) // Unique components of residual vector
+      Hv = ROL::makePtr<Tpetra::MultiVector<>>(myUniqueControlMap_, 1, true);
+    ROL::Ptr<Tpetra::MultiVector<>> overlapVec;
+    if ( store_ ) {
+      if ( pde_vecJ2_overlap_ == ROL::nullPtr ) // Overlapping components of residual vector
+        pde_vecJ2_overlap_ = ROL::makePtr<Tpetra::MultiVector<>>(myOverlapControlMap_, 1, true);
+      overlapVec = pde_vecJ2_overlap_;
+    }
+    else {
+      overlapVec = ROL::makePtr<Tpetra::MultiVector<>>(myOverlapControlMap_, 1, true);
+    }
+    // Get u_coeff from u and z_coeff from z
+    scalar_view v_coeff, l_coeff, u_coeff, z_coeff;
+    getCoeffFromControlVector(v_coeff,v);
+    getCoeffFromStateVector(l_coeff,l);
+    getCoeffFromStateVector(u_coeff,u);
+    getCoeffFromControlVector(z_coeff,z);
+    // Assemble
+    scalar_view localVal;
+    pde->applyHessian_22(localVal,v_coeff,l_coeff,u_coeff,z_coeff,z_param);
+    assembleFieldVector( Hv, localVal, overlapVec, dofMgr2_ );
+  }
+  catch ( Exception::Zero & ez ) {
+    throw Exception::Zero(">>> (Assembler::assemblePDEapplyHessian22): Hessian is zero.");
+  }
+  catch ( Exception::NotImplemented & eni ) {
+    throw Exception::NotImplemented(">>> (Assembler::assemblePDEapplyHessian22): Hessian not implemented.");
+  }
+}
+/***************************************************************************/
+/* End of PDE assembly routines.                                           */
+/***************************************************************************/
+
+/***************************************************************************/
+/* DynamicPDE assembly routines                                            */
+/***************************************************************************/
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assembleDynPDEResidual(ROL::Ptr<Tpetra::MultiVector<>> &r,
+                                       const ROL::Ptr<DynPDE_type> &pde,
+                                       const ROL::TimeStamp<Real> &ts,
+                                       const ROL::Ptr<const Tpetra::MultiVector<>> &uo,
+                                       const ROL::Ptr<const Tpetra::MultiVector<>> &un,
+                                       const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                       const ROL::Ptr<const std::vector<Real>> &z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssembleDynPDEResidual);
+  #endif
+  // Initialize residual vectors if not done so
+  if ( r == ROL::nullPtr ) { // Unique components of residual vector
+    r = ROL::makePtr<Tpetra::MultiVector<>>(myUniqueResidualMap_, 1, true);
+  }
+  if ( pde_vecR_overlap_ == ROL::nullPtr ) { // Overlapping components of residual vector
+    pde_vecR_overlap_ = ROL::makePtr<Tpetra::MultiVector<>>(myOverlapResidualMap_, 1, true);
+  }
+  // Get u_coeff from u and z_coeff from z
+  scalar_view uo_coeff, un_coeff, z_coeff;
+  getCoeffFromStateVector(uo_coeff,uo);
+  getCoeffFromStateVector(un_coeff,un);
+  getCoeffFromControlVector(z_coeff,z);
+  // Assemble
+  scalar_view localVal;
+  pde->residual(localVal,ts,uo_coeff,un_coeff,z_coeff,z_param);
+  assembleFieldVector( r, localVal, pde_vecR_overlap_, dofMgr1_ );
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assembleDynPDEJacobian_uo(ROL::Ptr<Tpetra::CrsMatrix<>> &J,
+                                          const ROL::Ptr<DynPDE_type> &pde,
+                                          const ROL::TimeStamp<Real> &ts,
+                                          const ROL::Ptr<const Tpetra::MultiVector<>> &uo,
+                                          const ROL::Ptr<const Tpetra::MultiVector<>> &un,
+                                          const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                          const ROL::Ptr<const std::vector<Real>> &z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssembleDynPDEJacobian_uo);
+  #endif
+  try {
+    // Get u_coeff from u and z_coeff from z
+    scalar_view uo_coeff, un_coeff, z_coeff;
+    getCoeffFromStateVector(uo_coeff,uo);
+    getCoeffFromStateVector(un_coeff,un);
+    getCoeffFromControlVector(z_coeff,z);
+    // Initialize Jacobian matrices
+    if ( J == ROL::nullPtr ) {
+      J = ROL::makePtr<Tpetra::CrsMatrix<>>(matJ1Graph_);
+    }
+    // Assemble
+    scalar_view localVal;
+    pde->Jacobian_uo(localVal,ts,uo_coeff,un_coeff,z_coeff,z_param);
+    assembleFieldMatrix( J, localVal, dofMgr1_, dofMgr1_ );
+    isJuoTransposed_ = false;
+  }
+  catch ( Exception::Zero & ez ) {
+    throw Exception::Zero(">>> (Assembler::assembleDynPDEJacobian_uo): Jacobian is zero.");
+  }
+  catch ( Exception::NotImplemented & eni ) {
+    throw Exception::NotImplemented(">>> (Assembler::assembleDynPDEJacobian_uo): Jacobian not implemented.");
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assembleDynPDEJacobian_un(ROL::Ptr<Tpetra::CrsMatrix<>> &J,
+                                          const ROL::Ptr<DynPDE_type> &pde,
+                                          const ROL::TimeStamp<Real> &ts,
+                                          const ROL::Ptr<const Tpetra::MultiVector<>> &uo,
+                                          const ROL::Ptr<const Tpetra::MultiVector<>> &un,
+                                          const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                          const ROL::Ptr<const std::vector<Real>> &z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssembleDynPDEJacobian_un);
+  #endif
+  try {
+    // Get u_coeff from u and z_coeff from z
+    scalar_view uo_coeff, un_coeff, z_coeff;
+    getCoeffFromStateVector(uo_coeff,uo);
+    getCoeffFromStateVector(un_coeff,un);
+    getCoeffFromControlVector(z_coeff,z);
+    // Initialize Jacobian matrices
+    if ( J == ROL::nullPtr ) {
+      J = ROL::makePtr<Tpetra::CrsMatrix<>>(matJ1Graph_);
+    }
+    // Assemble
+    scalar_view localVal;
+    pde->Jacobian_un(localVal,ts,uo_coeff,un_coeff,z_coeff,z_param);
+    assembleFieldMatrix( J, localVal, dofMgr1_, dofMgr1_ );
+    isJunTransposed_ = false;
+  }
+  catch ( Exception::Zero & ez ) {
+    throw Exception::Zero(">>> (Assembler::assembleDynPDEJacobian_un): Jacobian is zero.");
+  }
+  catch ( Exception::NotImplemented & eni ) {
+    throw Exception::NotImplemented(">>> (Assembler::assembleDynPDEJacobian_un): Jacobian not implemented.");
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assembleDynPDEJacobian_zf(ROL::Ptr<Tpetra::CrsMatrix<>> &J,
+                                          const ROL::Ptr<DynPDE_type> &pde,
+                                          const ROL::TimeStamp<Real> &ts,
+                                          const ROL::Ptr<const Tpetra::MultiVector<>> &uo,
+                                          const ROL::Ptr<const Tpetra::MultiVector<>> &un,
+                                          const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                          const ROL::Ptr<const std::vector<Real>> &z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssembleDynPDEJacobian_zf);
+  #endif
+  try {
+    // Get u_coeff from u and z_coeff from z
+    scalar_view uo_coeff, un_coeff, z_coeff;
+    getCoeffFromStateVector(uo_coeff,uo);
+    getCoeffFromStateVector(un_coeff,un);
+    getCoeffFromControlVector(z_coeff,z);
+    // Initialize Jacobian matrices
+    if ( J == ROL::nullPtr ) {
+      J = ROL::makePtr<Tpetra::CrsMatrix<>>(matJ2Graph_);
+    }
+    // Assemble
+    scalar_view localVal;
+    pde->Jacobian_zf(localVal,ts,uo_coeff,un_coeff,z_coeff,z_param);
+    assembleFieldMatrix( J, localVal, dofMgr1_, dofMgr2_ );
+    isJzfTransposed_ = false;
+  }
+  catch ( Exception::Zero & ez ) {
+    throw Exception::Zero(">>> (Assembler::assembleDynPDEJacobian_zf): Jacobian is zero.");
+  }
+  catch ( Exception::NotImplemented & eni ) {
+    throw Exception::NotImplemented(">>> (Assembler::assembleDynPDEJacobian_zf): Jacobian not implemented.");
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assembleDynPDEJacobian_zp(ROL::Ptr<Tpetra::MultiVector<>> &J,
+                                          const ROL::Ptr<DynPDE_type> &pde,
+                                          const ROL::TimeStamp<Real> &ts,
+                                          const ROL::Ptr<const Tpetra::MultiVector<>> &uo,
+                                          const ROL::Ptr<const Tpetra::MultiVector<>> &un,
+                                          const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                          const ROL::Ptr<const std::vector<Real>> & z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssembleDynPDEJacobian_zp);
+  #endif
+  if ( z_param != ROL::nullPtr ) {
+    try {
+      int size = static_cast<int>(z_param->size());
+      // Get u_coeff from u and z_coeff from z
+      scalar_view uo_coeff, un_coeff, z_coeff;
+      getCoeffFromStateVector(uo_coeff,uo);
+      getCoeffFromStateVector(un_coeff,un);
+      getCoeffFromControlVector(z_coeff,z);
+      // Initialize Jacobian storage if not done so already
+      if (J == ROL::nullPtr) {
+        J = ROL::makePtr<Tpetra::MultiVector<>>(myUniqueResidualMap_, size, true);
+      }
+      if ( pde_vecJ3_overlap_ == ROL::nullPtr) {
+        pde_vecJ3_overlap_ = ROL::makePtr<Tpetra::MultiVector<>>(myOverlapResidualMap_, size, true);
+      }
+      // Assemble
+      std::vector<scalar_view> localVal(size);
+      pde->Jacobian_zp(localVal,ts,uo_coeff,un_coeff,z_coeff,z_param);
+      assembleParamFieldMatrix( J, localVal, pde_vecJ3_overlap_, dofMgr1_ );
+    }
+    catch ( Exception::Zero & ez ) {
+      throw Exception::Zero(">>> (Assembler::assembleDynPDEJacobian_zp): Jacobian is zero.");
+    }
+    catch ( Exception::NotImplemented & eni ) {
+      throw Exception::NotImplemented(">>> (Assembler::assembleDynPDEJacobian_zp): Jacobian not implemented.");
+    }
+  }
+  else {
+    throw Exception::NotImplemented(">>> (Assembler::assembleDynPDEJacobian_zp): Jacobian not implemented.");
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assembleDynPDEHessian_uo_uo(ROL::Ptr<Tpetra::CrsMatrix<>> &H,
+                                            const ROL::Ptr<DynPDE_type> &pde,
+                                            const ROL::TimeStamp<Real> &ts,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &uo,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &un,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                            const ROL::Ptr<const std::vector<Real>> & z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssembleDynPDEHessian_uo_uo);
+  #endif
+  try {
+    // Get u_coeff from u and z_coeff from z
+    scalar_view uo_coeff, un_coeff, z_coeff, l_coeff;
+    getCoeffFromStateVector(uo_coeff,uo);
+    getCoeffFromStateVector(un_coeff,un);
+    getCoeffFromControlVector(z_coeff,z);
+    getCoeffFromStateVector(l_coeff,l);
+    // Initialize Hessian storage if not done so already
+    if ( H == ROL::nullPtr ) {
+      H = ROL::makePtr<Tpetra::CrsMatrix<>>(matH11Graph_);
+    }
+    // Assemble
+    scalar_view localVal;
+    pde->Hessian_uo_uo(localVal,ts,l_coeff,uo_coeff,un_coeff,z_coeff,z_param);
+    assembleFieldMatrix( H, localVal, dofMgr1_, dofMgr1_ );
+  }
+  catch (Exception::Zero &ez) {
+    throw Exception::Zero(">>> (Assembler::assembleDynPDEHessian_uo_uo): Hessian is zero.");
+  }
+  catch (Exception::NotImplemented &eni) {
+    throw Exception::NotImplemented(">>> (Assembler::assembleDynPDEHessian_uo_uo): Hessian not implemented.");
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assembleDynPDEHessian_uo_un(ROL::Ptr<Tpetra::CrsMatrix<>> &H,
+                                            const ROL::Ptr<DynPDE_type> &pde,
+                                            const ROL::TimeStamp<Real> &ts,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &uo,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &un,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                            const ROL::Ptr<const std::vector<Real>> & z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssembleDynPDEHessian_uo_un);
+  #endif
+  try {
+    // Get u_coeff from u and z_coeff from z
+    scalar_view uo_coeff, un_coeff, z_coeff, l_coeff;
+    getCoeffFromStateVector(uo_coeff,uo);
+    getCoeffFromStateVector(un_coeff,un);
+    getCoeffFromControlVector(z_coeff,z);
+    getCoeffFromStateVector(l_coeff,l);
+    // Initialize Hessian storage if not done so already
+    if ( H == ROL::nullPtr ) {
+      H = ROL::makePtr<Tpetra::CrsMatrix<>>(matH11Graph_);
+    }
+    // Assemble
+    scalar_view localVal;
+    pde->Hessian_uo_un(localVal,ts,l_coeff,uo_coeff,un_coeff,z_coeff,z_param);
+    assembleFieldMatrix( H, localVal, dofMgr1_, dofMgr1_ );
+  }
+  catch (Exception::Zero &ez) {
+    throw Exception::Zero(">>> (Assembler::assembleDynPDEHessian_uo_un): Hessian is zero.");
+  }
+  catch (Exception::NotImplemented &eni) {
+    throw Exception::NotImplemented(">>> (Assembler::assembleDynPDEHessian_uo_un): Hessian not implemented.");
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assembleDynPDEHessian_uo_zf(ROL::Ptr<Tpetra::CrsMatrix<>> &H,
+                                            const ROL::Ptr<DynPDE_type> &pde,
+                                            const ROL::TimeStamp<Real> &ts,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &uo,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &un,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                            const ROL::Ptr<const std::vector<Real>> & z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssembleDynPDEHessian_uo_zf);
+  #endif
+  try {
+    // Get u_coeff from u and z_coeff from z
+    scalar_view uo_coeff, un_coeff, z_coeff, l_coeff;
+    getCoeffFromStateVector(uo_coeff,uo);
+    getCoeffFromStateVector(un_coeff,un);
+    getCoeffFromControlVector(z_coeff,z);
+    getCoeffFromStateVector(l_coeff,l);
+    // Initialize Hessian storage if not done so already
+    if ( H == ROL::nullPtr ) {
+      H = ROL::makePtr<Tpetra::CrsMatrix<>>(matH12Graph_);
+    }
+    // Assemble
+    scalar_view localVal;
+    pde->Hessian_uo_zf(localVal,ts,l_coeff,uo_coeff,un_coeff,z_coeff,z_param);
+    assembleFieldMatrix( H, localVal, dofMgr1_, dofMgr2_ );
+  }
+  catch (Exception::Zero &ez) {
+    throw Exception::Zero(">>> (Assembler::assembleDynPDEHessian_uo_zf): Hessian is zero.");
+  }
+  catch (Exception::NotImplemented &eni) {
+    throw Exception::NotImplemented(">>> (Assembler::assembleDynPDEHessian_uo_zf): Hessian not implemented.");
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assembleDynPDEHessian_uo_zp(ROL::Ptr<Tpetra::MultiVector<>> &H,
+                                            const ROL::Ptr<DynPDE_type> &pde,
+                                            const ROL::TimeStamp<Real> &ts,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &uo,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &un,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                            const ROL::Ptr<const std::vector<Real>> & z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssembleDynPDEHessian_uo_zp);
+  #endif
+  if ( z_param != ROL::nullPtr ) {
+    try {
+      int size = static_cast<int>(z_param->size());
+      // Get u_coeff from u, z_coeff from z and l_coeff from l
+      scalar_view uo_coeff, un_coeff, z_coeff, l_coeff;
+      getCoeffFromStateVector(uo_coeff,uo);
+      getCoeffFromStateVector(un_coeff,un);
+      getCoeffFromControlVector(z_coeff,z);
+      getCoeffFromStateVector(l_coeff,l);
+      // Initialize Jacobian storage if not done so already
+      if (H == ROL::nullPtr) {
+        H = ROL::makePtr<Tpetra::MultiVector<>>(myUniqueStateMap_, size, true);
+      }
+      if ( pde_vecH13_overlap_ == ROL::nullPtr) {
+        pde_vecH13_overlap_ = ROL::makePtr<Tpetra::MultiVector<>>(myOverlapStateMap_, size, true);
+      }
+      // Assemble
+      std::vector<scalar_view> localVal(size);
+      pde->Hessian_uo_zp(localVal,ts,l_coeff,uo_coeff,un_coeff,z_coeff,z_param);
+      assembleParamFieldMatrix( H, localVal, pde_vecH13_overlap_, dofMgr1_ );
+    }
+    catch ( Exception::Zero & ez ) {
+      throw Exception::Zero(">>> (Assembler::assembleDynPDEHessian_uo_zp): Hessian is zero.");
+    }
+    catch ( Exception::NotImplemented & eni ) {
+      throw Exception::NotImplemented(">>> (Assembler::assembleDynPDEHessian_uo_zp): Hessian not implemented.");
+    }
+  }
+  else {
+    throw Exception::NotImplemented(">>> (Assembler::assembleDynPDEHessian_uo_zp): Hessian not implemented.");
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assembleDynPDEHessian_un_uo(ROL::Ptr<Tpetra::CrsMatrix<>> &H,
+                                            const ROL::Ptr<DynPDE_type> &pde,
+                                            const ROL::TimeStamp<Real> &ts,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &uo,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &un,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                            const ROL::Ptr<const std::vector<Real>> & z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssembleDynPDEHessian_un_uo);
+  #endif
+  try {
+    // Get u_coeff from u and z_coeff from z
+    scalar_view uo_coeff, un_coeff, z_coeff, l_coeff;
+    getCoeffFromStateVector(uo_coeff,uo);
+    getCoeffFromStateVector(un_coeff,un);
+    getCoeffFromControlVector(z_coeff,z);
+    getCoeffFromStateVector(l_coeff,l);
+    // Initialize Hessian storage if not done so already
+    if ( H == ROL::nullPtr ) {
+      H = ROL::makePtr<Tpetra::CrsMatrix<>>(matH11Graph_);
+    }
+    // Assemble
+    scalar_view localVal;
+    pde->Hessian_un_uo(localVal,ts,l_coeff,uo_coeff,un_coeff,z_coeff,z_param);
+    assembleFieldMatrix( H, localVal, dofMgr1_, dofMgr1_ );
+  }
+  catch (Exception::Zero &ez) {
+    throw Exception::Zero(">>> (Assembler::assembleDynPDEHessian_un_uo): Hessian is zero.");
+  }
+  catch (Exception::NotImplemented &eni) {
+    throw Exception::NotImplemented(">>> (Assembler::assembleDynPDEHessian_un_uo): Hessian not implemented.");
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assembleDynPDEHessian_un_un(ROL::Ptr<Tpetra::CrsMatrix<>> &H,
+                                            const ROL::Ptr<DynPDE_type> &pde,
+                                            const ROL::TimeStamp<Real> &ts,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &uo,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &un,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                            const ROL::Ptr<const std::vector<Real>> & z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssembleDynPDEHessian_un_un);
+  #endif
+  try {
+    // Get u_coeff from u and z_coeff from z
+    scalar_view uo_coeff, un_coeff, z_coeff, l_coeff;
+    getCoeffFromStateVector(uo_coeff,uo);
+    getCoeffFromStateVector(un_coeff,un);
+    getCoeffFromControlVector(z_coeff,z);
+    getCoeffFromStateVector(l_coeff,l);
+    // Initialize Hessian storage if not done so already
+    if ( H == ROL::nullPtr ) {
+      H = ROL::makePtr<Tpetra::CrsMatrix<>>(matH11Graph_);
+    }
+    // Assemble
+    scalar_view localVal;
+    pde->Hessian_un_un(localVal,ts,l_coeff,uo_coeff,un_coeff,z_coeff,z_param);
+    assembleFieldMatrix( H, localVal, dofMgr1_, dofMgr1_ );
+  }
+  catch (Exception::Zero &ez) {
+    throw Exception::Zero(">>> (Assembler::assembleDynPDEHessian_un_un): Hessian is zero.");
+  }
+  catch (Exception::NotImplemented &eni) {
+    throw Exception::NotImplemented(">>> (Assembler::assembleDynPDEHessian_un_un): Hessian not implemented.");
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assembleDynPDEHessian_un_zf(ROL::Ptr<Tpetra::CrsMatrix<>> &H,
+                                            const ROL::Ptr<DynPDE_type> &pde,
+                                            const ROL::TimeStamp<Real> &ts,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &uo,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &un,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                            const ROL::Ptr<const std::vector<Real>> & z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssembleDynPDEHessian_un_zf);
+  #endif
+  try {
+    // Get u_coeff from u and z_coeff from z
+    scalar_view uo_coeff, un_coeff, z_coeff, l_coeff;
+    getCoeffFromStateVector(uo_coeff,uo);
+    getCoeffFromStateVector(un_coeff,un);
+    getCoeffFromControlVector(z_coeff,z);
+    getCoeffFromStateVector(l_coeff,l);
+    // Initialize Hessian storage if not done so already
+    if ( H == ROL::nullPtr ) {
+      H = ROL::makePtr<Tpetra::CrsMatrix<>>(matH12Graph_);
+    }
+    // Assemble
+    scalar_view localVal;
+    pde->Hessian_un_zf(localVal,ts,l_coeff,uo_coeff,un_coeff,z_coeff,z_param);
+    assembleFieldMatrix( H, localVal, dofMgr1_, dofMgr2_ );
+  }
+  catch (Exception::Zero &ez) {
+    throw Exception::Zero(">>> (Assembler::assembleDynPDEHessian_un_zf): Hessian is zero.");
+  }
+  catch (Exception::NotImplemented &eni) {
+    throw Exception::NotImplemented(">>> (Assembler::assembleDynPDEHessian_un_zf): Hessian not implemented.");
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assembleDynPDEHessian_un_zp(ROL::Ptr<Tpetra::MultiVector<>> &H,
+                                            const ROL::Ptr<DynPDE_type> &pde,
+                                            const ROL::TimeStamp<Real> &ts,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &uo,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &un,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                            const ROL::Ptr<const std::vector<Real>> & z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssembleDynPDEHessian_un_zp);
+  #endif
+  if ( z_param != ROL::nullPtr ) {
+    try {
+      int size = static_cast<int>(z_param->size());
+      // Get u_coeff from u, z_coeff from z and l_coeff from l
+      scalar_view uo_coeff, un_coeff, z_coeff, l_coeff;
+      getCoeffFromStateVector(uo_coeff,uo);
+      getCoeffFromStateVector(un_coeff,un);
+      getCoeffFromControlVector(z_coeff,z);
+      getCoeffFromStateVector(l_coeff,l);
+      // Initialize Jacobian storage if not done so already
+      if (H == ROL::nullPtr) {
+        H = ROL::makePtr<Tpetra::MultiVector<>>(myUniqueStateMap_, size, true);
+      }
+      if ( pde_vecH13_overlap_ == ROL::nullPtr) {
+        pde_vecH13_overlap_ = ROL::makePtr<Tpetra::MultiVector<>>(myOverlapStateMap_, size, true);
+      }
+      // Assemble
+      std::vector<scalar_view> localVal(size);
+      pde->Hessian_un_zp(localVal,ts,l_coeff,uo_coeff,un_coeff,z_coeff,z_param);
+      assembleParamFieldMatrix( H, localVal, pde_vecH13_overlap_, dofMgr1_ );
+    }
+    catch ( Exception::Zero & ez ) {
+      throw Exception::Zero(">>> (Assembler::assembleDynPDEHessian_un_zp): Hessian is zero.");
+    }
+    catch ( Exception::NotImplemented & eni ) {
+      throw Exception::NotImplemented(">>> (Assembler::assembleDynPDEHessian_un_zp): Hessian not implemented.");
+    }
+  }
+  else {
+    throw Exception::NotImplemented(">>> (Assembler::assembleDynPDEHessian_un_zp): Hessian not implemented.");
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assembleDynPDEHessian_zf_uo(ROL::Ptr<Tpetra::CrsMatrix<>> &H,
+                                            const ROL::Ptr<DynPDE_type> &pde,
+                                            const ROL::TimeStamp<Real> &ts,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &uo,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &un,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                            const ROL::Ptr<const std::vector<Real>> & z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssembleDynPDEHessian_zf_uo);
+  #endif
+  try {
+    // Get u_coeff from u and z_coeff from z
+    scalar_view uo_coeff, un_coeff, z_coeff, l_coeff;
+    getCoeffFromStateVector(uo_coeff,uo);
+    getCoeffFromStateVector(un_coeff,un);
+    getCoeffFromControlVector(z_coeff,z);
+    getCoeffFromStateVector(l_coeff,l);
+    // Initialize Hessian storage if not done so already
+    if ( H == ROL::nullPtr ) {
+      H = ROL::makePtr<Tpetra::CrsMatrix<>>(matH21Graph_);
+    }
+    // Assemble
+    scalar_view localVal;
+    pde->Hessian_zf_uo(localVal,ts,l_coeff,uo_coeff,un_coeff,z_coeff,z_param);
+    assembleFieldMatrix( H, localVal, dofMgr2_, dofMgr1_ );
+  }
+  catch (Exception::Zero &ez) {
+    throw Exception::Zero(">>> (Assembler::assembleDynPDEHessian_zf_uo): Hessian is zero.");
+  }
+  catch (Exception::NotImplemented &eni) {
+    throw Exception::NotImplemented(">>> (Assembler::assembleDynPDEHessian_zf_uo): Hessian not implemented.");
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assembleDynPDEHessian_zf_un(ROL::Ptr<Tpetra::CrsMatrix<>> &H,
+                                            const ROL::Ptr<DynPDE_type> &pde,
+                                            const ROL::TimeStamp<Real> &ts,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &uo,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &un,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                            const ROL::Ptr<const std::vector<Real>> & z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssembleDynPDEHessian_zf_un);
+  #endif
+  try {
+    // Get u_coeff from u and z_coeff from z
+    scalar_view uo_coeff, un_coeff, z_coeff, l_coeff;
+    getCoeffFromStateVector(uo_coeff,uo);
+    getCoeffFromStateVector(un_coeff,un);
+    getCoeffFromControlVector(z_coeff,z);
+    getCoeffFromStateVector(l_coeff,l);
+    // Initialize Hessian storage if not done so already
+    if ( H == ROL::nullPtr ) {
+      H = ROL::makePtr<Tpetra::CrsMatrix<>>(matH21Graph_);
+    }
+    // Assemble
+    scalar_view localVal;
+    pde->Hessian_zf_un(localVal,ts,l_coeff,uo_coeff,un_coeff,z_coeff,z_param);
+    assembleFieldMatrix( H, localVal, dofMgr2_, dofMgr1_ );
+  }
+  catch (Exception::Zero &ez) {
+    throw Exception::Zero(">>> (Assembler::assembleDynPDEHessian_zf_un): Hessian is zero.");
+  }
+  catch (Exception::NotImplemented &eni) {
+    throw Exception::NotImplemented(">>> (Assembler::assembleDynPDEHessian_zf_un): Hessian not implemented.");
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assembleDynPDEHessian_zf_zf(ROL::Ptr<Tpetra::CrsMatrix<>> &H,
+                                            const ROL::Ptr<DynPDE_type> &pde,
+                                            const ROL::TimeStamp<Real> &ts,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &uo,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &un,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                            const ROL::Ptr<const std::vector<Real>> & z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssembleDynPDEHessian_zf_zf);
+  #endif
+  try {
+    // Get u_coeff from u and z_coeff from z
+    scalar_view uo_coeff, un_coeff, z_coeff, l_coeff;
+    getCoeffFromStateVector(uo_coeff,uo);
+    getCoeffFromStateVector(un_coeff,un);
+    getCoeffFromControlVector(z_coeff,z);
+    getCoeffFromStateVector(l_coeff,l);
+    // Initialize Hessian storage if not done so already
+    if ( H == ROL::nullPtr ) {
+      H = ROL::makePtr<Tpetra::CrsMatrix<>>(matH22Graph_);
+    }
+    // Assemble
+    scalar_view localVal;
+    pde->Hessian_zf_zf(localVal,ts,l_coeff,uo_coeff,un_coeff,z_coeff,z_param);
+    assembleFieldMatrix( H, localVal, dofMgr2_, dofMgr2_ );
+  }
+  catch (Exception::Zero &ez) {
+    throw Exception::Zero(">>> (Assembler::assembleDynPDEHessian_zf_zf): Hessian is zero.");
+  }
+  catch (Exception::NotImplemented &eni) {
+    throw Exception::NotImplemented(">>> (Assembler::assembleDynPDEHessian_zf_zf): Hessian not implemented.");
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assembleDynPDEHessian_zf_zp(ROL::Ptr<Tpetra::MultiVector<>> &H,
+                                            const ROL::Ptr<DynPDE_type> &pde,
+                                            const ROL::TimeStamp<Real> &ts,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &uo,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &un,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                            const ROL::Ptr<const std::vector<Real>> & z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssembleDynPDEHessian_zf_zp);
+  #endif
+  if ( z_param != ROL::nullPtr ) {
+    try {
+      int size = static_cast<int>(z_param->size());
+      // Get u_coeff from u, z_coeff from z and l_coeff from l
+      scalar_view uo_coeff, un_coeff, z_coeff, l_coeff;
+      getCoeffFromStateVector(uo_coeff,uo);
+      getCoeffFromStateVector(un_coeff,un);
+      getCoeffFromControlVector(z_coeff,z);
+      getCoeffFromStateVector(l_coeff,l);
+      // Initialize Jacobian storage if not done so already
+      if (H == ROL::nullPtr) {
+        H = ROL::makePtr<Tpetra::MultiVector<>>(myUniqueControlMap_, size, true);
+      }
+      if ( pde_vecH23_overlap_ == ROL::nullPtr) {
+        pde_vecH23_overlap_ = ROL::makePtr<Tpetra::MultiVector<>>(myOverlapControlMap_, size, true);
+      }
+      // Assemble
+      std::vector<scalar_view> localVal(size);
+      pde->Hessian_zf_zp(localVal,ts,l_coeff,uo_coeff,un_coeff,z_coeff,z_param);
+      assembleParamFieldMatrix( H, localVal, pde_vecH23_overlap_, dofMgr2_ );
+    }
+    catch ( Exception::Zero & ez ) {
+      throw Exception::Zero(">>> (Assembler::assembleDynPDEHessian_zf_zp): Hessian is zero.");
+    }
+    catch ( Exception::NotImplemented & eni ) {
+      throw Exception::NotImplemented(">>> (Assembler::assembleDynPDEHessian_zf_zp): Hessian not implemented.");
+    }
+  }
+  else {
+    throw Exception::NotImplemented(">>> (Assembler::assembleDynPDEHessian_zf_zp): Hessian not implemented.");
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assembleDynPDEHessian_zp_uo(ROL::Ptr<Tpetra::MultiVector<>> &H,
+                                            const ROL::Ptr<DynPDE_type> &pde,
+                                            const ROL::TimeStamp<Real> &ts,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &uo,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &un,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                            const ROL::Ptr<const std::vector<Real>> & z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssembleDynPDEHessian_zp_uo);
+  #endif
+  assembleDynPDEHessian_uo_zp(H,pde,ts,l,uo,un,z,z_param);
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assembleDynPDEHessian_zp_un(ROL::Ptr<Tpetra::MultiVector<>> &H,
+                                            const ROL::Ptr<DynPDE_type> &pde,
+                                            const ROL::TimeStamp<Real> &ts,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &uo,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &un,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                            const ROL::Ptr<const std::vector<Real>> & z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssembleDynPDEHessian_zp_un);
+  #endif
+  assembleDynPDEHessian_un_zp(H,pde,ts,l,uo,un,z,z_param);
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assembleDynPDEHessian_zp_zf(ROL::Ptr<Tpetra::MultiVector<>> &H,
+                                            const ROL::Ptr<DynPDE_type> &pde,
+                                            const ROL::TimeStamp<Real> &ts,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &uo,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &un,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                            const ROL::Ptr<const std::vector<Real>> & z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssembleDynPDEHessian_zp_zf);
+  #endif
+  assembleDynPDEHessian_zf_zp(H,pde,ts,l,uo,un,z,z_param);
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assembleDynPDEHessian_zp_zp(ROL::Ptr<std::vector<std::vector<Real>>> &H,
+                                            const ROL::Ptr<DynPDE_type> &pde,
+                                            const ROL::TimeStamp<Real> &ts,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &l,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &uo,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &un,
+                                            const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                            const ROL::Ptr<const std::vector<Real>> & z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssembleDynPDEHessian_zp_zp);
+  #endif
+  if ( z_param != ROL::nullPtr ) {
+    try {
+      int size = static_cast<int>(z_param->size());
+      // Get u_coeff from u, z_coeff from z and l_coeff from l
+      scalar_view uo_coeff, un_coeff, z_coeff, l_coeff;
+      getCoeffFromStateVector(uo_coeff,uo);
+      getCoeffFromStateVector(un_coeff,un);
+      getCoeffFromControlVector(z_coeff,z);
+      getCoeffFromStateVector(l_coeff,l);
+      // Initialize Jacobian storage if not done so already
+      if (H == ROL::nullPtr) {
+        std::vector<Real> col(size,static_cast<Real>(0));
+        H = ROL::makePtr<std::vector<std::vector<Real>>>(size,col);
+      }
+      // Assemble
+      std::vector<std::vector<scalar_view>> localVal(size);
+      for (int i=0; i<size; ++i)
+        localVal[i].resize(size);
+      pde->Hessian_zp_zp(localVal,ts,l_coeff,uo_coeff,un_coeff,z_coeff,z_param);
+      assembleParamMatrix( H, localVal, dofMgr1_ );
+    }
+    catch ( Exception::Zero & ez ) {
+      throw Exception::Zero(">>> (Assembler::assembleDynPDEHessian_zp_zp): Hessian is zero.");
+    }
+    catch ( Exception::NotImplemented & eni ) {
+      throw Exception::NotImplemented(">>> (Assembler::assembleDynPDEHessian_zp_zp): Hessian not implemented.");
+    }
+  }
+  else {
+    throw Exception::NotImplemented(">>> (Assembler::assembleDynPDEHessian_zp_zp): Hessian not implemented.");
+  }
+}
+/***************************************************************************/
+/* End of PDE assembly routines.                                           */
+/***************************************************************************/
+
+/***************************************************************************/
+/* QoI assembly routines                                                   */
+/***************************************************************************/
+template<class Real, class DeviceType>
+Real Assembler<Real,DeviceType>::assembleQoIValue(const ROL::Ptr<QoI_type> &qoi,
+                                       const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                       const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                       const ROL::Ptr<const std::vector<Real>> & z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssembleQOIValue);
+  #endif
+  Real val(0);
+  try {
+    // Get u_coeff from u and z_coeff from z
+    scalar_view u_coeff, z_coeff;
+    getCoeffFromStateVector(u_coeff,u);
+    getCoeffFromControlVector(z_coeff,z);
+    // Assemble
+    scalar_view localVal;
+    val  = qoi->value(localVal,u_coeff,z_coeff,z_param);
+    val += assembleScalar( localVal );
+  }
+  catch ( Exception::Zero & ez ) {
+    val = static_cast<Real>(0);
+  }
+  catch ( Exception::NotImplemented & eni ) {
+    throw Exception::NotImplemented(">>> (Assembler::assembleQoIValue): Value not implemented.");
+  }
+  return val;
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assembleQoIGradient1(ROL::Ptr<Tpetra::MultiVector<>> &g1,
+                                     const ROL::Ptr<QoI_type> &qoi,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                     const ROL::Ptr<const std::vector<Real>> & z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssembleQOIGradient1);
+  #endif
+  try {
+    // Initialize state QoI gradient vectors
+    if ( g1 == ROL::nullPtr ) {
+      g1 = ROL::makePtr<Tpetra::MultiVector<>>(myUniqueStateMap_, 1, true);
+    }
+    if ( qoi_vecG1_overlap_ == ROL::nullPtr ) {
+      qoi_vecG1_overlap_ = ROL::makePtr<Tpetra::MultiVector<>>(myOverlapStateMap_, 1, true);
+    }
+    // Get u_coeff from u and z_coeff from z
+    scalar_view u_coeff, z_coeff;
+    getCoeffFromStateVector(u_coeff,u);
+    getCoeffFromControlVector(z_coeff,z);
+    // Assemble
+    scalar_view localVal;
+    qoi->gradient_1(localVal,u_coeff,z_coeff,z_param);
+    assembleFieldVector( g1, localVal, qoi_vecG1_overlap_, dofMgr1_ );
+  }
+  catch ( Exception::Zero & ez ) {
+    throw Exception::Zero(">>> (Assembler::assembleQoIGradient1): Gradient is zero.");
+  }
+  catch ( Exception::NotImplemented & eni ) {
+    throw Exception::NotImplemented(">>> (Assembler::assembleQoIGradient1): Gradient not implemented.");
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assembleQoIGradient2(ROL::Ptr<Tpetra::MultiVector<>> &g2,
+                                     const ROL::Ptr<QoI_type> &qoi,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                     const ROL::Ptr<const std::vector<Real>> & z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssembleQOIGradient2);
+  #endif
+  try {
+    // Initialize control gradient vectors
+    if ( g2 == ROL::nullPtr ) {
+      g2 = ROL::makePtr<Tpetra::MultiVector<>>(myUniqueControlMap_, 1, true);
+    }
+    if ( qoi_vecG2_overlap_ == ROL::nullPtr ) {
+      qoi_vecG2_overlap_ = ROL::makePtr<Tpetra::MultiVector<>>(myOverlapControlMap_, 1, true);
+    }
+    // Get u_coeff from u and z_coeff from z
+    scalar_view u_coeff, z_coeff;
+    getCoeffFromStateVector(u_coeff,u);
+    getCoeffFromControlVector(z_coeff,z);
+    // Assemble
+    scalar_view localVal;
+    qoi->gradient_2(localVal,u_coeff,z_coeff,z_param);
+    assembleFieldVector( g2, localVal, qoi_vecG2_overlap_, dofMgr2_ );
+  }
+  catch ( Exception::Zero & ez ) {
+    throw Exception::Zero(">>> (Assembler::assembleQoIGradient2): Gradient is zero.");
+  }
+  catch ( Exception::NotImplemented & eni ) {
+    throw Exception::NotImplemented(">>> (Assembler::assembleQoIGradient2): Gradient not implemented.");
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assembleQoIGradient3(ROL::Ptr<std::vector<Real>> &g3,
+                                     const ROL::Ptr<QoI_type> &qoi,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                     const ROL::Ptr<const std::vector<Real>> & z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssembleQOIGradient3);
+  #endif
+  if ( z_param != ROL::nullPtr ) {
+    const int size = z_param->size();
+    if ( g3 == ROL::nullPtr ) {
+      g3 = ROL::makePtr<std::vector<Real>>(size,0);
+    }
+    try {
+      // Get u_coeff from u and z_coeff from z
+      scalar_view u_coeff, z_coeff;
+      getCoeffFromStateVector(u_coeff,u);
+      getCoeffFromControlVector(z_coeff,z);
+      // Assemble
+      std::vector<scalar_view> localVal(size);
+      std::vector<Real> g = qoi->gradient_3(localVal,u_coeff,z_coeff,z_param);
+      assembleParamVector( g3, localVal );
+      for (int i = 0; i < size; ++i) {
+        (*g3)[i] += g[i];
+      }
+    }
+    catch ( Exception::Zero & ez ) {
+      g3->assign(size,0);
+    }
+    catch ( Exception::NotImplemented & eni ) {
+      throw Exception::NotImplemented(">>> (Assembler::assembleQoIGradient3): Gradient not implemented.");
+    }
+  }
+  else {
+    throw Exception::NotImplemented(">>> (Assembler::assembleQoIGradient3): Gradient not implemented.");
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assembleQoIHessVec11(ROL::Ptr<Tpetra::MultiVector<>> &H11,
+                                     const ROL::Ptr<QoI_type> &qoi,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &v,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                     const ROL::Ptr<const std::vector<Real>> & z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssembleQOIHessVec11);
+  #endif
+	  try {
+    // Get u_coeff from u and z_coeff from z
+    scalar_view v_coeff, u_coeff, z_coeff;
+    getCoeffFromStateVector(v_coeff,v);
+    getCoeffFromStateVector(u_coeff,u);
+    getCoeffFromControlVector(z_coeff,z);
+    // Initialize state-state HessVec vectors
+    if ( H11 == ROL::nullPtr ) {
+      H11 = ROL::makePtr<Tpetra::MultiVector<>>(myUniqueStateMap_, 1, true);
+    }
+    if ( qoi_vecH11_overlap_ == ROL::nullPtr ) {
+      qoi_vecH11_overlap_  = ROL::makePtr<Tpetra::MultiVector<>>(myOverlapStateMap_, 1, true);
+    }
+    // Assemble
+    scalar_view localVal;
+    qoi->HessVec_11(localVal,v_coeff,u_coeff,z_coeff,z_param);
+    assembleFieldVector( H11, localVal, qoi_vecH11_overlap_, dofMgr1_ );
+  }
+  catch (Exception::Zero &ez) {
+    throw Exception::Zero(">>> (Assembler::assembleQoIHessVec11): Hessian is zero.");
+  }
+  catch (Exception::NotImplemented &eni) {
+    throw Exception::NotImplemented(">>> (Assembler::assembleQoIHessVec11): Hessian not implemented.");
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assembleQoIHessVec12(ROL::Ptr<Tpetra::MultiVector<>> &H12,
+                                     const ROL::Ptr<QoI_type> &qoi,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &v,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                     const ROL::Ptr<const std::vector<Real>> & z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssembleQOIHessVec12);
+  #endif
+  try {
+    // Get u_coeff from u and z_coeff from z
+    scalar_view v_coeff, u_coeff, z_coeff;
+    getCoeffFromControlVector(v_coeff,v);
+    getCoeffFromStateVector(u_coeff,u);
+    getCoeffFromControlVector(z_coeff,z);
+    // Initialize state-control HessVec vectors
+    if ( H12 == ROL::nullPtr ) {
+      H12 = ROL::makePtr<Tpetra::MultiVector<>>(myUniqueStateMap_, 1, true);
+    }
+    if ( qoi_vecH12_overlap_ == ROL::nullPtr ) {
+      qoi_vecH12_overlap_ = ROL::makePtr<Tpetra::MultiVector<>>(myOverlapStateMap_, 1, true);
+    }
+    // Assemble
+    scalar_view localVal;
+    qoi->HessVec_12(localVal,v_coeff,u_coeff,z_coeff,z_param);
+    assembleFieldVector( H12, localVal, qoi_vecH12_overlap_, dofMgr1_ );
+  }
+  catch (Exception::Zero &ez) {
+    throw Exception::Zero(">>> (Assembler::assembleQoIHessVec12): Hessian is zero.");
+  }
+  catch (Exception::NotImplemented &eni) {
+    throw Exception::NotImplemented(">>> (Assembler::assembleQoIHessVec12): Hessian not implemented.");
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assembleQoIHessVec13(ROL::Ptr<Tpetra::MultiVector<>> &H13,
+                                     const ROL::Ptr<QoI_type> &qoi,
+                                     const ROL::Ptr<const std::vector<Real>> &v,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                     const ROL::Ptr<const std::vector<Real>> &z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssembleQOIHessVec13);
+  #endif
+  if (z_param != ROL::nullPtr) {
+    try {
+      // Get u_coeff from u and z_coeff from z
+      scalar_view u_coeff, z_coeff;
+      getCoeffFromStateVector(u_coeff,u);
+      getCoeffFromControlVector(z_coeff,z);
+      // Initialize state-control HessVec vectors
+      if ( H13 == ROL::nullPtr ) {
+        H13 = ROL::makePtr<Tpetra::MultiVector<>>(myUniqueStateMap_, 1, true);
+      }
+      if ( qoi_vecH13_overlap_ == ROL::nullPtr ) {
+        qoi_vecH13_overlap_ = ROL::makePtr<Tpetra::MultiVector<>>(myOverlapStateMap_, 1, true);
+      }
+      // Assemble
+      scalar_view localVal;
+      qoi->HessVec_13(localVal,v,u_coeff,z_coeff,z_param);
+      assembleFieldVector( H13, localVal, qoi_vecH13_overlap_, dofMgr1_ );
+    }
+    catch (Exception::Zero &ez) {
+      throw Exception::Zero(">>> (Assembler::assembleQoIHessVec13): Hessian is zero.");
+    }
+    catch (Exception::NotImplemented &eni) {
+      throw Exception::NotImplemented(">>> (Assembler::assembleQoIHessVec13): Hessian not implemented.");
+    }
+  }
+  else {
+    throw Exception::NotImplemented(">>> (Assembler::assembleQoIHessVec13): Hessian not implemented.");
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assembleQoIHessVec21(ROL::Ptr<Tpetra::MultiVector<>> &H21,
+                                     const ROL::Ptr<QoI_type> &qoi,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &v,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                     const ROL::Ptr<const std::vector<Real>> & z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssembleQOIHessVec21);
+  #endif
+  try {
+    // Get u_coeff from u and z_coeff from z
+    scalar_view v_coeff, u_coeff, z_coeff;
+    getCoeffFromStateVector(v_coeff,v);
+    getCoeffFromStateVector(u_coeff,u);
+    getCoeffFromControlVector(z_coeff,z);
+    // Initialize control-state HessVec vectors
+    if ( H21 == ROL::nullPtr ) {
+      H21 = ROL::makePtr<Tpetra::MultiVector<>>(myUniqueControlMap_, 1, true);
+    }
+    if ( qoi_vecH21_overlap_ == ROL::nullPtr ) {
+      qoi_vecH21_overlap_ = ROL::makePtr<Tpetra::MultiVector<>>(myOverlapControlMap_, 1, true);
+    }
+    // Assemble
+    scalar_view localVal;
+    qoi->HessVec_21(localVal,v_coeff,u_coeff,z_coeff,z_param);
+    assembleFieldVector( H21, localVal, qoi_vecH21_overlap_, dofMgr2_ );
+  }
+  catch (Exception::Zero &ez) {
+    throw Exception::Zero(">>> (Assembler::assembleQoIHessVec21): Hessian is zero.");
+  }
+  catch (Exception::NotImplemented &eni) {
+    throw Exception::NotImplemented(">>> (Assembler::assembleQoIHessVec21): Hessian not implemented.");
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assembleQoIHessVec22(ROL::Ptr<Tpetra::MultiVector<>> &H22,
+                                     const ROL::Ptr<QoI_type> &qoi,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &v,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                     const ROL::Ptr<const std::vector<Real>> & z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssembleQOIHessVec22);
+  #endif
+  try {
+    // Get u_coeff from u and z_coeff from z
+    scalar_view v_coeff, u_coeff, z_coeff;
+    getCoeffFromControlVector(v_coeff,v);
+    getCoeffFromStateVector(u_coeff,u);
+    getCoeffFromControlVector(z_coeff,z);
+    // Initialize control-control HessVec vectors
+    if ( H22 == ROL::nullPtr ) {
+      H22 = ROL::makePtr<Tpetra::MultiVector<>>(myUniqueControlMap_, 1, true);
+    }
+    if ( qoi_vecH22_overlap_ == ROL::nullPtr ) {
+      qoi_vecH22_overlap_ = ROL::makePtr<Tpetra::MultiVector<>>(myOverlapControlMap_, 1, true);
+    }
+    // Assemble
+    scalar_view localVal;
+    qoi->HessVec_22(localVal,v_coeff,u_coeff,z_coeff,z_param);
+    assembleFieldVector( H22, localVal, qoi_vecH22_overlap_, dofMgr2_ );
+  }
+  catch (Exception::Zero &ez) {
+    throw Exception::Zero(">>> (Assembler::assembleQoIHessVec22): Hessian is zero.");
+  }
+  catch (Exception::NotImplemented &eni) {
+    throw Exception::NotImplemented(">>> (Assembler::assembleQoIHessVec22): Hessian not implemented.");
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assembleQoIHessVec23(ROL::Ptr<Tpetra::MultiVector<>> &H23,
+                                     const ROL::Ptr<QoI_type> &qoi,
+                                     const ROL::Ptr<const std::vector<Real>> &v,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                     const ROL::Ptr<const std::vector<Real>> & z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssembleQOIHessVec23);
+  #endif
+  if (z_param != ROL::nullPtr) {
+    try {
+      // Get u_coeff from u and z_coeff from z
+      scalar_view u_coeff, z_coeff;
+      getCoeffFromStateVector(u_coeff,u);
+      getCoeffFromControlVector(z_coeff,z);
+      // Initialize control-control HessVec vectors
+      if ( H23 == ROL::nullPtr ) {
+        H23 = ROL::makePtr<Tpetra::MultiVector<>>(myUniqueControlMap_, 1, true);
+      }
+      if ( qoi_vecH23_overlap_ == ROL::nullPtr ) {
+        qoi_vecH23_overlap_ = ROL::makePtr<Tpetra::MultiVector<>>(myOverlapControlMap_, 1, true);
+      }
+      // Assemble
+      scalar_view localVal;
+      qoi->HessVec_23(localVal,v,u_coeff,z_coeff,z_param);
+      assembleFieldVector( H23, localVal, qoi_vecH23_overlap_, dofMgr2_ );
+    }
+    catch (Exception::Zero &ez) {
+      throw Exception::Zero(">>> (Assembler::assembleQoIHessVec23): Hessian is zero.");
+    }
+    catch (Exception::NotImplemented &eni) {
+      throw Exception::NotImplemented(">>> (Assembler::assembleQoIHessVec23): Hessian not implemented.");
+    }
+  }
+  else {
+    throw Exception::NotImplemented(">>> (Assembler::assembleQoIHessVec23): Hessian not implemented.");
+  }
+
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assembleQoIHessVec31(ROL::Ptr<std::vector<Real>> &H31,
+                                     const ROL::Ptr<QoI_type> &qoi,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &v,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                     const ROL::Ptr<const std::vector<Real>> & z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssembleQOIHessVec31);
+  #endif
+  if ( z_param != ROL::nullPtr ) {
+    const int size = z_param->size();
+    if ( H31 == ROL::nullPtr ) {
+      H31 = ROL::makePtr<std::vector<Real>>(size,0);
+    }
+    try {
+      H31->assign(size,0);
+      // Get u_coeff from u and z_coeff from z
+      scalar_view v_coeff, u_coeff, z_coeff;
+      getCoeffFromStateVector(v_coeff,v);
+      getCoeffFromStateVector(u_coeff,u);
+      getCoeffFromControlVector(z_coeff,z);
+      // Assemble
+      std::vector<scalar_view> localVal(size);
+      std::vector<Real> H = qoi->HessVec_31(localVal,v_coeff,u_coeff,z_coeff,z_param);
+      assembleParamVector( H31, localVal );
+      for (int i = 0; i < size; ++i) {
+        (*H31)[i] += H[i];
+      }
+    }
+    catch ( Exception::Zero & ez ) {
+      H31->assign(size,0);
+    }
+    catch ( Exception::NotImplemented & eni ) {
+      throw Exception::NotImplemented(">>> (Assembler::assembleQoIHessVec31): HessVec not implemented.");
+    }
+  }
+  else {
+    throw Exception::NotImplemented(">>> (Assembler::assembleQoIHessVec31): HessVec not implemented.");
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assembleQoIHessVec32(ROL::Ptr<std::vector<Real>> &H32,
+                                     const ROL::Ptr<QoI_type> &qoi,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &v,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                     const ROL::Ptr<const std::vector<Real>> & z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssembleQOIHessVec32);
+  #endif
+  if ( z_param != ROL::nullPtr ) {
+    const int size = z_param->size();
+    if ( H32 == ROL::nullPtr ) {
+      H32 = ROL::makePtr<std::vector<Real>>(size,0);
+    }
+    try {
+      H32->assign(size,0);
+      // Get u_coeff from u and z_coeff from z
+      scalar_view v_coeff, u_coeff, z_coeff;
+      getCoeffFromControlVector(v_coeff,v);
+      getCoeffFromStateVector(u_coeff,u);
+      getCoeffFromControlVector(z_coeff,z);
+      // Assemble
+      std::vector<scalar_view> localVal(size);
+      std::vector<Real> H = qoi->HessVec_32(localVal,v_coeff,u_coeff,z_coeff,z_param);
+      assembleParamVector( H32, localVal );
+      for (int i = 0; i < size; ++i) {
+        (*H32)[i] += H[i];
+      }
+    }
+    catch ( Exception::Zero & ez ) {
+      H32->assign(size,0);
+    }
+    catch ( Exception::NotImplemented & eni ) {
+      throw Exception::NotImplemented(">>> (Assembler::assembleQoIHessVec32): HessVec not implemented.");
+    }
+  }
+  else {
+    throw Exception::NotImplemented(">>> (Assembler::assembleQoIHessVec32): HessVec not implemented.");
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assembleQoIHessVec33(ROL::Ptr<std::vector<Real>> &H33,
+                                     const ROL::Ptr<QoI_type> &qoi,
+                                     const ROL::Ptr<const std::vector<Real>> &v,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                     const ROL::Ptr<const std::vector<Real>> &z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssembleQOIHessVec33);
+  #endif
+  if ( z_param != ROL::nullPtr ) {
+    const int size = z_param->size();
+    if ( H33 == ROL::nullPtr ) {
+      H33 = ROL::makePtr<std::vector<Real>>(size,0);
+    }
+    try {
+      H33->assign(size,0);
+      // Get u_coeff from u and z_coeff from z
+      scalar_view u_coeff, z_coeff;
+      getCoeffFromStateVector(u_coeff,u);
+      getCoeffFromControlVector(z_coeff,z);
+      // Assemble
+      std::vector<scalar_view> localVal(size);
+      std::vector<Real> H = qoi->HessVec_33(localVal,v,u_coeff,z_coeff,z_param);
+      assembleParamVector( H33, localVal );
+      for (int i = 0; i < size; ++i) {
+        (*H33)[i] += H[i];
+      }
+    }
+    catch ( Exception::Zero & ez ) {
+      H33->assign(size,0);
+    }
+    catch ( Exception::NotImplemented & eni ) {
+      throw Exception::NotImplemented(">>> (Assembler::assembleQoIHessVec33): HessVec not implemented.");
+    }
+  }
+  else {
+    throw Exception::NotImplemented(">>> (Assembler::assembleQoIHessVec33): HessVec not implemented.");
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assembleQoIHessian11(ROL::Ptr<Tpetra::CrsMatrix<>> &H11,
+                                     const ROL::Ptr<QoI_type> &qoi,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                     const ROL::Ptr<const std::vector<Real>> & z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssembleQOIHessian11);
+  #endif
+  try {
+    // Get u_coeff from u and z_coeff from z
+    scalar_view u_coeff, z_coeff;
+    getCoeffFromStateVector(u_coeff,u);
+    getCoeffFromControlVector(z_coeff,z);
+    // Initialize Jacobian matrices
+    if ( H11 == ROL::nullPtr ) {
+      H11 = ROL::makePtr<Tpetra::CrsMatrix<>>(matH11Graph_);
+    }
+    // Assemble
+    scalar_view localVal;
+    qoi->Hessian_11(localVal,u_coeff,z_coeff,z_param);
+    assembleFieldMatrix( H11, localVal, dofMgr1_, dofMgr1_ );
+  }
+  catch ( Exception::Zero & ez ) {
+    throw Exception::Zero(">>> (Assembler::assembleQoIHessian11): Hessian is zero.");
+  }
+  catch ( Exception::NotImplemented & eni ) {
+    throw Exception::NotImplemented(">>> (Assembler::assembleQoIHessian11): Hessian not implemented.");
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assembleQoIHessian12(ROL::Ptr<Tpetra::CrsMatrix<>> &H12,
+                                     const ROL::Ptr<QoI_type> &qoi,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                     const ROL::Ptr<const std::vector<Real>> & z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssembleQOIHessian12);
+  #endif
+  try {
+    // Get u_coeff from u and z_coeff from z
+    scalar_view u_coeff, z_coeff;
+    getCoeffFromStateVector(u_coeff,u);
+    getCoeffFromControlVector(z_coeff,z);
+    // Initialize Jacobian matrices
+    if ( H12 == ROL::nullPtr ) {
+      H12 = ROL::makePtr<Tpetra::CrsMatrix<>>(matH12Graph_);
+    }
+    // Assemble
+    scalar_view localVal;
+    qoi->Hessian_12(localVal,u_coeff,z_coeff,z_param);
+    assembleFieldMatrix( H12, localVal, dofMgr1_, dofMgr2_ );
+  }
+  catch ( Exception::Zero & ez ) {
+    throw Exception::Zero(">>> (Assembler::assembleQoIHessian12): Hessian is zero.");
+  }
+  catch ( Exception::NotImplemented & eni ) {
+    throw Exception::NotImplemented(">>> (Assembler::assembleQoIHessian12): Hessian not implemented.");
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assembleQoIHessian21(ROL::Ptr<Tpetra::CrsMatrix<>> &H21,
+                                     const ROL::Ptr<QoI_type> &qoi,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                     const ROL::Ptr<const std::vector<Real>> & z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssembleQOIHessian21);
+  #endif
+  try {
+    // Get u_coeff from u and z_coeff from z
+    scalar_view u_coeff, z_coeff;
+    getCoeffFromStateVector(u_coeff,u);
+    getCoeffFromControlVector(z_coeff,z);
+    // Initialize Jacobian matrices
+    if ( H21 == ROL::nullPtr ) {
+      H21 = ROL::makePtr<Tpetra::CrsMatrix<>>(matH21Graph_);
+    }
+    // Assemble
+    scalar_view localVal;
+    qoi->Hessian_21(localVal,u_coeff,z_coeff,z_param);
+    assembleFieldMatrix( H21, localVal, dofMgr2_, dofMgr1_ );
+  }
+  catch ( Exception::Zero & ez ) {
+    throw Exception::Zero(">>> (Assembler::assembleQoIHessian21): Hessian is zero.");
+  }
+  catch ( Exception::NotImplemented & eni ) {
+    throw Exception::NotImplemented(">>> (Assembler::assembleQoIHessian21): Hessian not implemented.");
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assembleQoIHessian22(ROL::Ptr<Tpetra::CrsMatrix<>> &H22,
+                                     const ROL::Ptr<QoI_type> &qoi,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                     const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                     const ROL::Ptr<const std::vector<Real>> & z_param) {
+  #ifdef ROL_TIMERS
+    Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::AssembleQOIHessian22);
+  #endif
+  try {
+    // Get u_coeff from u and z_coeff from z
+    scalar_view u_coeff, z_coeff;
+    getCoeffFromStateVector(u_coeff,u);
+    getCoeffFromControlVector(z_coeff,z);
+    // Initialize Jacobian matrices
+    if ( H22 == ROL::nullPtr ) {
+      H22 = ROL::makePtr<Tpetra::CrsMatrix<>>(matH11Graph_);
+    }
+    // Assemble
+    scalar_view localVal;
+    qoi->Hessian_22(localVal,u_coeff,z_coeff,z_param);
+    assembleFieldMatrix( H22, localVal, dofMgr2_, dofMgr2_ );
+  }
+  catch ( Exception::Zero & ez ) {
+    throw Exception::Zero(">>> (Assembler::assembleQoIHessian22): Hessian is zero.");
+  }
+  catch ( Exception::NotImplemented & eni ) {
+    throw Exception::NotImplemented(">>> (Assembler::assembleQoIHessian22): Hessian not implemented.");
+  }
+}
+/***************************************************************************/
+/* End QoI assembly routines                                               */
+/***************************************************************************/
+
+/***************************************************************************/
+/* Assemble and apply Riesz operator corresponding to simulation variables */
+/***************************************************************************/
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assemblePDERieszMap1(ROL::Ptr<Tpetra::CrsMatrix<>> &R1,
+                                     const ROL::Ptr<PDE_type> &pde) {
+  try {
+    // Initialize Riesz matrix if not done so already
+    if ( R1 == ROL::nullPtr ) {
+    R1 = ROL::makePtr<Tpetra::CrsMatrix<>>(matR1Graph_);
+    }
+    // Assemble
+    scalar_view localVal;
+    pde->RieszMap_1(localVal);
+    assembleFieldMatrix( R1, localVal, dofMgr1_, dofMgr1_ );
+  }
+  catch ( Exception::NotImplemented & eni ) {
+    //throw Exception::NotImplemented(">>> (Assembler::assemblePDERieszMap1): Riesz map not implemented!");
+  }
+  catch ( Exception::Zero & ez ) {
+    //throw Exception::Zero(">>> (Assembler::assemblePDERieszMap1): Riesz map is zero!");
+  }
+}
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assembleDynPDERieszMap1(ROL::Ptr<Tpetra::CrsMatrix<>> &R1,
+                                        const ROL::Ptr<DynPDE_type> &pde) {
+  try {
+    // Initialize Riesz matrix if not done so already
+    if ( R1 == ROL::nullPtr ) {
+    R1 = ROL::makePtr<Tpetra::CrsMatrix<>>(matR1Graph_);
+    }
+    // Assemble
+    scalar_view localVal;
+    pde->RieszMap_1(localVal);
+    assembleFieldMatrix( R1, localVal, dofMgr1_, dofMgr1_ );
+  }
+  catch ( Exception::NotImplemented & eni ) {
+    //throw Exception::NotImplemented(">>> (Assembler::assemblePDERieszMap1): Riesz map not implemented!");
+  }
+  catch ( Exception::Zero & ez ) {
+    //throw Exception::Zero(">>> (Assembler::assemblePDERieszMap1): Riesz map is zero!");
+  }
+}
+/***************************************************************************/
+/* End of functions for Riesz operator of simulation variables.            */
+/***************************************************************************/
+
+/***************************************************************************/
+/* Assemble and apply Riesz operator corresponding to optimization         */
+/* variables                                                               */
+/***************************************************************************/
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assemblePDERieszMap2(ROL::Ptr<Tpetra::CrsMatrix<>> &R2,
+                                     const ROL::Ptr<PDE_type> &pde) {
+  try {
+    // Initialize Riesz matrix if not done so already
+    if ( R2 == ROL::nullPtr ) {
+      R2 = ROL::makePtr<Tpetra::CrsMatrix<>>(matR2Graph_);
+    }
+    // Assemble
+    scalar_view localVal;
+    pde->RieszMap_2(localVal);
+    assembleFieldMatrix( R2, localVal, dofMgr2_, dofMgr2_ );
+  }
+  catch ( Exception::NotImplemented & eni ) {
+    //throw Exception::NotImplemented(">>> (Assembler::assemblePDERieszMap2): Riesz map not implemented!");
+  }
+  catch ( Exception::Zero & ez ) {
+    //throw Exception::Zero(">>> (Assembler::assemblePDERieszMap2): Riesz map is zero!");
+  }
+}
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assembleDynPDERieszMap2(ROL::Ptr<Tpetra::CrsMatrix<>> &R2,
+                                        const ROL::Ptr<DynPDE_type> &pde) {
+  try {
+    // Initialize Riesz matrix if not done so already
+    if ( R2 == ROL::nullPtr ) {
+      R2 = ROL::makePtr<Tpetra::CrsMatrix<>>(matR2Graph_);
+    }
+    // Assemble
+    scalar_view localVal;
+    pde->RieszMap_2(localVal);
+    assembleFieldMatrix( R2, localVal, dofMgr2_, dofMgr2_ );
+  }
+  catch ( Exception::NotImplemented & eni ) {
+    //throw Exception::NotImplemented(">>> (Assembler::assemblePDERieszMap2): Riesz map not implemented!");
+  }
+  catch ( Exception::Zero & ez ) {
+    //throw Exception::Zero(">>> (Assembler::assemblePDERieszMap2): Riesz map is zero!");
+  }
+}
+/***************************************************************************/
+/* End of functions for Riesz operator of optimization variables.          */
+/***************************************************************************/
+
+/***************************************************************************/
+/* Compute error routines.                                                 */
+/***************************************************************************/
+template<class Real, class DeviceType>
+Real Assembler<Real,DeviceType>::computeStateError(const ROL::Ptr<const Tpetra::MultiVector<>> &soln,
+                                       const ROL::Ptr<Solution<Real>> &trueSoln,
+                                       const int cubDeg,
+                                       const ROL::Ptr<FieldHelper_type> &fieldHelper) const {
+  Real totalError(0);
+  // populate inCoeffs
+  scalar_view inCoeffs0;
+  getCoeffFromStateVector(inCoeffs0, soln);
+  // split fields
+  int numFields = 1;
+  std::vector<scalar_view> inCoeffs;
+  if (fieldHelper != ROL::nullPtr) {
+    numFields = fieldHelper->numFields();
+    fieldHelper->splitFieldCoeff(inCoeffs,inCoeffs0);
+  }
+  else {
+    inCoeffs.push_back(inCoeffs0);
+  }
+  // compute error
+  for (int fn = 0; fn < numFields; ++fn) {
+    // create fe object for error computation
+    Intrepid2::DefaultCubatureFactory cubFactory;
+    shards::CellTopology cellType = basisPtrs1_[fn]->getBaseCellTopology();
+    auto cellCub = cubFactory.create<DeviceType,Real,Real>(cellType, cubDeg);
+    ROL::Ptr<FE_type> fe
+      = ROL::makePtr<FE_type>(volCellNodes_,basisPtrs1_[fn],cellCub);
+
+    // get dimensions
+    int c = fe->gradN().extent_int(0);
+    int p = fe->gradN().extent_int(2);
+    int d = fe->gradN().extent_int(3);
+
+    // evaluate input coefficients on fe basis
+    scalar_view funcVals = scalar_view("funcVals", c, p);
+    fe->evaluateValue(funcVals, inCoeffs[fn]);
+
+    // subtract off true solution
+    std::vector<Real> x(d);
+    for (int i=0; i<c; ++i) {
+      for (int j=0; j<p; ++j) {
+        for (int k=0; k<d; ++k) {
+          x[k] = fe->cubPts()(i, j, k);
+        }
+        funcVals(i, j) -= trueSoln->evaluate(x,fn);
+      }
+    }
+
+    // compute norm squared of local error
+    scalar_view normSquaredError = scalar_view("normSquaredError", c);
+    fe->computeIntegral(normSquaredError, funcVals, funcVals, false);
+
+    Real localErrorSum(0);
+    Real globalErrorSum(0);
+    for (int i=0; i<numCells_; ++i) {
+      localErrorSum += normSquaredError(i);
+    }
+    Teuchos::reduceAll<int, Real>(*comm_, Teuchos::REDUCE_SUM, 1, &localErrorSum, &globalErrorSum);
+    totalError += globalErrorSum;
+  }
+
+  return std::sqrt(totalError);
+}
+
+template<class Real, class DeviceType>
+Real Assembler<Real,DeviceType>::computeControlError(const ROL::Ptr<const Tpetra::MultiVector<>> &soln,
+                                          const ROL::Ptr<Solution<Real>> &trueSoln,
+                                          const int cubDeg,
+                                          const ROL::Ptr<FieldHelper_type> &fieldHelper) const {
+  Real totalError(0);
+  // populate inCoeffs
+  scalar_view inCoeffs0;
+  getCoeffFromControlVector(inCoeffs0, soln);
+  // split fields
+  int numFields = 1;
+  std::vector<scalar_view> inCoeffs;
+  if (fieldHelper != ROL::nullPtr) {
+    numFields = fieldHelper->numFields();
+    fieldHelper->splitFieldCoeff(inCoeffs,inCoeffs0);
+  }
+  else {
+    inCoeffs.push_back(inCoeffs0);
+  }
+  // compute error
+  for (int fn = 0; fn < numFields; ++fn) {
+    // create fe object for error computation
+    Intrepid2::DefaultCubatureFactory cubFactory;
+    shards::CellTopology cellType = basisPtrs2_[fn]->getBaseCellTopology();
+    auto cellCub = cubFactory.create<DeviceType,Real,Real>(cellType, cubDeg);
+    ROL::Ptr<FE_type> fe = ROL::makePtr<FE_type>(volCellNodes_,basisPtrs2_[fn],cellCub);
+
+    // get dimensions
+    int c = fe->gradN().extent_int(0);
+    int p = fe->gradN().extent_int(2);
+    int d = fe->gradN().extent_int(3);
+
+    // evaluate input coefficients on fe basis
+    scalar_view funcVals = scalar_view ("funcVals", c, p);
+    fe->evaluateValue(funcVals, inCoeffs[fn]);
+
+    // subtract off true solution
+    std::vector<Real> x(d);
+    for (int i=0; i<c; ++i) {
+      for (int j=0; j<p; ++j) {
+        for (int k=0; k<d; ++k) {
+          x[k] = fe->cubPts()(i, j, k);
+        }
+        funcVals(i, j) -= trueSoln->evaluate(x,fn);
+      }
+    }
+
+    // compute norm squared of local error
+    scalar_view normSquaredError = scalar_view("normSquaredError", c);
+    fe->computeIntegral(normSquaredError, funcVals, funcVals, false);
+
+    Real localErrorSum(0);
+    Real globalErrorSum(0);
+    for (int i=0; i<numCells_; ++i) {
+      localErrorSum += normSquaredError(i);
+    }
+    Teuchos::reduceAll<int, Real>(*comm_, Teuchos::REDUCE_SUM, 1, &localErrorSum, &globalErrorSum);
+    totalError += globalErrorSum;
+  }
+
+  return std::sqrt(totalError);
+}
+/***************************************************************************/
+/* End of compute solution routines.                                       */
+/***************************************************************************/
+
+/***************************************************************************/
+/* Output routines.                                                        */
+/***************************************************************************/
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::printMeshData(std::ostream &outStream) const {
+  scalar_view nodes = meshMgr_->getNodes();
+  int_view  cellToNodeMap = meshMgr_->getCellToNodeMap();
+  if ( verbose_ && myRank_ == 0) {
+    outStream << std::endl;
+    outStream << "Number of nodes = " << meshMgr_->getNumNodes() << std::endl;
+    outStream << "Number of cells = " << meshMgr_->getNumCells() << std::endl;
+    outStream << "Number of edges = " << meshMgr_->getNumEdges() << std::endl;
+  }
+  // Print mesh to file.
+  if ((myRank_ == 0)) {
+    std::ofstream meshfile;
+    meshfile.open("cell_to_node_quad.txt");
+    for (int i=0; i<cellToNodeMap.extent_int(0); ++i) {
+      for (int j=0; j<cellToNodeMap.extent_int(1); ++j) {
+        meshfile << cellToNodeMap(i,j) << "  ";
+      }
+      meshfile << std::endl;
+    }
+    meshfile.close();
+    
+//    meshfile.open("cell_to_node_tri.txt");
+//    for (int i=0; i<cellToNodeMap.extent_int(0); ++i) {
+//      for (int j=0; j<3; ++j) {
+//        meshfile << cellToNodeMap(i,j) << "  ";
+//      }
+//      meshfile << std::endl;
+//      for (int j=2; j<5; ++j) {
+//        meshfile << cellToNodeMap(i,j%4) << "  ";
+//      }
+//      meshfile << std::endl;
+//    }
+//    meshfile.close();
+   
+    meshfile.open("nodes.txt");
+    meshfile.precision(16);
+    for (int i=0; i<nodes.extent_int(0); ++i) {
+      for (int j=0; j<nodes.extent_int(1); ++j) {
+        meshfile << std::scientific << nodes(i,j) << "  ";
+      }
+      meshfile << std::endl;
+    }
+    meshfile.close();
+  }
+} // prinf function end
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::outputTpetraVector(const ROL::Ptr<const Tpetra::MultiVector<>> &vec,
+                                         const std::string &filename) const {
+  Tpetra::MatrixMarket::Writer< Tpetra::CrsMatrix<>> vecWriter;
+  vecWriter.writeDenseFile(filename, vec);
+  std::string mapfile = "map_" + filename;
+  vecWriter.writeMapFile(mapfile, *(vec->getMap()));
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::inputTpetraVector(ROL::Ptr<Tpetra::MultiVector<>> &vec,
+                                        const std::string &filename) const {
+  Tpetra::MatrixMarket::Reader< Tpetra::CrsMatrix<>> vecReader;
+  auto map = vec->getMap();
+  auto comm = map->getComm();
+  auto vec_rcp = vecReader.readDenseFile(filename, comm, map);
+  vec->scale(1.0, *vec_rcp);
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::printDataPDE(const ROL::Ptr<PDE_type> &pde,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                   const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                   const ROL::Ptr<const std::vector<Real>> &z_param) const {
+  std::stringstream tag;
+  if (numProcs_==1) tag << "";
+  else              tag << "_" << myRank_ << "_" << numProcs_;
+
+  scalar_view u_coeff;
+  getCoeffFromStateVector(u_coeff,u);
+
+  scalar_view z_coeff;
+  getCoeffFromControlVector(z_coeff,z);
+
+  pde->printData(tag.str(),u_coeff,z_coeff,z_param);
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::printCellAveragesPDE(const ROL::Ptr<PDE_type> &pde,
+                                           const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                           const ROL::Ptr<const Tpetra::MultiVector<>> &z,
+                                           const ROL::Ptr<const std::vector<Real>> &z_param) const {
+  std::stringstream tag;
+  if (numProcs_==1) tag << "";
+  else              tag << "_" << myRank_ << "_" << numProcs_;
+
+  scalar_view u_coeff;
+  getCoeffFromStateVector(u_coeff,u);
+
+  scalar_view z_coeff;
+  getCoeffFromControlVector(z_coeff,z);
+
+  // Print to cell id file.
+  std::stringstream nameCellid;
+  nameCellid << "cellid" << tag.str() << ".txt";
+  std::ofstream fileCellid;
+  fileCellid.open(nameCellid.str());
+  for (int i = 0; i < numCells_; ++i) {
+      fileCellid << myCellIds_[i] << std::endl;
+  }
+  fileCellid.close();
+
+  // Print cell average data.
+  pde->printCellAverages(tag.str(),u_coeff,z_coeff,z_param);
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::serialPrintStateEdgeField(const ROL::Ptr<const Tpetra::MultiVector<>> &u,
+                                                const ROL::Ptr<FieldHelper_type> &fieldHelper,
+                                                const std::string &filename,
+                                                const ROL::Ptr<FE_CURL_type> &fe) const {
+  const int c = fe->curlN().extent_int(0);
+  const int f = fe->curlN().extent_int(1);
+  const int p = 1, d = 3;
+
+  scalar_view u_coeff;
+  getCoeffFromStateVector(u_coeff,u);
+
+  std::vector<scalar_view> U;
+  fieldHelper->splitFieldCoeff(U, u_coeff);
+  int numFields = U.size();
+
+  // Transform cell center to physical
+  scalar_view rx("rx", p,d);
+  scalar_view px("px", c,p,d);
+  fe->mapRefPointsToPhysical(px,rx);
+  // Transform reference values into physical space.
+  scalar_view cellJac("cellJac", c,p,d,d);
+  scalar_view cellJacInv("cellJacInv", c,p,d,d);
+  ROL::Ptr<shards::CellTopology> cellTopo
+    = ROL::makePtr<shards::CellTopology>(basisPtrs1_[0]->getBaseCellTopology());
+  scalar_view valReference("valReference", f,p,d);
+  basisPtrs1_[0]->getValues(valReference,rx,Intrepid2::OPERATOR_VALUE);
+  scalar_view valPhysical("valPhysical",c,f,p,d);
+  ct::setJacobian(cellJac,rx,volCellNodes_,*cellTopo);
+  ct::setJacobianInv(cellJacInv, cellJac);
+  fst::HCURLtransformVALUE(valPhysical, cellJacInv, valReference);
+
+  std::vector<scalar_view> uval(numFields);
+  for (int k = 0; k < numFields; ++k) {
+    uval[k] = scalar_view("uvalk", c,p,d);
+    fst::evaluate(uval[k], U[k], valPhysical);
+  }
+  // Print
+  std::ofstream file;
+  file.open(filename);
+  for (int i = 0; i < c; ++i) {
+    file << std::scientific << std::setprecision(15);
+    for (int j = 0; j < d; ++j) {
+      file << std::setw(25) << px(i,0,j);
+    }
+    for (int k = 0; k < numFields; ++k) {
+      for (int j = 0; j < d; ++j) {
+        file << std::setw(25) << uval[k](i,0,j);
+      }
+    }
+    file << std::endl;
+  }
+  file.close();
+}
+/***************************************************************************/
+/* End of output routines.                                                 */
+/***************************************************************************/
+
+/***************************************************************************/
+/* Vector generation routines.                                             */
+/***************************************************************************/
+template<class Real, class DeviceType>
+const ROL::Ptr<const Tpetra::Map<>> Assembler<Real,DeviceType>::getStateMap(void) const {
+  return myUniqueStateMap_;
+}
+
+template<class Real, class DeviceType>
+const ROL::Ptr<const Tpetra::Map<>> Assembler<Real,DeviceType>::getControlMap(void) const {
+  return myUniqueControlMap_;
+}
+
+template<class Real, class DeviceType>
+const ROL::Ptr<const Tpetra::Map<>> Assembler<Real,DeviceType>::getResidualMap(void) const {
+  return myUniqueResidualMap_;
+}
+
+template<class Real, class DeviceType>
+ROL::Ptr<Tpetra::MultiVector<>> Assembler<Real,DeviceType>::createStateVector(void) const {
+  return ROL::makePtr<Tpetra::MultiVector<>>(myUniqueStateMap_, 1, true);
+}
+
+template<class Real, class DeviceType>
+ROL::Ptr<Tpetra::MultiVector<>> Assembler<Real,DeviceType>::createControlVector(void) const {
+  return ROL::makePtr<Tpetra::MultiVector<>>(myUniqueControlMap_, 1, true);
+}
+
+template<class Real, class DeviceType>
+ROL::Ptr<Tpetra::MultiVector<>> Assembler<Real,DeviceType>::createResidualVector(void) const {
+  return ROL::makePtr<Tpetra::MultiVector<>>(myUniqueResidualMap_, 1, true);
+}
+/***************************************************************************/
+/* End of vector generation routines.                                      */
+/***************************************************************************/
+
+/***************************************************************************/
+/* Accessor routines.                                                      */
+/***************************************************************************/
+template<class Real, class DeviceType>
+const ROL::Ptr<DofManager<Real,DeviceType>> Assembler<Real,DeviceType>::getDofManager(void) const {
+  return dofMgr1_;
+}
+template<class Real, class DeviceType>
+const ROL::Ptr<DofManager<Real,DeviceType>> Assembler<Real,DeviceType>::getDofManager2(void) const {
+  return dofMgr2_;
+}
+
+template<class Real, class DeviceType>
+Teuchos::Array<typename Tpetra::Map<>::global_ordinal_type> Assembler<Real,DeviceType>::getCellIds(void) const {
+  return myCellIds_;
+}
+/***************************************************************************/
+/* End of accessor routines.                                               */
+/***************************************************************************/
+
+/*****************************************************************************/
+/******************* PRIVATE MEMBER FUNCTIONS ********************************/
+/*****************************************************************************/
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::setCommunicator(const ROL::Ptr<const Teuchos::Comm<int>> &comm,
+                                      Teuchos::ParameterList &parlist,
+                                      std::ostream &outStream) {
+  comm_ = comm;
+  // Get number of processors and my rank
+  myRank_    = comm->getRank();
+  numProcs_  = comm->getSize();
+  // Parse parameter list
+  verbose_ = parlist.sublist("PDE FEM").get("Verbose Output",false);
+  if (verbose_ && myRank_==0 ) {
+    outStream << "Initialized communicator. " << std::endl;
+  }
+  if (verbose_ && myRank_==0 ) {
+    outStream << "Total number of processors: " << numProcs_ << std::endl;
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::setBasis(
+       const std::vector<basis_ptr> &basisPtrs1,
+       const std::vector<basis_ptr> &basisPtrs2,
+       Teuchos::ParameterList &parlist,
+       std::ostream &outStream) {
+  basisPtrs1_ = basisPtrs1;
+  basisPtrs2_ = basisPtrs2;
+  if (verbose_ && myRank_==0) {
+    outStream << "Initialized PDE." << std::endl;
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::setDiscretization(Teuchos::ParameterList &parlist,
+                                  const ROL::Ptr<MeshManager_type> &meshMgr,
+                                        std::ostream &outStream) {
+  if ( meshMgr != ROL::nullPtr ) {
+    // Use MeshManager object if supplied
+    meshMgr_ = meshMgr;
+  }
+  else {
+    // Otherwise construct MeshManager objective from parameter list
+  }
+  dofMgr1_ = ROL::makePtr<DofManager_type>(meshMgr_,basisPtrs1_);
+  dofMgr2_ = ROL::makePtr<DofManager_type>(meshMgr_,basisPtrs2_);
+  if (verbose_ && myRank_==0) {
+    outStream << "Initialized discretization (MeshManager and DofManager)." << std::endl;
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::setParallelStructure(Teuchos::ParameterList &parlist,
+                                           std::ostream &outStream) {
+  int cellSplit = parlist.sublist("Geometry").get<int>("Partition type");
+  /****************************************************/
+  /*** Build parallel communication infrastructure. ***/
+  /****************************************************/
+  // Partition the cells in the mesh.  We use a basic quasi-equinumerous partitioning,
+  // where the remainder, if any, is assigned to the last processor.
+  Teuchos::Array<GO> myGlobalIds1, myGlobalIds2;
+  cellOffsets_.assign(numProcs_, 0);
+  int totalNumCells = meshMgr_->getNumCells();
+  int cellsPerProc  = totalNumCells / numProcs_;
+  numCells_         = cellsPerProc;
+  ROL::Ptr<std::vector<std::vector<int>>> procCellIds = meshMgr_->getProcCellIds();
+  switch(cellSplit) {
+    case 0:
+      if (myRank_ == 0) {
+        // remainder in the first
+        numCells_ += totalNumCells % numProcs_;
+      }
+      for (int i=1; i<numProcs_; ++i) {
+        cellOffsets_[i] = cellOffsets_[i-1] + cellsPerProc
+                          + (static_cast<int>(i==1))*(totalNumCells % numProcs_);
+      }
+      break;
+    case 1:
+      if (myRank_ == numProcs_-1) {
+        // remainder in the last
+        numCells_ += totalNumCells % numProcs_;
+      }
+      for (int i=1; i<numProcs_; ++i) {
+        cellOffsets_[i] = cellOffsets_[i-1] + cellsPerProc;
+      }
+      break;
+    case 2:
+      if (myRank_ < (totalNumCells%numProcs_)) {
+        // spread remainder, starting from the first
+        numCells_++;
+      }
+      for (int i=1; i<numProcs_; ++i) {
+        cellOffsets_[i] = cellOffsets_[i-1] + cellsPerProc
+                          + (static_cast<int>(i-1<(totalNumCells%numProcs_)));
+      }
+      break;
+    case 3:
+      // Define using MeshManager.
+      numCells_ = (*procCellIds)[myRank_].size();
+      break;
+  }
+
+  int_view cellDofs1 = dofMgr1_->getCellDofs();
+  int_view cellDofs2 = dofMgr2_->getCellDofs();
+  int numLocalDofs1 = cellDofs1.extent_int(1);
+  int numLocalDofs2 = cellDofs2.extent_int(1);
+  if (verbose_) {
+    outStream << "Cell offsets across processors: " << cellOffsets_
+              << std::endl;
+  }
+  switch(cellSplit) {
+    case 0:
+    case 1:
+    case 2:
+    for (int i=0; i<numCells_; ++i) {
+      myCellIds_.push_back(cellOffsets_[myRank_]+i);
+      for (int j=0; j<numLocalDofs1; ++j) {
+        myGlobalIds1.push_back( cellDofs1(cellOffsets_[myRank_]+i,j) );
+      }
+      for (int j=0; j<numLocalDofs2; ++j) {
+        myGlobalIds2.push_back( cellDofs2(cellOffsets_[myRank_]+i,j) );
+      }
+    }
+      break;
+    case 3:
+    for (int i=0; i<numCells_; ++i) {
+      myCellIds_.push_back((*procCellIds)[myRank_][i]);
+      for (int j=0; j<numLocalDofs1; ++j) {
+        myGlobalIds1.push_back( cellDofs1((*procCellIds)[myRank_][i],j) );
+      }
+      for (int j=0; j<numLocalDofs2; ++j) {
+        myGlobalIds2.push_back( cellDofs2((*procCellIds)[myRank_][i],j) );
+      }
+    }
+      break;
+  }
+  std::sort(myGlobalIds1.begin(), myGlobalIds1.end());
+  myGlobalIds1.erase( std::unique(myGlobalIds1.begin(),myGlobalIds1.end()),myGlobalIds1.end() );
+  std::sort(myGlobalIds2.begin(), myGlobalIds2.end());
+  myGlobalIds2.erase( std::unique(myGlobalIds2.begin(),myGlobalIds2.end()),myGlobalIds2.end() );
+
+  // Build maps.
+  myOverlapStateMap_    = ROL::makePtr<Tpetra::Map<>>
+    (Teuchos::OrdinalTraits<GO>::invalid(), myGlobalIds1, GO(0), comm_);
+  //std::cout << std::endl << myOverlapMap_->getLocalElementList()<<std::endl;
+  /** One can also use the non-member function:
+      myOverlapMap_ = Tpetra::createNonContigMap<int,int>(myGlobalIds_, comm_);
+      to build the overlap map.
+  **/
+  myUniqueStateMap_     = Tpetra::createOneToOne(myOverlapStateMap_);
+  myOverlapResidualMap_ = myOverlapStateMap_;
+  myUniqueResidualMap_  = myUniqueStateMap_;
+  myOverlapControlMap_  = ROL::makePtr<Tpetra::Map<>>
+     (Teuchos::OrdinalTraits<GO>::invalid(), myGlobalIds2, GO(0), comm_);
+  myUniqueControlMap_   = Tpetra::createOneToOne(myOverlapControlMap_);;
+  //std::cout << std::endl << myUniqueMap_->getLocalElementList() << std::endl;
+  //  myCellMap_ = ROL::makePtr<Tpetra::Map<>>(
+  //               Teuchos::OrdinalTraits<Tpetra::global_size_t>::invalid(),
+  //               myCellIds_, 0, comm_);
+
+  /****************************************/
+  /*** Assemble global graph structure. ***/
+  /****************************************/
+
+  // Estimate the max number of entries per row 
+  // using a map (row indicies can be non-contiguous)
+  GO maxEntriesPerRow1(0), maxEntriesPerRow2(0);
+  {
+    std::map<GO,GO> numEntriesCount1, numEntriesCount2;
+    for (int i=0; i<numCells_; ++i) { 
+      for (int j=0; j<numLocalDofs1; ++j) {
+        numEntriesCount1[GO(cellDofs1(myCellIds_[i],j))] += numLocalDofs1;
+      }
+      for (int j=0; j<numLocalDofs2; ++j) {
+        numEntriesCount2[GO(cellDofs2(myCellIds_[i],j))] += numLocalDofs2;
+      }
+    }
+    const auto rowIndexWithMaxEntries1 
+      = std::max_element(std::begin(numEntriesCount1), std::end(numEntriesCount1), 
+                         [](const std::pair<GO,GO> &pa, const std::pair<GO,GO> &pb) {
+                           return pa.second < pb.second;
+                         });
+    const auto rowIndexWithMaxEntries2
+      = std::max_element(std::begin(numEntriesCount2), std::end(numEntriesCount2), 
+                         [](const std::pair<GO,GO> &pa, const std::pair<GO,GO> &pb) {
+                           return pa.second < pb.second;
+                         });
+    if (!numEntriesCount1.empty())
+      maxEntriesPerRow1 = rowIndexWithMaxEntries1->second;
+    if (!numEntriesCount2.empty())
+      maxEntriesPerRow2 = rowIndexWithMaxEntries2->second;
+  }
+  
+  Teuchos::ArrayRCP<GO> cellDofs1GO(numLocalDofs1, GO());
+  matJ1Graph_ = ROL::makePtr<Tpetra::CrsGraph<>>(myUniqueStateMap_, maxEntriesPerRow1);
+  for (int i=0; i<numCells_; ++i) {
+    for (int j=0; j<numLocalDofs1; ++j)
+      cellDofs1GO[j] = cellDofs1(myCellIds_[i],j);
+    for (int j=0; j<numLocalDofs1; ++j) {
+      matJ1Graph_->insertGlobalIndices(cellDofs1GO[j],cellDofs1GO());
+    }
+  }
+  matJ1Graph_->fillComplete(myUniqueStateMap_,myUniqueStateMap_);
+  matR1Graph_  = matJ1Graph_;
+  matH11Graph_ = matJ1Graph_;
+
+  Teuchos::ArrayRCP<GO> cellDofs2GO(numLocalDofs2, GO());
+  matJ2Graph_ = ROL::makePtr<Tpetra::CrsGraph<>>(myUniqueStateMap_, std::max(maxEntriesPerRow1,maxEntriesPerRow2));
+  for (int i=0; i<numCells_; ++i) {
+    for (int j=0; j<numLocalDofs2; ++j)
+      cellDofs2GO[j] = cellDofs2(myCellIds_[i],j);
+    for (int j=0; j<numLocalDofs1; ++j) {
+      matJ2Graph_->insertGlobalIndices(GO(cellDofs1(myCellIds_[i],j)),cellDofs2GO());
+    }
+  }
+  matJ2Graph_->fillComplete(myUniqueControlMap_,myUniqueStateMap_);
+  matH21Graph_ = matJ2Graph_;
+
+  matH12Graph_ = ROL::makePtr<Tpetra::CrsGraph<>>(myUniqueControlMap_, maxEntriesPerRow1);
+  for (int i=0; i<numCells_; ++i) {
+    for (int j=0; j<numLocalDofs1; ++j)
+      cellDofs1GO[j] = cellDofs1(myCellIds_[i],j);
+    for (int j=0; j<numLocalDofs2; ++j) {
+      matH12Graph_->insertGlobalIndices(GO(cellDofs2(myCellIds_[i],j)), cellDofs1GO());
+    }
+  }
+  matH12Graph_->fillComplete(myUniqueStateMap_,myUniqueControlMap_);
+
+  matR2Graph_ = ROL::makePtr<Tpetra::CrsGraph<>>(myUniqueControlMap_, maxEntriesPerRow2);
+  for (int i=0; i<numCells_; ++i) {
+    for (int j=0; j<numLocalDofs2; ++j)
+      cellDofs2GO[j] = cellDofs2(myCellIds_[i],j);
+    for (int j=0; j<numLocalDofs2; ++j) {
+      matR2Graph_->insertGlobalIndices(cellDofs2GO[j], cellDofs2GO());
+    }
+  }
+  matR2Graph_->fillComplete(myUniqueControlMap_,myUniqueControlMap_);
+  matH22Graph_ = matR2Graph_;
+
+  if (verbose_ && myRank_==0) {
+    outStream << "Initialized parallel structures." << std::endl;
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::setCellNodes(std::ostream &outStream) {
+  // Build volume cell nodes
+  shards::CellTopology cellType = basisPtrs1_[0]->getBaseCellTopology();
+  int spaceDim = cellType.getDimension();
+  int numNodesPerCell = cellType.getNodeCount();
+  volCellNodes_ = scalar_view("volCellNodes_", numCells_, numNodesPerCell, spaceDim);
+  scalar_view nodes = meshMgr_->getNodes();
+  int_view ctn   = meshMgr_->getCellToNodeMap();
+  for (int i=0; i<numCells_; ++i) {
+    for (int j=0; j<numNodesPerCell; ++j) {
+      for (int k=0; k<spaceDim; ++k) {
+        volCellNodes_(i, j, k) = nodes(ctn(myCellIds_[i],j), k);
+      }
+    }
+  }
+  // Build boundary cell nodes
+  bdryCellIds_    = meshMgr_->getSideSets((verbose_ && myRank_==0), outStream);
+  int numSideSets = bdryCellIds_->size();
+  if (numSideSets > 0) {
+    bdryCellNodes_.resize(numSideSets);
+    bdryCellLocIds_.resize(numSideSets);
+    for (int i=0; i<numSideSets; ++i) {
+      int numLocSides = (*bdryCellIds_)[i].size();
+      bdryCellNodes_[i].resize(numLocSides);
+      bdryCellLocIds_[i].resize(numLocSides);
+      for (int j=0; j<numLocSides; ++j) {
+        if ((*bdryCellIds_)[i][j].size() > 0) {
+          int numCellsSide = (*bdryCellIds_)[i][j].size();
+          for (int k=0; k<numCellsSide; ++k) {
+            int idx = mapGlobalToLocalCellId((*bdryCellIds_)[i][j][k]);
+            if (idx > -1) {
+              bdryCellLocIds_[i][j].push_back(idx);
+              //if (myRank_==1) {std::cout << "\nrank " << myRank_ << "   bcid " << i << "  " << j << "  " << k << "  " << myCellIds_[idx] << "  " << idx;}
+            }
+          }
+          int myNumCellsSide = bdryCellLocIds_[i][j].size();
+          if (myNumCellsSide > 0) {
+            bdryCellNodes_[i][j] = scalar_view("bdryCellNodes_", myNumCellsSide, numNodesPerCell, spaceDim);
+          }
+          for (int k=0; k<myNumCellsSide; ++k) {
+            for (int l=0; l<numNodesPerCell; ++l) {
+              for (int m=0; m<spaceDim; ++m) {
+                bdryCellNodes_[i][j](k, l, m) = nodes(ctn(myCellIds_[bdryCellLocIds_[i][j][k]],l), m);
+              }
+            }
+          }
+        }
+        //if ((myRank_==1) && (myNumCellsSide > 0)) {std::cout << (*bdryCellNodes_[i][j]);}
+      }
+    }
+  }
+  bdryCellNodes_.resize(numSideSets);
+}
+
+template<class Real, class DeviceType>
+int Assembler<Real,DeviceType>::mapGlobalToLocalCellId(const int & gid) {
+  auto it = std::find(myCellIds_.begin(),myCellIds_.end(),gid);
+  if (it != myCellIds_.end()) {
+    return it-myCellIds_.begin();
+  }
+  else {
+    return -1;
+  }
+/*
+  int minId = cellOffsets_[myRank_];
+  int maxId = cellOffsets_[myRank_]+numCells_-1;
+  if ((gid >= minId) && (gid <= maxId)) {
+    return (gid - cellOffsets_[myRank_]);
+  }
+  else {
+    return -1;
+  }
+*/
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::getCoeffFromStateVector(scalar_view &xcoeff,
+                                        const ROL::Ptr<const Tpetra::MultiVector<>> &x) const {
+  if ( x != ROL::nullPtr ) {
+    // Perform import onto myOverlapMap
+    ROL::Ptr<Tpetra::MultiVector<>> xshared =
+      ROL::makePtr<Tpetra::MultiVector<>>(myOverlapStateMap_, 1, true);
+    Tpetra::Import<> importer(myUniqueStateMap_, myOverlapStateMap_);
+    xshared->doImport(*x,importer,Tpetra::REPLACE);
+    // Populate xcoeff
+    int_view cellDofs = dofMgr1_->getCellDofs();
+    int lfs = dofMgr1_->getLocalFieldSize();
+    xcoeff = scalar_view("xcoeff", numCells_, lfs);
+    Teuchos::ArrayRCP<const Real> xdata = xshared->get1dView();
+    for (int i=0; i<numCells_; ++i) {
+      for (int j=0; j<lfs; ++j) {
+        xcoeff(i,j) = xdata[xshared->getMap()->getLocalElement(cellDofs(myCellIds_[i],j))];
+      }
+    }
+    dofMgr1_->transformToIntrepidPattern(xcoeff);
+  }
+  else {
+    xcoeff = scalar_view();
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::getCoeffFromControlVector(scalar_view &xcoeff,
+                                          const ROL::Ptr<const Tpetra::MultiVector<>> &x) const {
+  if ( x != ROL::nullPtr ) {
+    // Perform import onto myOverlapMap
+    ROL::Ptr<Tpetra::MultiVector<>> xshared =
+      ROL::makePtr<Tpetra::MultiVector<>>(myOverlapControlMap_, 1, true);
+    Tpetra::Import<> importer(myUniqueControlMap_, myOverlapControlMap_);
+    xshared->doImport(*x,importer,Tpetra::REPLACE);
+    // Populate xcoeff
+    int_view cellDofs = dofMgr2_->getCellDofs();
+    int lfs = dofMgr2_->getLocalFieldSize();
+    xcoeff = scalar_view("xcoeff", numCells_, lfs);
+    Teuchos::ArrayRCP<const Real> xdata = xshared->get1dView();
+    for (int i=0; i<numCells_; ++i) {
+      for (int j=0; j<lfs; ++j) {
+        xcoeff(i,j) = xdata[xshared->getMap()->getLocalElement(cellDofs(myCellIds_[i],j))];
+      }
+    }
+    dofMgr2_->transformToIntrepidPattern(xcoeff);
+  }
+  else {
+    xcoeff = scalar_view();
+  }
+}
+
+/***************************************************************************/
+/****** GENERIC ASSEMBLY ROUTINES ******************************************/
+/***************************************************************************/
+template<class Real, class DeviceType>
+Real Assembler<Real,DeviceType>::assembleScalar(const scalar_view val) {
+  // Assembly
+  if ( val.is_allocated() ) {
+    Real myval(0), gval(0);
+    for (int i=0; i<numCells_; ++i) {
+      myval += val(i);
+    }
+    Teuchos::reduceAll<int,Real>(*comm_,Teuchos::REDUCE_SUM,1,&myval,&gval);
+    return gval;
+  }
+  return static_cast<Real>(0);
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assembleFieldVector(ROL::Ptr<Tpetra::MultiVector<>> &v,
+                                          scalar_view val,
+                                          ROL::Ptr<Tpetra::MultiVector<>> &vecOverlap,
+                                          const ROL::Ptr<DofManager_type> &dofMgr) {
+  // Set residual vectors to zero
+  v->putScalar(static_cast<Real>(0));
+  vecOverlap->putScalar(static_cast<Real>(0));
+  // Get degrees of freedom
+  int_view cellDofs = dofMgr->getCellDofs();
+  int numLocalDofs = cellDofs.extent_int(1);
+  // Transform values
+  transformToFieldPattern(val,dofMgr);
+  // assembly on the overlap map
+  for (int i=0; i<numCells_; ++i) {
+    for (int j=0; j<numLocalDofs; ++j) {
+      vecOverlap->sumIntoGlobalValue(cellDofs(myCellIds_[i],j),0,val(i,j));
+    }
+  }
+  // change map
+  Tpetra::Export<> exporter(vecOverlap->getMap(), v->getMap()); // redistribution
+  v->doExport(*vecOverlap, exporter, Tpetra::ADD);              // from the overlap map to the unique map
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assembleParamVector(ROL::Ptr<std::vector<Real>> &v,
+                                          std::vector<scalar_view> &val) {
+  int size = v->size();
+  v->assign(size,0);
+//  for (int i = 0; i < size; ++i) {
+//    dofMgr_->transformToFieldPattern(val[i]);
+//  }
+  // Assembly
+  std::vector<Real> myVal(size,0);
+  for (int j = 0; j < size; ++j) {
+    if ( val[j].is_allocated() ) {
+      for (int i=0; i<numCells_; ++i) {
+        myVal[j] += (val[j])(i);
+      }
+    }
+  }
+  Teuchos::reduceAll<int,Real>(*comm_,Teuchos::REDUCE_SUM,size,&myVal[0],&(*v)[0]);
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assembleFieldMatrix(ROL::Ptr<Tpetra::CrsMatrix<>> &M,
+                                          scalar_view val,
+                                          const ROL::Ptr<DofManager_type> &dofMgr1,
+                                          const ROL::Ptr<DofManager_type> &dofMgr2) {
+  // Transform data
+  transformToFieldPattern(val,dofMgr1,dofMgr2);
+  // Zero PDE Jacobian
+  M->resumeFill(); M->setAllToScalar(static_cast<Real>(0));
+  // Assemble PDE Jacobian
+  int_view cellDofs1 = dofMgr1->getCellDofs();
+  int_view cellDofs2 = dofMgr2->getCellDofs();
+  int numLocalDofs1 = cellDofs1.extent_int(1);
+  int numLocalDofs2 = cellDofs2.extent_int(1);
+  Teuchos::ArrayRCP<GO> cellDofs2GO(numLocalDofs2);
+  Teuchos::ArrayRCP<Real> values(numLocalDofs2);
+  for (int i=0; i<numCells_; ++i) {
+    for (int j=0; j<numLocalDofs2; ++j)
+      cellDofs2GO[j] = cellDofs2(myCellIds_[i],j);
+    for (int j=0; j<numLocalDofs1; ++j) {
+      for (int k=0; k<numLocalDofs2; ++k)
+        values[k] = val(i,j,k);
+      M->sumIntoGlobalValues(GO(cellDofs1(myCellIds_[i],j)), cellDofs2GO(), values());
+    }
+  }
+  M->fillComplete();
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assembleParamFieldMatrix(ROL::Ptr<Tpetra::MultiVector<>> &M,
+                                               std::vector<scalar_view> &val,
+                                               ROL::Ptr<Tpetra::MultiVector<>> &matOverlap,
+                                               const ROL::Ptr<DofManager_type> &dofMgr) {
+  // Initialize res
+  int size = M->getNumVectors();
+  // Compute PDE local Jacobian wrt parametric controls
+  for (int i = 0; i < size; ++i) {
+    transformToFieldPattern(val[i],dofMgr);
+  }
+  // Assemble PDE Jacobian wrt parametric controls
+  M->scale(static_cast<Real>(0));
+  matOverlap->scale(static_cast<Real>(0));
+  for (int k = 0; k < size; ++k) {
+    int_view cellDofs = dofMgr->getCellDofs();
+    int numLocalDofs = cellDofs.extent_int(1);
+    // assembly on the overlap map
+    for (int i=0; i<numCells_; ++i) {
+      for (int j=0; j<numLocalDofs; ++j) {
+        matOverlap->sumIntoGlobalValue(cellDofs(myCellIds_[i],j),
+                                        k,(val[k])[i*numLocalDofs+j]);
+      }
+    }
+    // change map
+    Tpetra::Export<> exporter(matOverlap->getMap(), M->getMap()); // redistribution
+    M->doExport(*matOverlap, exporter, Tpetra::ADD);              // from the overlap map to the unique map
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::assembleParamMatrix(ROL::Ptr<std::vector<std::vector<Real>>> &M,
+                                          std::vector<std::vector<scalar_view>> &val,
+                                          const ROL::Ptr<DofManager_type> &dofMgr) {
+  // Initialize local matrix
+  int size = M->size();
+  std::vector<scalar_view> tmp(size);
+  // Compute local matrix
+  //for (int i = 0; i < size; ++i) {
+  //  for (int j = 0; j < size; ++j) {
+  //    transformToFieldPattern(val[i][j],dofMgr);
+  //  }
+  //}
+  // Assemble PDE Jacobian wrt parametric controls
+  int cnt = 0, matSize = static_cast<int>(0.5*static_cast<Real>((size+1)*size));
+  std::vector<Real> myMat(matSize,static_cast<Real>(0));
+  std::vector<Real> globMat(matSize,static_cast<Real>(0));
+  for (int k = 0; k < size; ++k) {
+    for (int j = k; j < size; ++j) {
+      for (int i=0; i<numCells_; ++i) {
+        myMat[cnt] += (val[k][j])(i);
+      }
+      cnt++;
+    }
+  }
+  Teuchos::reduceAll<int,Real>(*comm_,Teuchos::REDUCE_SUM,matSize,&myMat[0],&globMat[0]);
+  cnt = 0;
+  for (int k = 0; k < size; ++k) {
+    for (int j = k; j < size; ++j) {
+      (*M)[k][j] += globMat[cnt];
+      if ( j != k ) { 
+        (*M)[j][k] += globMat[cnt];
+      }
+      cnt++;
+    }
+  }
+}
+
+template<class Real, class DeviceType>
+void Assembler<Real,DeviceType>::transformToFieldPattern(scalar_view array,
+                                              const ROL::Ptr<DofManager_type> &dofMgr1,
+                                              const ROL::Ptr<DofManager_type> &dofMgr2) const {
+  if ( array.is_allocated() ) {
+    int rank = array.rank();
+    int nc   = array.extent_int(0);
+    if ( rank == 2 ) {
+      int nf = array.extent_int(1);
+      scalar_view tmp("tmp", nc, nf);
+      for (int c = 0; c < nc; ++c) {
+        for (int f = 0; f < nf; ++f) {
+          tmp(c, dofMgr1->mapToFieldPattern(f)) = array(c, f);
+        }
+      }
+      Kokkos::deep_copy(array, tmp);
+    }
+    else if (rank == 3 ) {
+      int nf1 = array.extent_int(1);
+      int nf2 = array.extent_int(2);
+      scalar_view tmp("tmp", nc, nf1, nf2);
+      for (int c = 0; c < nc; ++c) {
+        for (int f1 = 0; f1 < nf1; ++f1) {
+          for (int f2 = 0; f2 < nf2; ++f2) {
+            tmp(c, dofMgr1->mapToFieldPattern(f1), dofMgr2->mapToFieldPattern(f2)) = array(c, f1, f2);
+          }
+        }
+      }
+      Kokkos::deep_copy(array, tmp);
+    }
+    else {
+      TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument,
+        ">>> PDE-OPT/TOOLS/assembler.hpp (transformToFieldPattern): Input array rank not 2 or 3!");
+    }
+  }
+}
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/TOOLS/batchmanagerK.hpp b/packages/rol/example/PDE-OPT/TOOLS/batchmanagerK.hpp
new file mode 100644
index 000000000000..4d5791b32073
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/TOOLS/batchmanagerK.hpp
@@ -0,0 +1,77 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+#ifndef PDEOPT_BATHCMANAGERK_HPP
+#define PDEOPT_BATHCMANAGERK_HPP
+
+#include "ROL_TeuchosBatchManager.hpp"
+#include "pdevectorK.hpp"
+
+template<class Real,
+         class LO=Tpetra::Map<>::local_ordinal_type,
+         class GO=Tpetra::Map<>::global_ordinal_type,
+         class Node=Tpetra::Map<>::node_type> 
+class PDE_OptVector_BatchManager : public ROL::TeuchosBatchManager<Real,GO> {
+private:
+  typedef Tpetra::MultiVector<Real,LO,GO,Node> FieldVector;
+  typedef std::vector<Real>                    ParamVector;
+  typedef PDE_OptVector<Real,LO,GO,Node>       OptVector;
+
+public:
+  PDE_OptVector_BatchManager(const ROL::Ptr<const Teuchos::Comm<int> > &comm)
+    : ROL::TeuchosBatchManager<Real,GO>(comm) {}
+
+  using ROL::TeuchosBatchManager<Real,GO>::sumAll;
+  void sumAll(ROL::Vector<Real> &input, ROL::Vector<Real> &output) {
+    // Sum all field components across processors
+    ROL::Ptr<ROL::TpetraMultiVector<Real,LO,GO,Node>> input_field_ptr
+      = dynamic_cast<OptVector&>(input).getField();
+    ROL::Ptr<ROL::TpetraMultiVector<Real,LO,GO,Node>> output_field_ptr
+      = dynamic_cast<OptVector&>(output).getField();
+
+    if ( input_field_ptr != ROL::nullPtr ) {
+      ROL::Ptr<FieldVector> input_field  = input_field_ptr->getVector();
+      ROL::Ptr<FieldVector> output_field = output_field_ptr->getVector();
+      size_t input_length  = input_field->getLocalLength();
+      size_t output_length = output_field->getLocalLength();
+      TEUCHOS_TEST_FOR_EXCEPTION(input_length != output_length, std::invalid_argument,
+        ">>> (PDE_OptVector_BatchManager::sumAll): Field dimension mismatch!");
+
+      size_t input_nvec  = input_field->getNumVectors();
+      size_t output_nvec = output_field->getNumVectors();
+      TEUCHOS_TEST_FOR_EXCEPTION(input_nvec != output_nvec, std::invalid_argument,
+        ">>> (PDE_OptVector_BatchManager::sumAll): Field dimension mismatch!");
+
+      for (size_t i = 0; i < input_nvec; ++i)
+        ROL::TeuchosBatchManager<Real,GO>::sumAll((input_field->getDataNonConst(i)).getRawPtr(),
+                                                  (output_field->getDataNonConst(i)).getRawPtr(),
+                                                  input_length);
+    }
+    // Sum all parameter components across processors
+    ROL::Ptr<ROL::StdVector<Real>> input_param_ptr
+      = dynamic_cast<OptVector&>(input).getParameter();
+    ROL::Ptr<ROL::StdVector<Real>> output_param_ptr
+      = dynamic_cast<OptVector&>(output).getParameter();
+
+    if ( input_param_ptr != ROL::nullPtr ) {
+      ROL::Ptr<ParamVector> input_param  = input_param_ptr->getVector();
+      ROL::Ptr<ParamVector> output_param = output_param_ptr->getVector();
+      size_t input_size  = static_cast<size_t>(input_param->size());
+      size_t output_size = static_cast<size_t>(output_param->size());
+      TEUCHOS_TEST_FOR_EXCEPTION(input_size != output_size, std::invalid_argument,
+        ">>> (PDE_OptVector_BatchManager::SumAll): Parameter dimension mismatch!");
+
+      ROL::TeuchosBatchManager<Real,GO>::sumAll(&input_param->front(),
+                                                &output_param->front(),
+                                                input_size);
+    }
+  }
+};
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/TOOLS/dofmanagerK.hpp b/packages/rol/example/PDE-OPT/TOOLS/dofmanagerK.hpp
new file mode 100644
index 000000000000..7356f77afe35
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/TOOLS/dofmanagerK.hpp
@@ -0,0 +1,596 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  dofmanager.hpp
+    \brief Given a mesh with connectivity information, sets up data
+           structures for the indexing of the global degrees of
+           freedom (Dofs).
+*/
+
+#ifndef DOFMANAGERK_HPP
+#define DOFMANAGERK_HPP
+
+#include "Teuchos_ParameterList.hpp"
+#include "Intrepid2_Basis.hpp"
+#include "meshmanagerK.hpp"
+
+template <class Real, class DeviceType>
+class DofManager {
+
+public:
+  using scalar_view = Kokkos::DynRankView<Real,DeviceType>;
+  using int_view = Kokkos::DynRankView<int,DeviceType>;
+  using basis_ptr = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+
+private:
+  ROL::Ptr<MeshManager<Real,DeviceType> > meshManager_;
+  std::vector<basis_ptr> intrepidBasis_;
+
+  int cellDim_;          // cell dimension
+
+  int numCells_;         // number of mesh cells
+  int numNodes_;         // number of mesh nodes
+  int numEdges_;         // number of mesh edges
+  int numFaces_;         // number of mesh faces
+
+  int_view meshCellToNodeMap_;
+  int_view meshCellToEdgeMap_;
+  int_view meshCellToFaceMap_;
+
+  // Local dof information.
+  int numBases_;                                  // number of bases (fields)
+  int numLocalNodes_;                             // number of nodes in the basic cell topology
+  int numLocalEdges_;                             // number of edges in the basic cell topology
+  int numLocalFaces_;                             // number of faces in the basic cell topology
+  int numLocalVoids_;                             // number of voids in the basic cell topology, =1
+  int numLocalNodeDofs_;                          // number of local (single-cell) node dofs for all bases
+  int numLocalEdgeDofs_;                          // number of local (single-cell) edge dofs for all bases
+  int numLocalFaceDofs_;                          // number of local (single-cell) face dofs for all bases
+  int numLocalVoidDofs_;                          // number of local (single-cell) face dofs for all bases
+  int numLocalDofs_;                              // total number of local (single-cell) face dofs for all bases
+  std::vector<int> numDofsPerNode_;               // number of dofs per node in a cell (assumed constant), per basis
+  std::vector<int> numDofsPerEdge_;               // number of dofs per edge in a cell (assumed constant), per basis
+  std::vector<int> numDofsPerFace_;               // number of dofs per face in a cell (assumed constant), per basis
+  std::vector<int> numDofsPerVoid_;               // number of dofs per void in a cell (assumed constant), per basis
+  std::vector<std::vector<int> > fieldPattern_;   // local indexing of fields into the array [0,1,...,numLocalDofs-1];
+                                                  // contains [number of bases] index vectors, where each index vector
+                                                  // is of size [basis cardinality]
+  // Global dof information.
+  int numNodeDofs_;  // number of global node dofs
+  int numEdgeDofs_;  // number of global edge dofs
+  int numFaceDofs_;  // number of global face dofs
+  int numVoidDofs_;  // number of global void dofs
+  int numDofs_;      // total number of global dofs
+
+  int_view nodeDofs_;  // global node dofs, of size [numNodes_ x numLocalNodeDofs_]
+  int_view edgeDofs_;  // global edge dofs, of size [numEdges_ x numLocalEdgeDofs_]
+  int_view faceDofs_;  // global face dofs, of size [numFaces_ x numLocalFaceDofs_]
+  int_view voidDofs_;  // global face dofs, of size [numFaces_ x numLocalFaceDofs_]
+  int_view cellDofs_;  // global cell dofs, of size [numCells_ x numLocalDofs_];
+                                                           // ordered by subcell (node, then edge, then face) and basis index
+  std::vector<int> mapToIntrepidPattern_;
+  std::vector<int> mapToFieldPattern_;
+  std::vector<int_view> fieldDofs_;  // global cell dofs, indexed by field/basis, of size [numCells_ x basis cardinality];
+                                                                          // ordered by subcell (node, then edge, then face)
+
+public:
+
+  DofManager(ROL::Ptr<MeshManager<Real,DeviceType> > &meshManager,
+             std::vector<basis_ptr> intrepidBasis) {
+
+    meshManager_ = meshManager;
+    intrepidBasis_ = intrepidBasis;
+    cellDim_  = intrepidBasis_[0]->getBaseCellTopology().getDimension();
+    numCells_ = meshManager_->getNumCells();
+    numNodes_ = meshManager_->getNumNodes();
+    numEdges_ = meshManager_->getNumEdges();
+    numFaces_ = meshManager_->getNumFaces();
+
+    // Mesh data structures.
+    meshCellToNodeMap_ = meshManager_->getCellToNodeMap();
+    meshCellToEdgeMap_ = meshManager_->getCellToEdgeMap();
+    meshCellToFaceMap_ = meshManager_->getCellToFaceMap();
+
+    // Local degree-of-freedom footprint.
+    numBases_ = static_cast<int>(intrepidBasis_.size());
+    numDofsPerNode_.resize(numBases_, 0);
+    numDofsPerEdge_.resize(numBases_, 0);
+    numDofsPerFace_.resize(numBases_, 0);
+    numDofsPerVoid_.resize(numBases_, 0);
+    numLocalDofs_ = 0;
+    for (int i=0; i<numBases_; ++i) {
+      auto dofTags = (intrepidBasis_[i])->getAllDofTags();
+      for (int j=0; j<(intrepidBasis_[i])->getCardinality(); ++j) {
+        if (dofTags(j,0) == 0) {
+          numDofsPerNode_[i] = dofTags(j,3);
+        }
+        else if (dofTags(j,0) == 1) {
+          if (cellDim_ == 1) { // 1D
+            numDofsPerVoid_[i] = dofTags(j,3);
+          }
+          else { // 2D, 3D
+            numDofsPerEdge_[i] = dofTags(j,3);
+          }
+        }
+        else if (dofTags(j,0) == 2) {
+          if (cellDim_ == 2) { // 2D
+            numDofsPerVoid_[i] = dofTags(j,3);
+          }
+          else { // 3D
+            numDofsPerFace_[i] = dofTags(j,3);
+          }
+        }
+        else if (dofTags(j,0) == 3) {
+          numDofsPerVoid_[i] = dofTags(j,3);
+        }
+      }
+      numLocalDofs_ += (intrepidBasis_[i])->getCardinality();
+    }
+    numLocalNodeDofs_ = 0;
+    numLocalEdgeDofs_ = 0;
+    numLocalFaceDofs_ = 0;
+    numLocalVoidDofs_ = 0;
+    for (int i=0; i<numBases_; ++i) {
+      numLocalNodeDofs_ += numDofsPerNode_[i];
+      numLocalEdgeDofs_ += numDofsPerEdge_[i];
+      numLocalFaceDofs_ += numDofsPerFace_[i];
+      numLocalVoidDofs_ += numDofsPerVoid_[i];
+    }
+    numLocalNodes_ = static_cast<int>( (intrepidBasis_[0])->getBaseCellTopology().getVertexCount() );
+    numLocalEdges_ = static_cast<int>( (intrepidBasis_[0])->getBaseCellTopology().getEdgeCount() );
+    numLocalFaces_ = static_cast<int>( (intrepidBasis_[0])->getBaseCellTopology().getFaceCount() );
+    numLocalVoids_ = 1;
+    computeFieldPattern();
+
+    // Global data structures.
+    computeDofArrays();
+    computeCellDofs();
+    computeFieldDofs();
+    computeDofTransforms();
+  }
+
+
+  int_view getNodeDofs() const {
+    return nodeDofs_;
+  }
+
+
+  int_view getEdgeDofs() const {
+    return edgeDofs_;
+  }
+
+
+  int_view getFaceDofs() const {
+    return faceDofs_;
+  }
+
+
+  int_view getVoidDofs() const {
+    return voidDofs_;
+  }
+
+
+  int_view getCellDofs() const {
+    return cellDofs_;
+  }
+
+
+  int_view getFieldDofs() const {
+    return fieldDofs_;
+  }
+
+
+  int_view getFieldDofs(const int & fieldNumber) const {
+    return fieldDofs_[fieldNumber];
+  }
+
+
+  int getNumNodeDofs() const {
+    return numNodeDofs_;
+  }
+
+
+  int getNumEdgeDofs() const {
+    return numEdgeDofs_;
+  }
+
+
+  int getNumFaceDofs() const {
+    return numFaceDofs_;
+  }
+
+
+  int getNumVoidDofs() const {
+    return numVoidDofs_;
+  }
+
+
+  int getNumDofs() const {
+    return numDofs_;
+  }
+
+
+  int getNumFields() const {
+    return numBases_;
+  }
+
+
+  int getLocalFieldSize() const {
+    return numLocalDofs_;
+  }
+
+
+  int getLocalFieldSize(const int & fieldNumber) const {
+    return (intrepidBasis_[fieldNumber])->getCardinality();
+  }
+
+
+  std::vector<std::vector<int> > getFieldPattern() const {
+    return fieldPattern_;
+  }
+
+
+  std::vector<int> getFieldPattern(const int & fieldNumber) const {
+    return fieldPattern_[fieldNumber];
+  }
+
+  void transformToIntrepidPattern(const scalar_view array) const {
+    if ( array.is_allocated() ) {
+      int rank = array.rank();
+      int nc   = array.extent(0);
+      if ( rank == 2 ) {
+        int nf = array.extent(1);
+        scalar_view tmp("tmp2d", nc, nf);
+        for (int c = 0; c < nc; ++c) {
+          for (int f = 0; f < nf; ++f) {
+            tmp(c, mapToIntrepidPattern_[f]) = array(c, f);
+          }
+        }
+        Kokkos::deep_copy(array, tmp);
+      }
+      else if (rank == 3 ) {
+        int nf1 = array.extent_int(1);
+        int nf2 = array.extent_int(2);
+        scalar_view tmp("tmp3d", nc, nf1, nf2);
+        for (int c = 0; c < nc; ++c) {
+          for (int f1 = 0; f1 < nf1; ++f1) {
+            for (int f2 = 0; f2 < nf2; ++f2) {
+              tmp(c, mapToIntrepidPattern_[f1], mapToIntrepidPattern_[f2]) = array(c, f1, f2);
+            }
+          }
+        }
+        Kokkos::deep_copy(array, tmp);
+      }
+      else {
+        TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument,
+          ">>> PDE-OPT/TOOLS/dofmanager.hpp (transformToIntrepidPattern): Input array rank not 2 or 3!");
+      }
+    }
+  }
+
+  int mapToFieldPattern(int f) const {
+    return mapToFieldPattern_[f];
+  }
+
+  void transformToFieldPattern(const scalar_view array) const {
+    if ( array.is_allocated() ) {
+      int rank = array.rank();
+      int nc   = array.extent_int(0);
+      if ( rank == 2 ) {
+        int nf = array.extent_int(1);
+        scalar_view tmp("tmp2d", nc, nf);
+        for (int c = 0; c < nc; ++c) {
+          for (int f = 0; f < nf; ++f) {
+            tmp(c, mapToFieldPattern_[f]) = array(c, f);
+          }
+        }
+        Kokkos::deep_copy(array, tmp);
+      }
+      else if (rank == 3 ) {
+        int nf1 = array->dimension(1);
+        int nf2 = array->dimension(2);
+        scalar_view tmp("tmp3d", nc, nf1, nf2);
+        for (int c = 0; c < nc; ++c) {
+          for (int f1 = 0; f1 < nf1; ++f1) {
+            for (int f2 = 0; f2 < nf2; ++f2) {
+              tmp(c, mapToFieldPattern_[f1], mapToFieldPattern_[f2]) = array(c, f1, f2);
+            }
+          }
+        }
+        Kokkos::deep_copy(array, tmp);
+      }
+      else {
+        TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument,
+          ">>> PDE-OPT/TOOLS/dofmanager.hpp (transformToFieldPattern): Input array rank not 2 or 3!");
+      }
+    }
+  }
+
+private:
+
+  void computeDofArrays() {
+
+    nodeDofs_ = int_view("nodeDofs_", numNodes_, numLocalNodeDofs_);
+    edgeDofs_ = int_view("edgeDofs_", numEdges_, numLocalEdgeDofs_);
+    faceDofs_ = int_view("faceDofs_", numFaces_, numLocalFaceDofs_);
+    voidDofs_ = int_view("voidDofs_", numCells_, numLocalVoidDofs_);
+
+    int dofCt = -1;
+
+    //
+    // This is the independent node id --> edge id --> cell id ordering.
+    // For example, for a Q2-Q2-Q1 basis (Taylor-Hood), we would have:
+    // 
+    //     Q2   Q2   Q1
+    //     -------------- 
+    // n1  1    1    1
+    // n2  1    1    1
+    // ...
+    // e1  1    1    0
+    // e2  1    1    0
+    // ...
+    // c1  1    1    0
+    // c2  1    1    0
+    // ...
+    //
+
+    // count node dofs
+    for (int i=0; i<numNodes_; ++i) {
+      int locNodeCt = -1;
+      for (int j=0; j<numBases_; ++j) {
+        for (int k=0; k<numDofsPerNode_[j]; ++k) {
+          nodeDofs_(i, ++locNodeCt) = ++dofCt;
+        }
+      }
+    }
+    numNodeDofs_ = dofCt+1;
+
+    // count edge dofs
+    for (int i=0; i<numEdges_; ++i) {
+      int locEdgeCt = -1;
+      for (int j=0; j<numBases_; ++j) {
+        for (int k=0; k<numDofsPerEdge_[j]; ++k) {
+          edgeDofs_(i, ++locEdgeCt) = ++dofCt;
+        }
+      }
+    }
+    numEdgeDofs_ = dofCt+1-numNodeDofs_;
+
+    // count face dofs
+    for (int i=0; i<numFaces_; ++i) {
+      int locFaceCt = -1;
+      for (int j=0; j<numBases_; ++j) {
+        for (int k=0; k<numDofsPerFace_[j]; ++k) {
+          faceDofs_(i, ++locFaceCt) = ++dofCt;
+        }
+      }
+    }
+    numFaceDofs_ = dofCt+1-numNodeDofs_-numEdgeDofs_;
+
+    // count void dofs
+    for (int i=0; i<numCells_; ++i) {
+      int locVoidCt = -1;
+      for (int j=0; j<numBases_; ++j) {
+        for (int k=0; k<numDofsPerVoid_[j]; ++k) {
+          voidDofs_(i, ++locVoidCt) = ++dofCt;
+        }
+      }
+    }
+    numVoidDofs_ = dofCt+1-numNodeDofs_-numEdgeDofs_-numFaceDofs_;
+
+    numDofs_ = numNodeDofs_+numEdgeDofs_+numFaceDofs_+numVoidDofs_;
+
+  } // computeDofArrays
+
+
+  void computeCellDofs() {
+
+    cellDofs_ = int_view("cellDofs_", numCells_, numLocalDofs_);
+
+    for (int i=0; i<numCells_; ++i) {
+      int ct = -1;
+      for (int j=0; j<numLocalNodes_; ++j) {
+        for (int k=0; k<numLocalNodeDofs_; ++k) {
+          cellDofs_(i,++ct) = nodeDofs_(meshCellToNodeMap_(i,j), k);
+        }
+      }
+      for (int j=0; j<numLocalEdges_; ++j) {
+        for (int k=0; k<numLocalEdgeDofs_; ++k) {
+          cellDofs_(i,++ct) = edgeDofs_(meshCellToEdgeMap_(i,j), k);
+        }
+      }
+      for (int j=0; j<numLocalFaces_; ++j) {
+        for (int k=0; k<numLocalFaceDofs_; ++k) {
+          cellDofs_(i,++ct) = faceDofs_(meshCellToFaceMap_(i,j), k);
+        }
+      }
+      for (int j=0; j<numLocalVoids_; ++j) {
+        for (int k=0; k<numLocalVoidDofs_; ++k) {
+          cellDofs_(i,++ct) = voidDofs_(i, k);
+        }
+      }
+    }
+  } // computeCellDofs
+
+
+  void computeFieldPattern() {
+
+    fieldPattern_.resize(numBases_);
+
+    int dofCt = -1;
+
+    // count node dofs
+    for (int i=0; i<numLocalNodes_; ++i) {
+      for (int j=0; j<numBases_; ++j) {
+        for (int k=0; k<numDofsPerNode_[j]; ++k) {
+          fieldPattern_[j].push_back(++dofCt);
+        }
+      }
+    }
+
+    // count edge dofs
+    for (int i=0; i<numLocalEdges_; ++i) {
+      for (int j=0; j<numBases_; ++j) {
+        for (int k=0; k<numDofsPerEdge_[j]; ++k) {
+          fieldPattern_[j].push_back(++dofCt);
+        }
+      }
+    }
+
+    // count face dofs
+    for (int i=0; i<numLocalFaces_; ++i) {
+      for (int j=0; j<numBases_; ++j) {
+        for (int k=0; k<numDofsPerFace_[j]; ++k) {
+          fieldPattern_[j].push_back(++dofCt);
+        }
+      }
+    }
+
+    // count void dofs
+    for (int i=0; i<numLocalVoids_; ++i) {
+      for (int j=0; j<numBases_; ++j) {
+        for (int k=0; k<numDofsPerVoid_[j]; ++k) {
+          fieldPattern_[j].push_back(++dofCt);
+        }
+      }
+    }
+
+  } // computeFieldPattern
+
+
+  void computeFieldDofs() {
+
+    fieldDofs_.resize(numBases_);
+
+    for (int fieldNum=0; fieldNum<numBases_; ++fieldNum) { 
+      int basisCard = intrepidBasis_[fieldNum]->getCardinality();
+      int_view fdofs("fieldDofs_", numCells_, basisCard);
+      for (int i=0; i<numCells_; ++i) {
+        for (int j=0; j<basisCard; ++j) {
+          fdofs(i,j) = cellDofs_(i, fieldPattern_[fieldNum][j]);
+        }
+      }
+      fieldDofs_[fieldNum] = fdofs;
+    }
+  } // computeFieldDofs
+
+  void computeDofTransforms(void) {
+    // Build basis maps
+    std::vector<std::vector<int> > map2IP(numBases_);
+    std::vector<std::vector<int> > map2FP(numBases_);
+    shards::CellTopology cellType = intrepidBasis_[0]->getBaseCellTopology();
+    int nv = cellType.getVertexCount();
+    for (int f=0; f<numBases_; ++f) { 
+      int basisDeg = intrepidBasis_[f]->getDegree();
+      if (cellDim_ == 1) {
+        if (basisDeg == 0) {
+          map2IP[f] = {0};
+          map2FP[f] = {0};
+        }
+        else if (basisDeg == 1) {
+          map2IP[f] = {0, 1};
+          map2FP[f] = {0, 1};
+        }
+        else if (basisDeg == 2) {
+          // C2 LINE basis does *not* follow the LINE<3> topology node order
+          map2IP[f] = {0, 2, 1};
+          map2FP[f] = {0, 2, 1};
+        }
+        else {
+          TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument,
+            ">>> PDE-OPT/TOOLS/dofmanager.hpp (ComputeDofTransforms): basisDeg > 2");
+        }
+      }
+      else if (cellDim_ == 2) {
+        if (basisDeg == 0) {
+          map2IP[f] = {0};
+          map2FP[f] = {0};
+        }
+        else if (basisDeg == 1) {
+          map2IP[f].resize(nv);
+          map2FP[f].resize(nv);
+          for (int i = 0; i < nv; ++i) {
+            map2IP[f][i] = i;
+            map2FP[f][i] = i;
+          }
+          //map2IP[f] = {0, 1, 2, 3}; // C1 QUAD
+          //map2FP[f] = {0, 1, 2, 3}; // C1 QUAD
+        }
+        else if (basisDeg == 2) {
+          int nbfs = (nv==3 ? 6 : 9);
+          map2IP[f].resize(nbfs);
+          map2FP[f].resize(nbfs);
+          for (int i = 0; i < nbfs; ++i) {
+            map2IP[f][i] = i;
+            map2FP[f][i] = i;
+          }
+          //map2IP[f] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; // C2 QUAD
+          //map2FP[f] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; // C2 QUAD
+        }
+        else {
+          TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument,
+            ">>> PDE-OPT/TOOLS/dofmanager.hpp (ComputeDofTransforms): basisDeg > 2");
+        }
+      }
+      else if (cellDim_ == 3) {
+        if (basisDeg == 0) {
+          map2IP[f] = {0};
+          map2FP[f] = {0};
+        }
+        else if (basisDeg == 1) {
+          map2IP[f].resize(nv);
+          map2FP[f].resize(nv);
+          for (int i = 0; i < nv; ++i) {
+            map2IP[f][i] = i;
+            map2FP[f][i] = i;
+          }
+          //map2IP[f] = {0, 1, 2, 3, 4, 5, 6, 7}; // C1 HEX
+          //map2FP[f] = {0, 1, 2, 3, 4, 5, 6, 7}; // C1 HEX
+        }
+        else if (basisDeg == 2) {
+          if (nv == 4) { // C2 TET basis follows the TET<10> topology node order
+            int nbfs = 10;
+            map2IP[f].resize(nbfs);
+            map2FP[f].resize(nbfs);
+            for (int i = 0; i < nbfs; ++i) {
+              map2IP[f][i] = i;
+              map2FP[f][i] = i;
+            }
+          }
+          else if (nv == 8) { // C2 HEX basis does *not* follow the HEX<27> topology node order
+            map2IP[f] = {0, 1, 2, 3, 4 ,5, 6, 7,                       // Nodes
+                         8, 9, 10, 11, 16, 17, 18, 19, 12, 13, 14, 15, // Edges
+                         25, 24, 26, 23, 21, 22,                       // Faces
+                         20};                                          // Voids
+            map2FP[f] = {0, 1, 2, 3, 4 ,5, 6, 7,                       // Nodes
+                         8, 9, 10, 11, 16, 17, 18, 19, 12, 13, 14, 15, // Edges
+                         26,                                           // Voids
+                         24, 25, 23, 21, 20, 22};                      // Faces
+          }
+        }
+        else {
+          TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument,
+            ">>> PDE-OPT/TOOLS/dofmanager.hpp (ComputeDofTransforms): basisDeg > 2");
+        }
+      }
+    }
+    // Apply transformation to ind
+    mapToIntrepidPattern_.clear(); mapToIntrepidPattern_.resize(numDofs_);
+    mapToFieldPattern_.clear();    mapToFieldPattern_.resize(numDofs_);
+    for (int i = 0; i < numBases_; ++i) {
+      for (int j = 0; j < static_cast<int>(map2IP[i].size()); ++j) {
+        mapToIntrepidPattern_[fieldPattern_[i][j]] = fieldPattern_[i][map2IP[i][j]];
+        mapToFieldPattern_[fieldPattern_[i][j]]    = fieldPattern_[i][map2FP[i][j]];
+      }
+    }
+  }
+
+}; // DofManager
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/TOOLS/dynpdeK.hpp b/packages/rol/example/PDE-OPT/TOOLS/dynpdeK.hpp
new file mode 100644
index 000000000000..811d29fc772e
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/TOOLS/dynpdeK.hpp
@@ -0,0 +1,270 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  dynpdeK.hpp
+    \brief Provides the interface for local (cell-based) PDE residual computations.
+*/
+
+#ifndef PDEOPT_DYNPDEK_HPP
+#define PDEOPT_DYNPDEK_HPP
+
+#include "ROL_Ptr.hpp"
+#include "ROL_TimeStamp.hpp"
+#include "Intrepid2_Basis.hpp"
+
+template <class Real, class DeviceType>
+class DynamicPDE {
+public:
+  using scalar_view = Kokkos::DynRankView<Real,DeviceType>;
+  using basis_ptr = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+
+  virtual ~DynamicPDE() {}
+
+  virtual void residual(scalar_view & res,
+                        const ROL::TimeStamp<Real> &ts,
+                        const scalar_view uo_coeff,
+                        const scalar_view un_coeff,
+                        const scalar_view z_coeff = scalar_view(),
+                        const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) = 0;
+
+  virtual void Jacobian_uo(scalar_view & jac,
+                           const ROL::TimeStamp<Real> &ts,
+                           const scalar_view uo_coeff,
+                           const scalar_view un_coeff,
+                           const scalar_view z_coeff = scalar_view(),
+                           const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> Jacobian_uo not implemented.");
+  }
+
+  virtual void Jacobian_un(scalar_view & jac,
+                           const ROL::TimeStamp<Real> &ts,
+                           const scalar_view uo_coeff,
+                           const scalar_view un_coeff,
+                           const scalar_view z_coeff = scalar_view(),
+                           const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> Jacobian_un not implemented.");
+  }
+
+  virtual void Jacobian_zf(scalar_view & jac,
+                           const ROL::TimeStamp<Real> &ts,
+                           const scalar_view uo_coeff,
+                           const scalar_view un_coeff,
+                           const scalar_view z_coeff = scalar_view(),
+                           const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> Jacobian_zf not implemented.");
+  }
+
+  virtual void Jacobian_zp(std::vector<scalar_view> & jac,
+                           const ROL::TimeStamp<Real> &ts,
+                           const scalar_view uo_coeff,
+                           const scalar_view un_coeff,
+                           const scalar_view z_coeff = scalar_view(),
+                           const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> Jacobian_3 not implemented.");
+  }
+
+  virtual void Hessian_uo_uo(scalar_view & hess,
+                             const ROL::TimeStamp<Real> &ts,
+                             const scalar_view l_coeff,
+                             const scalar_view uo_coeff,
+                             const scalar_view un_coeff,
+                             const scalar_view z_coeff = scalar_view(),
+                             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> Hessian_uo_uo not implemented.");
+  }
+
+  virtual void Hessian_uo_un(scalar_view & hess,
+                             const ROL::TimeStamp<Real> &ts,
+                             const scalar_view l_coeff,
+                             const scalar_view uo_coeff,
+                             const scalar_view un_coeff,
+                             const scalar_view z_coeff = scalar_view(),
+                             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> Hessian_uo_un not implemented.");
+  }
+
+  virtual void Hessian_uo_zf(scalar_view & hess,
+                             const ROL::TimeStamp<Real> &ts,
+                             const scalar_view l_coeff,
+                             const scalar_view uo_coeff,
+                             const scalar_view un_coeff,
+                             const scalar_view z_coeff = scalar_view(),
+                             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> Hessian_uo_zf not implemented.");
+  }
+
+  virtual void Hessian_uo_zp(std::vector<scalar_view> & hess,
+                             const ROL::TimeStamp<Real> &ts,
+                             const scalar_view l_coeff,
+                             const scalar_view uo_coeff,
+                             const scalar_view un_coeff,
+                             const scalar_view z_coeff = scalar_view(),
+                             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> Hessian_uo_zp not implemented.");
+  }
+
+  virtual void Hessian_un_uo(scalar_view & hess,
+                             const ROL::TimeStamp<Real> &ts,
+                             const scalar_view l_coeff,
+                             const scalar_view uo_coeff,
+                             const scalar_view un_coeff,
+                             const scalar_view z_coeff = scalar_view(),
+                             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> Hessian_un_uo not implemented.");
+  }
+
+  virtual void Hessian_un_un(scalar_view & hess,
+                             const ROL::TimeStamp<Real> &ts,
+                             const scalar_view l_coeff,
+                             const scalar_view uo_coeff,
+                             const scalar_view un_coeff,
+                             const scalar_view z_coeff = scalar_view(),
+                             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> Hessian_un_un not implemented.");
+  }
+
+  virtual void Hessian_un_zf(scalar_view & hess,
+                             const ROL::TimeStamp<Real> &ts,
+                             const scalar_view l_coeff,
+                             const scalar_view uo_coeff,
+                             const scalar_view un_coeff,
+                             const scalar_view z_coeff = scalar_view(),
+                             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> Hessian_un_zf not implemented.");
+  }
+
+  virtual void Hessian_un_zp(std::vector<scalar_view> & hess,
+                             const ROL::TimeStamp<Real> &ts,
+                             const scalar_view l_coeff,
+                             const scalar_view uo_coeff,
+                             const scalar_view un_coeff,
+                             const scalar_view z_coeff = scalar_view(),
+                             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> Hessian_un_zp not implemented.");
+  }
+
+  virtual void Hessian_zf_uo(scalar_view & hess,
+                             const ROL::TimeStamp<Real> &ts,
+                             const scalar_view l_coeff,
+                             const scalar_view uo_coeff,
+                             const scalar_view un_coeff,
+                             const scalar_view z_coeff = scalar_view(),
+                             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> Hessian_zf_uo not implemented.");
+  }
+
+  virtual void Hessian_zf_un(scalar_view & hess,
+                             const ROL::TimeStamp<Real> &ts,
+                             const scalar_view l_coeff,
+                             const scalar_view uo_coeff,
+                             const scalar_view un_coeff,
+                             const scalar_view z_coeff = ROL::nullPtr,
+                             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> Hessian_zf_un not implemented.");
+  }
+
+  virtual void Hessian_zf_zf(scalar_view & hess,
+                             const ROL::TimeStamp<Real> &ts,
+                             const scalar_view l_coeff,
+                             const scalar_view uo_coeff,
+                             const scalar_view un_coeff,
+                             const scalar_view z_coeff = scalar_view(),
+                             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> Hessian_zf_zf not implemented.");
+  }
+
+  virtual void Hessian_zf_zp(std::vector<scalar_view> & hess,
+                             const ROL::TimeStamp<Real> &ts,
+                             const scalar_view l_coeff,
+                             const scalar_view uo_coeff,
+                             const scalar_view un_coeff,
+                             const scalar_view z_coeff = scalar_view(),
+                             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> Hessian_zf_zp not implemented.");
+  }
+
+  virtual void Hessian_zp_uo(std::vector<scalar_view> & hess,
+                             const ROL::TimeStamp<Real> &ts,
+                             const scalar_view l_coeff,
+                             const scalar_view uo_coeff,
+                             const scalar_view un_coeff,
+                             const scalar_view z_coeff = scalar_view(),
+                             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> Hessian_zp_uo not implemented.");
+  }
+
+  virtual void Hessian_zp_un(std::vector<scalar_view> & hess,
+                             const ROL::TimeStamp<Real> &ts,
+                             const scalar_view l_coeff,
+                             const scalar_view uo_coeff,
+                             const scalar_view un_coeff,
+                             const scalar_view z_coeff = scalar_view(),
+                             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> Hessian_zp_un not implemented.");
+  }
+
+  virtual void Hessian_zp_zf(std::vector<scalar_view> & hess,
+                             const ROL::TimeStamp<Real> &ts,
+                             const scalar_view l_coeff,
+                             const scalar_view uo_coeff,
+                             const scalar_view un_coeff,
+                             const scalar_view z_coeff = scalar_view(),
+                             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> Hessian_zp_zf not implemented.");
+  }
+
+  virtual void Hessian_zp_zp(std::vector<std::vector<scalar_view>> & hess,
+                             const ROL::TimeStamp<Real> &ts,
+                             const scalar_view l_coeff,
+                             const scalar_view uo_coeff,
+                             const scalar_view un_coeff,
+                             const scalar_view z_coeff = scalar_view(),
+                             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> Hessian_zp_zp not implemented.");
+  }
+
+  virtual void RieszMap_1(scalar_view &riesz) {
+    throw Exception::NotImplemented(">>> RieszMap_1 not implemented.");
+  }
+
+  virtual void RieszMap_2(scalar_view &riesz) {
+    throw Exception::NotImplemented(">>> RieszMap_2 not implemented.");
+  }
+
+  virtual std::vector<basis_ptr> getFields() = 0;
+  //virtual std::vector<basis_ptr> getFields2() {
+  //  return getFields();
+  //}
+
+  virtual void setCellNodes(const scalar_view &cellNodes,
+                            const std::vector<std::vector<scalar_view>> &bdryCellNodes,
+                            const std::vector<std::vector<std::vector<int>>> &bdryCellLocIds) = 0;
+
+  virtual void setFieldPattern(const std::vector<std::vector<int>> &fieldPattern) {}
+  virtual void setFieldPattern(const std::vector<std::vector<int>> &fieldPattern1,
+                               const std::vector<std::vector<int>> &fieldPattern2) {
+    setFieldPattern(fieldPattern1);
+  }
+
+private:
+  std::vector<Real> param_;
+
+protected:
+  std::vector<Real> getParameter(void) const {
+    return param_;
+  }
+
+public:
+  void setParameter(const std::vector<Real> &param) {
+    param_.assign(param.begin(),param.end());
+  }
+
+}; // DynamicPDE
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/TOOLS/feK.hpp b/packages/rol/example/PDE-OPT/TOOLS/feK.hpp
new file mode 100644
index 000000000000..47d66275abf9
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/TOOLS/feK.hpp
@@ -0,0 +1,660 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  feK.hpp
+    \brief Given a set of cells with geometric node information, sets up
+           data structures used in finite element integration in HGRAD.
+*/
+
+#ifndef PDEOPT_FE_HPP
+#define PDEOPT_FE_HPP
+
+#include "ROL_Ptr.hpp"
+
+#include "Teuchos_ParameterList.hpp"
+#include "Intrepid2_Basis.hpp"
+#include "Intrepid2_Cubature.hpp"
+#include "Intrepid2_CellTools.hpp"
+#include "Intrepid2_FunctionSpaceTools.hpp"
+
+template <class Real, class DeviceType>
+class FE {
+
+public:
+  using scalar_view = Kokkos::DynRankView<Real,DeviceType>;
+  using fst = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct = Intrepid2::CellTools<DeviceType>;
+  using rst = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+
+  const scalar_view cellNodes_;                                 // coordinates of the cell nodes
+  Intrepid2::BasisPtr<DeviceType, Real, Real> basis_;           // Intrepid basis
+  const ROL::Ptr<Intrepid2::Cubature<DeviceType, Real, Real>> cubature_;  // Intrepid cubature (quadrature, integration) rule
+  const int sideId_;                                                                         // local side id for boundary integration
+
+  int c_;    // number of cells in the FE object
+  int f_;    // number of basis functions per cell
+  int p_;    // number of cubature points per cell
+  int d_;    // space dimension of the (parent) cells
+  int sd_;   // space dimension of the side cells
+
+  ROL::Ptr<shards::CellTopology> cellTopo_;   // base (parent) cell topology
+  ROL::Ptr<shards::CellTopology> sideTopo_;   // side (subcell) topology; assumed uniform
+
+  std::vector<std::vector<int> > sideDofs_;       // local dofs on cell sides; 1st index: side number; 2nd index: dof number
+
+  // Containers for local finite element data.
+  scalar_view cubPoints_;             // points of the cubature rule on the reference cell
+  scalar_view cubWeights_;            // weights of the cubature rule on the reference cell
+  scalar_view cubPointsSubcell_;      // cubature points on the side reference cell
+  scalar_view cubWeightsSubcell_;     // cubature weights on the side reference cell
+  scalar_view cellJac_;               // cell Jacobian matrices
+  scalar_view cellJacInv_;            // inverses of cell Jacobians
+  scalar_view cellJacDet_;            // determinants of cell Jacobians
+  scalar_view cellWeightedMeasure_;   // cell measure (Jacobian determinant) multiplied by the cubature weights
+  scalar_view valReference_;          // value of FE basis in reference space
+  scalar_view gradReference_;         // gradient of FE basis in reference space
+  scalar_view valPhysical_;           // value of FE basis in physical space
+  scalar_view gradPhysical_;          // gradient of FE basis in physical space
+  scalar_view gradPhysicalX_;         // x-component of gradient of FE basis in physical space
+  scalar_view gradPhysicalY_;         // y-component of gradient of FE basis in physical space
+  scalar_view gradPhysicalZ_;         // z-component of gradient of FE basis in physical space
+  scalar_view gradPhysicalXWeighted_; // x-component of gradient of FE basis in physical space weighted
+  scalar_view gradPhysicalYWeighted_; // y-component of gradient of FE basis in physical space weighted
+  scalar_view gradPhysicalZWeighted_; // z-component of gradient of FE basis in physical space weighted
+  scalar_view divPhysical_;           // divergence of FE basis in physical space
+  scalar_view valPhysicalWeighted_;   // value of FE basis in physical space multiplied by weighted cell measure
+  scalar_view gradPhysicalWeighted_;  // gradient of FE basis in physical space multiplied by weighted cell measure
+  scalar_view divPhysicalWeighted_;   // divergence of FE basis in physical space multiplied by weighted cell measure
+  scalar_view gradgradMats_;          // cell stiffness matrices
+  scalar_view valvalMats_;            // cell mass matrices
+  scalar_view cubPointsPhysical_;     // cubature points on the physical cells
+  scalar_view dofPoints_;             // degree of freedom points on the reference cell
+
+public:
+
+  FE(const scalar_view                                           cellNodes,
+     const Intrepid2::BasisPtr<DeviceType, Real, Real>           basis,
+     const ROL::Ptr<Intrepid2::Cubature<DeviceType, Real, Real>> cubature,
+     bool computeBdryDofs = true) :
+       cellNodes_(cellNodes), basis_(basis), cubature_(cubature), sideId_(-1) {
+
+    // Get base cell topology from basis.
+    cellTopo_ = ROL::makePtr<shards::CellTopology>(basis_->getBaseCellTopology());
+
+    // Compute dimensions of multidimensional array members.
+    c_  = cellNodes_.extent_int(0);
+    f_  = basis_->getCardinality();
+    p_  = cubature_->getNumPoints();
+    d_  = cellTopo_->getDimension();
+    sd_ = d_ - 1;
+
+    // Get side subcell topology.
+    sideTopo_ = ROL::nullPtr;
+
+    // Allocate multidimensional arrays.
+    cubPoints_               = scalar_view("cubPoints_", p_, d_);
+    cubWeights_              = scalar_view("cubWeights_", p_);
+    cellJac_                 = scalar_view("cellJac_", c_, p_, d_, d_);
+    cellJacInv_              = scalar_view("cellJacInv_", c_, p_, d_, d_);
+    cellJacDet_              = scalar_view("cellJacDet_", c_, p_);
+    cellWeightedMeasure_     = scalar_view("cellWeightedMeasure_", c_, p_);
+    valReference_            = scalar_view("valReference_", f_, p_);
+    gradReference_           = scalar_view("gradReference_", f_, p_, d_);
+    valPhysical_             = scalar_view("valPhysical_", c_, f_, p_);
+    gradPhysical_            = scalar_view("gradPhysical_", c_, f_, p_, d_);
+    gradPhysicalX_           = scalar_view("gradPhysicalX_", c_, f_, p_);
+    if (d_ > 1) {
+      gradPhysicalY_         = scalar_view("gradPhysicalY_", c_, f_, p_);
+    }
+    if (d_ > 2) {
+      gradPhysicalZ_         = scalar_view("gradPhysicalZ_", c_, f_, p_);
+    }
+    gradPhysicalXWeighted_   = scalar_view("gradPhysicalXWeighted_", c_, f_, p_);
+    if (d_ > 1) {
+      gradPhysicalYWeighted_ = scalar_view("gradPhysicalYWeighted_", c_, f_, p_);
+    }
+    if (d_ > 2) {
+      gradPhysicalZWeighted_ = scalar_view("gradPhysicalZWeighted_", c_, f_, p_);
+    }
+    divPhysical_             = scalar_view("divPhysical_", c_, f_, p_);
+    valPhysicalWeighted_     = scalar_view("valPhysicalWeighted_", c_, f_, p_);
+    gradPhysicalWeighted_    = scalar_view("gradPhysicalWeighted_", c_, f_, p_, d_);
+    divPhysicalWeighted_     = scalar_view("divPhysicalWeighted_", c_, f_, p_);
+    gradgradMats_            = scalar_view("gradgradMats_", c_, f_, f_);
+    valvalMats_              = scalar_view("valvalMats_", c_, f_, f_);
+    cubPointsPhysical_       = scalar_view("cubPointsPhysical_", c_, p_, d_);
+    dofPoints_               = scalar_view("dofPoints_", f_, d_);
+
+    /*** START: Fill multidimensional arrays. ***/
+
+    // Compute cubature points and weights.
+    cubature_->getCubature(cubPoints_, cubWeights_);
+
+    // Compute reference basis value and gradient.
+    basis_->getValues(gradReference_, cubPoints_, Intrepid2::OPERATOR_GRAD);       // evaluate grad operator at cubature points
+    basis_->getValues(valReference_, cubPoints_, Intrepid2::OPERATOR_VALUE);       // evaluate value operator at cubature points
+
+    // Compute cell Jacobian matrices, its inverses and determinants.
+    ct::setJacobian(cellJac_, cubPoints_, cellNodes_, *cellTopo_);  // compute cell Jacobians
+    ct::setJacobianInv(cellJacInv_, cellJac_);                       // compute inverses of cell Jacobians
+    ct::setJacobianDet(cellJacDet_, cellJac_);                       // compute determinants of cell Jacobians
+
+    // Compute weighted cell measure, i.e., det(J)*(cubature weight).
+    fst::computeCellMeasure(cellWeightedMeasure_, cellJacDet_, cubWeights_);
+
+    // Transform reference values into physical space.
+    fst::HGRADtransformVALUE(valPhysical_, valReference_);
+
+    // Multiply with weighted measure to get weighted values in physical space.
+    fst::multiplyMeasure(valPhysicalWeighted_, cellWeightedMeasure_, valPhysical_);
+
+    // Transform reference gradients into physical space.
+    fst::HGRADtransformGRAD(gradPhysical_, cellJacInv_, gradReference_);
+
+    // Multiply with weighted measure to get weighted gradients in physical space.
+    fst::multiplyMeasure(gradPhysicalWeighted_, cellWeightedMeasure_, gradPhysical_);
+
+    // Extract individual (x, y, z) components of the gradients in physical space.
+    for (int c=0; c<c_; ++c) {
+      for (int f=0; f<f_; ++f) {
+        for (int p=0; p<p_; ++p) {
+          gradPhysicalX_(c,f,p) = gradPhysical_(c,f,p,0);
+            gradPhysicalXWeighted_(c,f,p) = gradPhysicalWeighted_(c,f,p,0);
+          if (d_ > 1) {
+            gradPhysicalY_(c,f,p) = gradPhysical_(c,f,p,1);
+            gradPhysicalYWeighted_(c,f,p) = gradPhysicalWeighted_(c,f,p,1);
+          }
+          if (d_ > 2) {
+            gradPhysicalZ_(c,f,p) = gradPhysical_(c,f,p,2);
+            gradPhysicalZWeighted_(c,f,p) = gradPhysicalWeighted_(c,f,p,2);
+          }
+        }
+      }
+    }
+
+    // Build divergence in physical space.
+    rst::add(divPhysical_, gradPhysicalX_);
+    if (d_ > 1) {
+      rst::add(divPhysical_, gradPhysicalY_);
+    }
+    if (d_ > 2) {
+      rst::add(divPhysical_, gradPhysicalZ_);
+    }
+
+    // Multiply with weighted measure to get weighted divegence in physical space.
+    fst::multiplyMeasure(divPhysicalWeighted_, cellWeightedMeasure_, divPhysical_);
+
+    // Compute stiffness matrices.
+    fst::integrate(gradgradMats_, gradPhysical_, gradPhysicalWeighted_);
+
+    // Compute mass matrices.
+    fst::integrate(valvalMats_, valPhysical_, valPhysicalWeighted_);
+
+    // Map reference cubature points to cells in physical space.
+    ct::mapToPhysicalFrame(cubPointsPhysical_, cubPoints_, cellNodes_, *cellTopo_);
+
+    // Compute local degrees of freedom on reference cell sides.
+    if (computeBdryDofs) {
+      int numSides = cellTopo_->getSideCount();
+      if (cellTopo_->getDimension() == 1) {
+        numSides = 2;
+      }
+      if ( numSides ) {
+        for (int i=0; i<numSides; ++i) {
+          sideDofs_.push_back(computeBoundaryDofs(i));
+        }
+      }
+      else {
+        sideDofs_.push_back(computeBoundaryDofs(0));
+      }
+    }
+
+    // Get coordinates of DOFs in reference cell.
+    basis->getDofCoords(dofPoints_);
+
+    /*** END: Fill multidimensional arrays. ***/
+
+  }
+
+  FE(const scalar_view cellNodes,
+     const Intrepid2::BasisPtr<DeviceType, Real, Real>           basis,
+     const ROL::Ptr<Intrepid2::Cubature<DeviceType, Real, Real>> cubature,
+     const int&                                                  sideId) :
+       cellNodes_(cellNodes), basis_(basis), cubature_(cubature), sideId_(sideId) {
+
+    // Get base cell topology from basis.
+    cellTopo_ = ROL::makePtr<shards::CellTopology>(basis_->getBaseCellTopology());
+
+    // Compute dimensions of multidimensional array members.
+    c_  = cellNodes_.extent_int(0);
+    f_  = basis_->getCardinality();
+    p_  = cubature_->getNumPoints();
+    d_  = cellTopo_->getDimension();
+    sd_ = d_ - 1;
+    //std::cout << "FE: c = " << c_ << ", f = " << f_ << ", p = " << p_ << ", d = " << d_ << std::endl;
+
+    // Get side subcell topology.
+    sideTopo_ = ROL::makePtr<shards::CellTopology>(cellTopo_->getCellTopologyData(sd_, sideId_));
+
+    // Allocate multidimensional arrays.
+    cubPoints_            = scalar_view("cubPoints_", p_, d_);
+    cubWeights_           = scalar_view("cubWeights_", p_);
+    cubPointsSubcell_     = scalar_view("cubPointsSubcell_", p_, sd_);
+    cubWeightsSubcell_    = scalar_view("cubWeightsSubcell_", p_);
+    cellJac_              = scalar_view("cellJac_", c_, p_, d_, d_);
+    cellJacInv_           = scalar_view("cellJacInv_", c_, p_, d_, d_);
+    cellJacDet_           = scalar_view("cellJacDet_", c_, p_);
+    cellWeightedMeasure_  = scalar_view("cellWeightedMeasure_", c_, p_);
+    valReference_         = scalar_view("valReference_", f_, p_);
+    gradReference_        = scalar_view("gradReference_", f_, p_, d_);
+    valPhysical_          = scalar_view("valPhysical_", c_, f_, p_);
+    gradPhysical_         = scalar_view("gradPhysical_", c_, f_, p_, d_);
+    gradPhysicalX_        = scalar_view("gradPhysicalX_", c_, f_, p_);
+    if (d_ > 1) {
+      gradPhysicalY_      = scalar_view("gradPhysicalY_", c_, f_, p_);
+    }
+    if (d_ > 2) {
+      gradPhysicalZ_      = scalar_view("gradPhysicalZ_", c_, f_, p_);
+    }
+    divPhysical_          = scalar_view("divPhysical_", c_, f_, p_);
+    valPhysicalWeighted_  = scalar_view("valPhysicalWeighted_", c_, f_, p_);
+    gradPhysicalWeighted_ = scalar_view("gradPhysicalWeighted_", c_, f_, p_, d_);
+    gradPhysicalXWeighted_  = scalar_view("gradPhysicalXWeighted_", c_, f_, p_);
+    if (d_ > 1) {
+      gradPhysicalYWeighted_ = scalar_view("gradPhysicalYWeighted_", c_, f_, p_);
+    }
+    if (d_ > 2) {
+      gradPhysicalZWeighted_ = scalar_view("gradPhysicalZWeighted_", c_, f_, p_);
+    }
+    divPhysicalWeighted_  = scalar_view("divPhysicalWeighted_", c_, f_, p_);
+    gradgradMats_         = scalar_view("gradgradMats_", c_, f_, f_);
+    valvalMats_           = scalar_view("valvalMats_", c_, f_, f_);
+    cubPointsPhysical_    = scalar_view("cubPointsPhysical_", c_, p_, d_);
+    dofPoints_            = scalar_view("dofPoints_", f_, d_);
+
+    /*** START: Fill multidimensional arrays. ***/
+
+    // Compute cubature points and weights.
+    cubature_->getCubature(cubPointsSubcell_, cubWeights_);
+
+    // Compute reference basis value and gradient.
+    ct::mapToReferenceSubcell(cubPoints_, cubPointsSubcell_, sd_, sideId_, *cellTopo_);
+    basis_->getValues(gradReference_, cubPoints_, Intrepid2::OPERATOR_GRAD);       // evaluate grad operator at cubature points
+    basis_->getValues(valReference_, cubPoints_, Intrepid2::OPERATOR_VALUE);       // evaluate value operator at cubature points
+
+    // Compute cell Jacobian matrices, its inverses and determinants.
+    ct::setJacobian(cellJac_, cubPoints_, cellNodes_, *cellTopo_);  // compute cell Jacobians
+    ct::setJacobianInv(cellJacInv_, cellJac_);                      // compute inverses of cell Jacobians
+    ct::setJacobianDet(cellJacDet_, cellJac_);                      // compute determinants of cell Jacobians
+
+    // Compute weighted cell measure.
+    scalar_view scratch("scratch", c_*p_*d_*d_);
+    if (d_ == 2) {
+      fst::computeEdgeMeasure(cellWeightedMeasure_, cellJac_, cubWeights_, sideId_, *cellTopo_, scratch);
+    }
+    else if (d_ == 3) {
+      fst::computeFaceMeasure(cellWeightedMeasure_, cellJac_, cubWeights_, sideId_, *cellTopo_, scratch);
+    }
+
+    // Transform reference values into physical space.
+    fst::HGRADtransformVALUE(valPhysical_,valReference_);
+
+    // Multiply with weighted measure to get weighted values in physical space.
+    fst::multiplyMeasure(valPhysicalWeighted_, cellWeightedMeasure_, valPhysical_);
+
+    // Transform reference gradients into physical space.
+    fst::HGRADtransformGRAD(gradPhysical_, cellJacInv_, gradReference_);
+
+    // Multiply with weighted measure to get weighted gradients in physical space.
+    fst::multiplyMeasure(gradPhysicalWeighted_, cellWeightedMeasure_, gradPhysical_);
+
+    // Extract individual (x, y, z) components of the weighted gradients in physical space.
+    for (int c=0; c<c_; ++c) {
+      for (int f=0; f<f_; ++f) {
+        for (int p=0; p<p_; ++p) {
+          gradPhysicalX_(c,f,p) = gradPhysical_(c,f,p,0);
+          gradPhysicalXWeighted_(c,f,p) = gradPhysicalWeighted_(c,f,p,0);
+          if (d_ > 1) {
+          gradPhysicalY_(c,f,p) = gradPhysical_(c,f,p,1);
+          gradPhysicalYWeighted_(c,f,p) = gradPhysicalWeighted_(c,f,p,1);
+          }
+          if (d_ > 2) {
+          gradPhysicalZ_(c,f,p) = gradPhysical_(c,f,p,2);
+          gradPhysicalZWeighted_(c,f,p) = gradPhysicalWeighted_(c,f,p,2);
+          }
+        }
+      }
+    }
+
+    // Build divergence in physical space.
+    rst::add(divPhysical_, gradPhysicalX_);
+    if (d_ > 1) {
+      rst::add(divPhysical_, gradPhysicalY_);
+    }
+    if (d_ > 2) {
+      rst::add(divPhysical_, gradPhysicalZ_);
+    }
+
+    // Multiply with weighted measure to get weighted divegence in physical space.
+    fst::multiplyMeasure(divPhysicalWeighted_, cellWeightedMeasure_, divPhysical_);
+
+    // Compute stiffness matrices.
+    fst::integrate(gradgradMats_, gradPhysical_, gradPhysicalWeighted_);
+
+    // Compute mass matrices.
+    fst::integrate(valvalMats_, valPhysical_, valPhysicalWeighted_);
+
+    // Map reference cubature points to cells in physical space.
+    ct::mapToPhysicalFrame(cubPointsPhysical_,
+                                                  cubPoints_,
+                                                  cellNodes_,
+                                                  *cellTopo_);
+
+    // Compute local degrees of freedom on reference cell sides.
+    int numSides = cellTopo_->getSideCount();
+    for (int i=0; i<numSides; ++i) {
+      sideDofs_.push_back(computeBoundaryDofs(i));
+    }
+
+    // Get coordinates of DOFs in reference cell.
+      basis->getDofCoords(dofPoints_);
+
+    /*** END: Fill multidimensional arrays. ***/
+
+  }
+
+  /** \brief  Returns cell Jacobian matrices at cubature points.
+  */
+  scalar_view J() const {
+    return cellJac_;
+  }
+
+  /** \brief  Returns inverses of cell Jacobians at cubature points.
+  */
+  scalar_view invJ() const {
+    return cellJacInv_;
+  }
+
+  /** \brief  Returns determinants of cell Jacobians at cubature points.
+  */
+  scalar_view detJ() const {
+    return cellJacDet_;
+  }
+
+  /** \brief  Returns values of FE basis at cubature points in reference space.
+  */
+  scalar_view Nref() const {
+    return valReference_;
+  }
+
+  /** \brief  Returns gradients of FE basis at cubature points in reference space.
+  */
+  scalar_view gradNref() const {
+    return gradReference_;
+  }
+
+  /** \brief  Returns value of FE basis at cubature points in physical space.
+  */
+  scalar_view N() const {
+    return valPhysical_;
+  }
+
+  /** \brief  Returns value of FE basis at cubature points in physical space,
+              multiplied by weighted cell measures.
+  */
+  scalar_view NdetJ() const {
+    return valPhysicalWeighted_;
+  }
+
+  /** \brief  Returns gradient of FE basis at cubature points in physical space.
+  */
+  scalar_view gradN() const {
+    return gradPhysical_;
+  }
+
+  /** \brief  Returns gradient of FE basis at cubature points in physical space,
+              multiplied by weighted cell measures.
+  */
+  scalar_view gradNdetJ() const {
+    return gradPhysicalWeighted_;
+  }
+
+  /** \brief  Returns divergence of FE basis at cubature points in physical space.
+  */
+  scalar_view divN() const {
+    return divPhysical_;
+  }
+
+  /** \brief  Returns divergence of FE basis at cubature points in physical space,
+              multiplied by weighted cell measures.
+  */
+  scalar_view divNdetJ() const {
+    return divPhysicalWeighted_;
+  }
+
+  /** \brief  Returns x, y or z component of the gradient of FE basis at
+              cubature points in physical space.
+
+      \param  coord    [in]   - coordinate index (x=0, y=1, z=2)
+  */
+  scalar_view DND(const int & coord) const {
+    if (coord == 0) {
+      return gradPhysicalX_;
+    }
+    else if (coord == 1) {
+      return gradPhysicalY_;
+    }
+    else if (coord == 2) {
+      return gradPhysicalZ_;
+    }
+    else {
+      TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument,
+        ">>> ERROR (PDEOPT::FE::DND): Invalid coordinate argument!");
+    }
+  }
+
+  /** \brief  Returns x, y or z component of the gradient of FE basis at
+              cubature points in physical space, multiplied by weighted
+              cell measures.
+
+      \param  coord    [in]   - coordinate index (x=0, y=1, z=2)
+  */
+  scalar_view DNDdetJ(const int & coord) const {
+    if (coord == 0) {
+      return gradPhysicalXWeighted_;
+    }
+    else if (coord == 1) {
+      return gradPhysicalYWeighted_;
+    }
+    else if (coord == 2) {
+      return gradPhysicalZWeighted_;
+    }
+    else {
+      TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument,
+        ">>> ERROR (PDEOPT::FE::DNDdetJ): Invalid coordinate argument!");
+    }
+  }
+
+  /** \brief  Returns stiffness matrices on cells.
+  */
+  scalar_view stiffMat() const {
+    return gradgradMats_;
+  }
+
+  /** \brief  Returns mass matrices on cells.
+  */
+  scalar_view massMat() const {
+    return valvalMats_;
+  }
+
+  /** \brief Returns cubature points on cells in physical space.
+  */
+  scalar_view cubPts() const {
+    return cubPointsPhysical_;
+  }
+
+  /** \brief Builds FE value interpolant and evaluates it at cubature
+             points in physical space.
+  */
+  void evaluateValue(scalar_view fVals,
+                     const scalar_view inCoeffs) const {
+    fst::evaluate(fVals, inCoeffs, valPhysical_);
+  }
+
+  /** \brief Builds FE gradient interpolant and evaluates it at cubature
+             points in physical space.
+  */
+  void evaluateGradient(scalar_view fGrads,
+                        const scalar_view inCoeffs) const {
+    fst::evaluate(fGrads, inCoeffs, gradPhysical_);
+  }
+
+  /** \brief Computes integral of the product or dot-product of interpolated
+             FE fields f1 and f2, indexed by (C,P), for values, or (C,P,D),
+             for gradients.
+  */
+  void computeIntegral(scalar_view integral,
+                       const scalar_view f1,
+                       const scalar_view f2,
+                       const bool sumInto = false) const {
+    scalar_view f2Weighted;
+    if(f2.rank() == 3) {
+      f2Weighted = scalar_view("f2Weighted", f2.extent(0), f2.extent(1), f2.extent(2));
+    } else { //rank = 2
+      f2Weighted = scalar_view("f2Weighted", f2.extent(0), f2.extent(1));
+    }
+    fst::scalarMultiplyDataData(f2Weighted, cellWeightedMeasure_, f2);  // multiply with weighted measure
+                                                              
+    fst::integrate(integral, f1, f2Weighted, sumInto);                  // compute integral of f1*f2
+  }
+
+  /** \brief Computes the degrees of freedom on a side.
+  */
+  std::vector<int> computeBoundaryDofs(const int & locSideId) const {
+
+    std::vector<int> bdrydofs;
+    std::vector<int> nodeids, edgeids, faceids;
+
+    if (cellTopo_->getDimension() == 1) {
+      nodeids.push_back(locSideId);
+    }
+    else if (cellTopo_->getDimension() == 2) {
+      edgeids.push_back(locSideId);
+      int numVertices = 2;
+      for (int i=0; i<numVertices; ++i) {
+        nodeids.push_back(cellTopo_->getNodeMap(1, locSideId, i));
+      }
+      //for (unsigned i=0; i<nodeids.size(); ++i) {
+      //  std::cout << "\nnodeid = " << nodeids[i];
+      //}
+      //for (unsigned i=0; i<edgeids.size(); ++i) {
+      //  std::cout << "\nedgeid = " << edgeids[i];
+      //}
+    }
+    else if (cellTopo_->getDimension() == 3) {
+      faceids.push_back(locSideId);
+      CellTopologyData_Subcell face = cellTopo_->getCellTopologyData()->subcell[2][locSideId];
+      shards::CellTopology face_topo(face.topology);
+      int numVertices = face_topo.getVertexCount();
+      for (int i=0; i<numVertices; ++i) {
+        nodeids.push_back(cellTopo_->getNodeMap(2, locSideId, i));
+      }
+      int numEdges = face_topo.getEdgeCount();
+      for (int i=0; i<numEdges; ++i) {
+        edgeids.push_back(mapCellFaceEdge(cellTopo_->getCellTopologyData(), locSideId, i));
+      }
+      //for (unsigned i=0; i<nodeids.size(); ++i) {
+      //  std::cout << "\nnodeid = " << nodeids[i];
+      //}
+      //for (unsigned i=0; i<edgeids.size(); ++i) {
+      //  std::cout << "\nedgeid = " << edgeids[i];
+      //}
+      //for (unsigned i=0; i<faceids.size(); ++i) {
+      //  std::cout << "\nfaceid = " << faceids[i];
+      //}
+    }
+
+    auto tagToId = basis_->getAllDofOrdinal();
+    std::vector<std::vector<std::vector<int> > > tagToIdCompact;
+    int scdim = tagToId.extent_int(0);
+    tagToIdCompact.resize(scdim);
+    for (int i=0; i<scdim; ++i) {
+      for (int j=0; j<tagToId.extent_int(1); ++j) {
+        std::vector<int> ids;
+        for (int k=0; k<tagToId.extent_int(2); ++k) {
+          //std::cout << "\n  i=" << i << "  j=" << j << "  k=" << k << "  id=" << tagToId[i][j][k];
+          if (tagToId(i,j,k) != -1) {
+            ids.push_back(tagToId(i,j,k));
+          }
+        }
+        tagToIdCompact[i].push_back(ids);
+        //int dofordcompact = tagToIdCompact[i][j].size();
+        //for (int k=0; k<dofordcompact; ++k) {
+        //  std::cout << "\n  i=" << i << "  j=" << j << "  k=" << k << "  id=" << tagToIdCompact[i][j][k];
+        //}
+      }
+    }
+
+    int numNodeIds = nodeids.size();
+    if (tagToIdCompact.size() > 0) {
+      for (int i=0; i<numNodeIds; ++i) {
+        int numdofs = tagToIdCompact[0][nodeids[i]].size();
+        for (int j=0; j<numdofs; ++j) {
+          bdrydofs.push_back(tagToIdCompact[0][nodeids[i]][j]);
+        }
+      }
+    }
+
+    int numEdgeIds = edgeids.size();
+    if (tagToIdCompact.size() > 1) {
+      for (int i=0; i<numEdgeIds; ++i) {
+        int numdofs = tagToIdCompact[1][edgeids[i]].size();
+        for (int j=0; j<numdofs; ++j) {
+          bdrydofs.push_back(tagToIdCompact[1][edgeids[i]][j]);
+        }
+      }
+    }
+
+    int numFaceIds = faceids.size();
+    if (tagToIdCompact.size() > 2) {
+      for (int i=0; i<numFaceIds; ++i) {
+        int numdofs = tagToIdCompact[2][faceids[i]].size();
+        for (int j=0; j<numdofs; ++j) {
+          bdrydofs.push_back(tagToIdCompact[2][faceids[i]][j]);
+        }
+      }
+    }
+
+    //for (unsigned i=0; i<bdrydofs.size(); ++i) {
+    //  std::cout << "\ndofid = " << bdrydofs[i];
+    //}
+
+    return bdrydofs;
+
+  }
+
+  /** \brief Returns the degrees of freedom on reference cell sides.
+  */
+  const std::vector<std::vector<int> > & getBoundaryDofs() const {
+    return sideDofs_;
+  }
+
+  /** \brief Computes coordinates of degrees of freedom on cells.
+  */
+  void computeDofCoords(scalar_view dofCoords,
+                        const scalar_view cellNodes) const {
+    // Map reference DOF locations to physical space.
+    ct::mapToPhysicalFrame(dofCoords, dofPoints_, cellNodes, *cellTopo_);
+  }
+
+}; // FE
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/TOOLS/feK_curl.hpp b/packages/rol/example/PDE-OPT/TOOLS/feK_curl.hpp
new file mode 100644
index 000000000000..945c1f4365b4
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/TOOLS/feK_curl.hpp
@@ -0,0 +1,441 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  feK_curl.hpp
+    \brief Given a set of cells with geometric node information, sets up
+           data structures used in finite element integration in HCURL.
+*/
+
+#ifndef PDEOPT_FE_CURL_HPP
+#define PDEOPT_FE_CURL_HPP
+
+#include "ROL_Ptr.hpp"
+
+#include "Teuchos_ParameterList.hpp"
+#include "Intrepid2_Basis.hpp"
+#include "Intrepid2_Cubature.hpp"
+#include "Intrepid2_CellTools.hpp"
+#include "Intrepid2_FunctionSpaceTools.hpp"
+
+template <class Real, class DeviceType>
+class FE_CURL {
+
+public:
+  using scalar_view = Kokkos::DynRankView<Real,DeviceType>;
+  using fst = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct = Intrepid2::CellTools<DeviceType>;
+  using rst = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+
+  const scalar_view cellNodes_;                                 // coordinates of the cell nodes
+  Intrepid2::BasisPtr<DeviceType, Real, Real> basis_;           // Intrepid basis
+  const ROL::Ptr<Intrepid2::Cubature<DeviceType, Real, Real>> cubature_;  // Intrepid cubature (quadrature, integration) rule
+  const int sideId_;                                                                         // local side id for boundary integration
+
+  int c_;    // number of cells in the FE object
+  int f_;    // number of basis functions per cell
+  int p_;    // number of cubature points per cell
+  int d_;    // space dimension of the (parent) cells
+  int sd_;   // space dimension of the side cells
+
+  ROL::Ptr<shards::CellTopology> cellTopo_;   // base (parent) cell topology
+  ROL::Ptr<shards::CellTopology> sideTopo_;   // side (subcell) topology; assumed uniform
+
+  std::vector<std::vector<int> > sideDofs_;       // local dofs on cell sides; 1st index: side number; 2nd index: dof number
+
+  // Containers for local finite element data.
+  scalar_view cubPoints_;             // points of the cubature rule on the reference cell
+  scalar_view cubWeights_;            // weights of the cubature rule on the reference cell
+  scalar_view cubPointsSubcell_;      // cubature points on the side reference cell
+  scalar_view cubWeightsSubcell_;     // cubature weights on the side reference cell
+  scalar_view cellJac_;               // cell Jacobian matrices
+  scalar_view cellJacInv_;            // inverses of cell Jacobians
+  scalar_view cellJacDet_;            // determinants of cell Jacobians
+  scalar_view cellWeightedMeasure_;   // cell measure (Jacobian determinant) multiplied by the cubature weights
+  scalar_view valReference_;          // value of FE basis in reference space
+  scalar_view curlReference_;         // curl of FE basis in reference space
+  scalar_view valPhysical_;           // value of FE basis in physical space
+  scalar_view curlPhysical_;          // curl of FE basis in physical space
+  scalar_view valPhysicalWeighted_;   // value of FE basis in physical space multiplied by weighted cell measure
+  scalar_view curlPhysicalWeighted_;  // curl of FE basis in physical space multiplied by weighted cell measure
+  scalar_view curlcurlMats_;          // cell curl-curl matrices
+  scalar_view valvalMats_;            // cell val-val matrices
+  scalar_view cubPointsPhysical_;     // cubature points on the physical cells
+  scalar_view dofPoints_;             // degree of freedom points on the reference cell
+
+public:
+
+  FE_CURL(const scalar_view                                           cellNodes,
+          const Intrepid2::BasisPtr<DeviceType, Real, Real>           basis,
+          const ROL::Ptr<Intrepid2::Cubature<DeviceType, Real, Real>> cubature) :
+    cellNodes_(cellNodes), basis_(basis), cubature_(cubature), sideId_(-1) {
+
+    // Get base cell topology from basis.
+    cellTopo_ = ROL::makePtr<shards::CellTopology>(basis_->getBaseCellTopology());
+
+    // Compute dimensions of multidimensional array members.
+    c_  = cellNodes_.extent_int(0);
+    f_  = basis_->getCardinality();
+    p_  = cubature_->getNumPoints();
+    d_  = cellTopo_->getDimension();
+    sd_ = d_ - 1;
+
+    // Get side subcell topology.
+    sideTopo_ = ROL::nullPtr;
+
+    // Allocate multidimensional arrays.
+    cubPoints_               = scalar_view("cubPoints_", p_, d_);
+    cubWeights_              = scalar_view("cubWeights_", p_);
+    cellJac_                 = scalar_view("cellJac_", c_, p_, d_, d_);
+    cellJacInv_              = scalar_view("cellJacInv_", c_, p_, d_, d_);
+    cellJacDet_              = scalar_view("cellJacDet_", c_, p_);
+    cellWeightedMeasure_     = scalar_view("cellWeightedMeasure_", c_, p_);
+    valReference_            = scalar_view("valReference_", f_, p_, d_);
+    curlReference_           = scalar_view("curlReference_", f_, p_, d_);
+    valPhysical_             = scalar_view("valPhysical_", c_, f_, p_, d_);
+    curlPhysical_            = scalar_view("curlPhysical_", c_, f_, p_, d_);
+    valPhysicalWeighted_     = scalar_view("valPhysicalWeighted_", c_, f_, p_, d_);
+    curlPhysicalWeighted_    = scalar_view("curlPhysicalWeighted_", c_, f_, p_, d_);
+    curlcurlMats_            = scalar_view("curlcurlMats_", c_, f_, f_);
+    valvalMats_              = scalar_view("valvalMats_", c_, f_, f_);
+    cubPointsPhysical_       = scalar_view("cubPointsPhysical_", c_, p_, d_);
+    dofPoints_               = scalar_view("dofPoints_", f_, d_);
+
+    /*** START: Fill multidimensional arrays. ***/
+
+    // Compute cubature points and weights.
+    cubature_->getCubature(cubPoints_, cubWeights_);
+
+    // Compute reference basis value and curl.
+    basis_->getValues(curlReference_, cubPoints_, Intrepid2::OPERATOR_CURL);       // evaluate curl operator at cubature points
+    basis_->getValues(valReference_, cubPoints_, Intrepid2::OPERATOR_VALUE);       // evaluate value operator at cubature points
+
+    // Compute cell Jacobian matrices, its inverses and determinants.
+    ct::setJacobian(cellJac_, cubPoints_, cellNodes_, *cellTopo_);  // compute cell Jacobians
+    ct::setJacobianInv(cellJacInv_, cellJac_);                       // compute inverses of cell Jacobians
+    ct::setJacobianDet(cellJacDet_, cellJac_);                       // compute determinants of cell Jacobians
+
+    // Compute weighted cell measure, i.e., det(J)*(cubature weight).
+    fst::computeCellMeasure(cellWeightedMeasure_, cellJacDet_, cubWeights_);
+
+    // Transform reference values into physical space.
+    fst::HCURLtransformVALUE(valPhysical_, cellJacInv_, valReference_);
+
+    // Multiply with weighted measure to get weighted values in physical space.
+    fst::multiplyMeasure(valPhysicalWeighted_, cellWeightedMeasure_, valPhysical_);
+
+    // Transform reference curls into physical space.
+    fst::HCURLtransformCURL(curlPhysical_, cellJac_, cellJacDet_, curlReference_);
+
+    // Multiply with weighted measure to get weighted curls in physical space.
+    fst::multiplyMeasure(curlPhysicalWeighted_, cellWeightedMeasure_, curlPhysical_);
+
+    // Compute curl-curl matrices.
+    fst::integrate(curlcurlMats_, curlPhysical_, curlPhysicalWeighted_);
+
+    // Compute val-val matrices.
+    fst::integrate(valvalMats_, valPhysical_, valPhysicalWeighted_);
+
+    // Map reference cubature points to cells in physical space.
+    ct::mapToPhysicalFrame(cubPointsPhysical_, cubPoints_, cellNodes_, *cellTopo_);
+
+    // Compute local degrees of freedom on reference cell sides.
+    int numSides = cellTopo_->getSideCount();
+    if (cellTopo_->getDimension() == 1) {
+      numSides = 2;
+    }
+    if ( numSides ) {
+      for (int i=0; i<numSides; ++i) {
+        sideDofs_.push_back(computeBoundaryDofs(i));
+      }
+    }
+    else {
+      sideDofs_.push_back(computeBoundaryDofs(0));
+    }
+
+    // Get coordinates of DOFs in reference cell.
+    basis->getDofCoords(dofPoints_);
+
+    /*** END: Fill multidimensional arrays. ***/
+
+  }
+
+  /** \brief  Returns cell Jacobian matrices at cubature points.
+  */
+  scalar_view J() const {
+    return cellJac_;
+  }
+
+  /** \brief  Returns inverses of cell Jacobians at cubature points.
+  */
+  scalar_view invJ() const {
+    return cellJacInv_;
+  }
+
+  /** \brief  Returns determinants of cell Jacobians at cubature points.
+  */
+  scalar_view detJ() const {
+    return cellJacDet_;
+  }
+
+  /** \brief  Returns values of FE basis at cubature points in reference space.
+  */
+  scalar_view Nref() const {
+    return valReference_;
+  }
+
+  /** \brief  Returns curls of FE basis at cubature points in reference space.
+  */
+  scalar_view curlNref() const {
+    return curlReference_;
+  }
+
+  /** \brief  Returns value of FE basis at cubature points in physical space.
+  */
+  scalar_view N() const {
+    return valPhysical_;
+  }
+
+  /** \brief  Returns value of FE basis at cubature points in physical space,
+              multiplied by weighted cell measures.
+  */
+  scalar_view NdetJ() const {
+    return valPhysicalWeighted_;
+  }
+
+  /** \brief  Returns curl of FE basis at cubature points in physical space.
+  */
+  scalar_view curlN() const {
+    return curlPhysical_;
+  }
+
+  /** \brief  Returns curl of FE basis at cubature points in physical space,
+              multiplied by weighted cell measures.
+  */
+  scalar_view curlNdetJ() const {
+    return curlPhysicalWeighted_;
+  }
+
+  /** \brief  Returns curl-curl matrices on cells.
+  */
+  scalar_view curlcurlMat() const {
+    return curlcurlMats_;
+  }
+
+  /** \brief  Returns val-val matrices on cells.
+  */
+  scalar_view valvalMat() const {
+    return valvalMats_;
+  }
+
+  /** \brief Returns cubature points on cells in physical space.
+  */
+  scalar_view cubPts() const {
+    return cubPointsPhysical_;
+  }
+
+  /** \brief Builds FE value interpolant and evaluates it at cubature
+             points in physical space.  The input coefficients are
+             assumed to be 'signed' (with +/- 1 multiplying the
+             coefficient depending on the alignment with the reference
+             edge direction).
+  */
+  void evaluateValue(scalar_view fVals,
+                     const scalar_view inCoeffs) const {
+    fst::evaluate(fVals, inCoeffs, valPhysical_);
+  }
+
+  /** \brief Builds FE value interpolant and evaluates it at cubature
+             points in physical space.  The input coefficients are
+             combined with edge signs (with +/- 1 multiplying the
+             coefficient depending on the alignment with the reference
+             edge direction).
+  */
+  void evaluateValue(scalar_view fVals,
+                     const scalar_view inCoeffs,
+                     const scalar_view inSigns) const {
+    scalar_view coeffsSigned("coeffsSigned", inCoeffs.extent(0), inCoeffs.extent(3), inCoeffs.extent(2));
+    Kokkos::deep_copy(coeffsSigned, inCoeffs);
+    fst::applyFieldSigns(coeffsSigned, inSigns);
+    fst::evaluate(fVals, coeffsSigned, valPhysical_);
+  }
+
+  /** \brief Builds FE curl interpolant and evaluates it at cubature
+             points in physical space.  The input coefficients are
+             assumed to be 'signed' (with +/- 1 multiplying the
+             coefficient depending on the alignment with the reference
+             edge direction).
+  */
+  void evaluateCurl(scalar_view fCurls,
+                    const scalar_view inCoeffs) const {
+    fst::evaluate(fCurls, inCoeffs, curlPhysical_);
+  }
+
+  /** \brief Builds FE value interpolant and evaluates it at cubature
+             points in physical space.  The input coefficients are
+             combined with edge signs (with +/- 1 multiplying the
+             coefficient depending on the alignment with the reference
+             edge direction).
+  */
+  void evaluateCurl(scalar_view fCurls,
+                    const scalar_view inCoeffs,
+                    const scalar_view inSigns) const {
+    scalar_view coeffsSigned("coeffsSigned", inCoeffs.extent(0), inCoeffs.extent(3), inCoeffs.extent(2));
+    Kokkos::deep_copy(coeffsSigned, inCoeffs);
+    fst::applyFieldSigns(coeffsSigned, inSigns);
+    fst::evaluate(fCurls, coeffsSigned, curlPhysical_);
+  }
+
+  /** \brief Computes integral of the dot product of values or curls of interpolated
+             FE fields f1 and f2, indexed by (C,P,D).
+  */
+  void computeIntegral(scalar_view integral,
+                       const scalar_view f1,
+                       const scalar_view f2,
+                       const bool sumInto = false) const {
+    scalar_view f2Weighted;
+    if(f2.rank() == 3) {
+      f2Weighted = scalar_view("f2Weighted", f2.extent(0), f2.extent(1), f2.extent(2));
+    } else { //rank = 2
+      f2Weighted = scalar_view("f2Weighted", f2.extent(0), f2.extent(1));
+    }
+    fst::scalarMultiplyDataData(f2Weighted, cellWeightedMeasure_, f2);  // multiply with weighted measure
+
+    fst::integrate(integral, f1, f2Weighted, sumInto);                  // compute integral of f1*f2
+  }
+
+  /** \brief Computes the degrees of freedom on a side.
+  */
+  std::vector<int> computeBoundaryDofs(const int & locSideId) const {
+
+    std::vector<int> bdrydofs;
+    std::vector<int> nodeids, edgeids, faceids;
+
+    if (cellTopo_->getDimension() == 1) {
+      nodeids.push_back(locSideId);
+    }
+    else if (cellTopo_->getDimension() == 2) {
+      edgeids.push_back(locSideId);
+      int numVertices = 2;
+      for (int i=0; i<numVertices; ++i) {
+        nodeids.push_back(cellTopo_->getNodeMap(1, locSideId, i));
+      }
+      //for (unsigned i=0; i<nodeids.size(); ++i) {
+      //  std::cout << "\nnodeid = " << nodeids[i];
+      //}
+      //for (unsigned i=0; i<edgeids.size(); ++i) {
+      //  std::cout << "\nedgeid = " << edgeids[i];
+      //}
+    }
+    else if (cellTopo_->getDimension() == 3) {
+      faceids.push_back(locSideId);
+      CellTopologyData_Subcell face = cellTopo_->getCellTopologyData()->subcell[2][locSideId];
+      shards::CellTopology face_topo(face.topology);
+      int numVertices = face_topo.getVertexCount();
+      for (int i=0; i<numVertices; ++i) {
+        nodeids.push_back(cellTopo_->getNodeMap(2, locSideId, i));
+      }
+      int numEdges = face_topo.getEdgeCount();
+      for (int i=0; i<numEdges; ++i) {
+        edgeids.push_back(mapCellFaceEdge(cellTopo_->getCellTopologyData(), locSideId, i));
+      }
+      //for (unsigned i=0; i<nodeids.size(); ++i) {
+      //  std::cout << "\nnodeid = " << nodeids[i];
+      //}
+      //for (unsigned i=0; i<edgeids.size(); ++i) {
+      //  std::cout << "\nedgeid = " << edgeids[i];
+      //}
+      //for (unsigned i=0; i<faceids.size(); ++i) {
+      //  std::cout << "\nfaceid = " << faceids[i];
+      //}
+    }
+
+    auto tagToId = basis_->getAllDofOrdinal();
+    std::vector<std::vector<std::vector<int> > > tagToIdCompact;
+    int scdim = tagToId.extent_int(0);
+    tagToIdCompact.resize(scdim);
+    for (int i=0; i<scdim; ++i) {
+      for (int j=0; j<tagToId.extent_int(1); ++j) {
+        std::vector<int> ids;
+        for (int k=0; k<tagToId.extent_int(2); ++k) {
+          //std::cout << "\n  i=" << i << "  j=" << j << "  k=" << k << "  id=" << tagToId[i][j][k];
+          if (tagToId(i,j,k) != -1) {
+            ids.push_back(tagToId(i,j,k));
+          }
+        }
+        tagToIdCompact[i].push_back(ids);
+        //int dofordcompact = tagToIdCompact[i][j].size();
+        //for (int k=0; k<dofordcompact; ++k) {
+        //  std::cout << "\n  i=" << i << "  j=" << j << "  k=" << k << "  id=" << tagToIdCompact[i][j][k];
+        //}
+      }
+    }
+
+    int numNodeIds = nodeids.size();
+    if (tagToIdCompact.size() > 0) {
+      for (int i=0; i<numNodeIds; ++i) {
+        int numdofs = tagToIdCompact[0][nodeids[i]].size();
+        for (int j=0; j<numdofs; ++j) {
+          bdrydofs.push_back(tagToIdCompact[0][nodeids[i]][j]);
+        }
+      }
+    }
+
+    int numEdgeIds = edgeids.size();
+    if (tagToIdCompact.size() > 1) {
+      for (int i=0; i<numEdgeIds; ++i) {
+        int numdofs = tagToIdCompact[1][edgeids[i]].size();
+        for (int j=0; j<numdofs; ++j) {
+          bdrydofs.push_back(tagToIdCompact[1][edgeids[i]][j]);
+        }
+      }
+    }
+
+    int numFaceIds = faceids.size();
+    if (tagToIdCompact.size() > 2) {
+      for (int i=0; i<numFaceIds; ++i) {
+        int numdofs = tagToIdCompact[2][faceids[i]].size();
+        for (int j=0; j<numdofs; ++j) {
+          bdrydofs.push_back(tagToIdCompact[2][faceids[i]][j]);
+        }
+      }
+    }
+
+    //for (unsigned i=0; i<bdrydofs.size(); ++i) {
+    //  std::cout << "\ndofid = " << bdrydofs[i];
+    //}
+
+    return bdrydofs;
+
+  }
+
+  /** \brief Returns the degrees of freedom on reference cell sides.
+  */
+  const std::vector<std::vector<int> > & getBoundaryDofs() const {
+    return sideDofs_;
+  }
+
+  /** \brief Computes coordinates of degrees of freedom on cells.
+  */
+  void computeDofCoords(scalar_view dofCoords,
+                        const scalar_view cellNodes) const {
+    // Map reference DOF locations to physical space.
+    ct::mapToPhysicalFrame(dofCoords, dofPoints_, cellNodes, *cellTopo_);
+  }
+
+  void mapRefPointsToPhysical(scalar_view px,
+                              const scalar_view rx) const {
+    // Map input reference cell points to cells in physical space.
+    ct::mapToPhysicalFrame(px, rx, cellNodes_, *cellTopo_);
+  }
+
+}; // FE_CURL
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/TOOLS/fieldhelperK.hpp b/packages/rol/example/PDE-OPT/TOOLS/fieldhelperK.hpp
new file mode 100644
index 000000000000..18df3c423c86
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/TOOLS/fieldhelperK.hpp
@@ -0,0 +1,207 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  fieldhelperK.hpp
+    \brief
+*/
+
+#ifndef FIELDHELPERK_HPP
+#define FIELDHELPERK_HPP
+
+#include "Kokkos_DynRankView.hpp"
+#include "ROL_Ptr.hpp"
+
+namespace FieldUtils {
+
+struct FieldInfo {
+  const int numFields, numDofs;
+  const std::vector<int> numFieldDofs;
+  const std::vector<std::vector<int>> fieldPattern;
+
+  FieldInfo(const int numFields_, const int numDofs_,
+            const std::vector<int> &numFieldDofs_,
+            const std::vector<std::vector<int>> &fieldPattern_)
+    : numFields(numFields_), numDofs(numDofs_),
+      numFieldDofs(numFieldDofs_), fieldPattern(fieldPattern_) {}
+};
+
+template<typename Real, typename DeviceType>
+inline void splitFieldCoeff(std::vector<Kokkos::DynRankView<Real,DeviceType>> &U,
+                            const Kokkos::DynRankView<Real,DeviceType>        u_coeff,
+                            const ROL::Ptr<const FieldInfo>                   &info) {
+  const int numFields = info->numFields;
+  U.resize(numFields);
+  const int c = u_coeff.extent_int(0);
+  for (int i=0; i<numFields; ++i) {
+    const int numFieldDofs = info->numFieldDofs[i];
+    U[i] = Kokkos::DynRankView<Real,DeviceType>("U",c,numFieldDofs);
+    for (int j=0; j<c; ++j) {
+      for (int k=0; k<numFieldDofs; ++k) {
+        U[i](j,k) = u_coeff(j,info->fieldPattern[i][k]);
+      }
+    }
+  }
+}
+
+template<typename Real, typename DeviceType>
+inline void splitFieldCoeff(std::vector<std::vector<Kokkos::DynRankView<Real,DeviceType>>> &J,
+                            const Kokkos::DynRankView<Real,DeviceType>                     jac,
+                            const ROL::Ptr<const FieldInfo>                                &rowInfo,
+                            const ROL::Ptr<const FieldInfo>                                &colInfo) {
+  const int rowNumFields = rowInfo->numFields;
+  const int colNumFields = colInfo->numFields;
+  J.resize(rowNumFields);
+  const int c = jac.extent_int(0);
+  for (int i=0; i<rowNumFields; ++i) {
+    const int rowNumFieldDofs = rowInfo->numFieldDofs[i];
+    J[i].resize(colNumFields,ROL::nullPtr);
+    for (int j=0; j<colNumFields; ++j) {
+      const int colNumFieldDofs = colInfo->numFieldDofs[j];
+      J[i][j] = Kokkos::DynRankView<Real,DeviceType>("J",c,rowNumFieldDofs,colNumFieldDofs);
+      for (int k=0; k<c; ++k) {
+        for (int l=0; l<rowNumFieldDofs; ++l) {
+          for (int m=0; m<colNumFieldDofs; ++m) {
+            J[i][j](k,l,m) = jac(k,rowInfo->fieldPattern[i][l],colInfo->fieldPattern[j][m]);
+          }
+        }
+      }
+    }
+  }
+}
+
+template<typename Real, typename DeviceType>
+inline void combineFieldCoeff(Kokkos::DynRankView<Real,DeviceType>                     &res,
+                              const std::vector<Kokkos::DynRankView<Real,DeviceType>>  &R,
+                              const ROL::Ptr<const FieldInfo>                          &info) {
+  const int numFields = info->numFields;
+  const int c = R[0].extent_int(0);  // number of cells
+  res = Kokkos::DynRankView<Real,DeviceType>("res", c, info->numDofs);
+  for (int i=0; i<numFields; ++i) {
+    const int numFieldDofs = info->numFieldDofs[i];
+    for (int j=0; j<c; ++j) {
+      for (int k=0; k<numFieldDofs; ++k) {
+        res(j,info->fieldPattern[i][k]) = R[i](j,k);
+      }
+    }
+  }
+}
+
+template<typename Real, typename DeviceType>
+inline void combineFieldCoeff(Kokkos::DynRankView<Real,DeviceType>                                 &jac,
+                              const std::vector<std::vector<Kokkos::DynRankView<Real,DeviceType>>> &J,
+                              const ROL::Ptr<const FieldInfo>                                      &rowInfo,
+                              const ROL::Ptr<const FieldInfo>                                      &colInfo) {
+  const int rowNumFields = rowInfo->numFields;
+  const int colNumFields = colInfo->numFields;
+  const int c = J[0][0].extent_int(0);  // number of cells
+  jac = Kokkos::DynRankView<Real,DeviceType>("jac", c, rowInfo->numDofs, colInfo->numDofs);
+  for (int i=0; i<rowNumFields; ++i) {
+    const int rowNumFieldDofs = rowInfo->numFieldDofs[i];
+    for (int j=0; j<colNumFields; ++j) {
+      const int colNumFieldDofs = colInfo->numFieldDofs[j];
+      for (int k=0; k<c; ++k) {
+        for (int l=0; l<rowNumFieldDofs; ++l) {
+          for (int m=0; m<colNumFieldDofs; ++m) {
+            jac(k,rowInfo->fieldPattern[i][l],colInfo->fieldPattern[j][m]) = J[i][j](k,l,m);
+          }
+        }
+      }
+    }
+  }
+}
+
+}
+
+template<class Real, class DeviceType>
+class FieldHelper {
+  private:
+  const int numFields_, numDofs_;
+  const std::vector<int> numFieldDofs_;
+  const std::vector<std::vector<int>> fieldPattern_;
+
+  public:
+  using scalar_view =  Kokkos::DynRankView<Real,DeviceType>;
+
+  FieldHelper(const int numFields, const int numDofs,
+              const std::vector<int> &numFieldDofs,
+              const std::vector<std::vector<int>> &fieldPattern)
+    : numFields_(numFields), numDofs_(numDofs),
+      numFieldDofs_(numFieldDofs), fieldPattern_(fieldPattern) {}
+
+  void splitFieldCoeff(std::vector<scalar_view> & U,
+                       const scalar_view u_coeff) const {
+    U.resize(numFields_);
+    int  c = u_coeff.extent_int(0);
+    for (int i=0; i<numFields_; ++i) {
+      U[i] = scalar_view("U",c,numFieldDofs_[i]);
+      for (int j=0; j<c; ++j) {
+        for (int k=0; k<numFieldDofs_[i]; ++k) {
+          //U[i](j,k) = u_coeff(j,offset[i]+k);
+          U[i](j,k) = u_coeff(j,fieldPattern_[i][k]);
+        }
+      }
+    }
+  }
+
+  void splitFieldCoeff(std::vector<std::vector<scalar_view>> & J,
+                       const scalar_view jac) const {
+    J.resize(numFields_);
+    int  c = jac.extent_int(0);
+    for (int i=0; i<numFields_; ++i) {
+      J[i].resize(numFields_,ROL::nullPtr);
+      for (int j=0; j<numFields_; ++j) {
+        J[i][j] = scalar_view("J",c,numFieldDofs_[i],numFieldDofs_[j]);
+        for (int k=0; k<c; ++k) {
+          for (int l=0; l<numFieldDofs_[i]; ++l) {
+            for (int m=0; m<numFieldDofs_[j]; ++m) {
+              J[i][j](k,l,m) = jac(k,fieldPattern_[i][l],fieldPattern_[j][m]);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  void combineFieldCoeff(scalar_view & res,
+                         const std::vector<scalar_view> & R) const {
+    int c = R[0].extent_int(0);  // number of cells
+    res = scalar_view("res",c, numDofs_);        
+    for (int i=0; i<numFields_; ++i) {
+      for (int j=0; j<c; ++j) {
+        for (int k=0; k<numFieldDofs_[i]; ++k) {
+          res(j,fieldPattern_[i][k]) = R[i](j,k);
+        }
+      }
+    }
+  }
+
+  void combineFieldCoeff(scalar_view & jac,
+                         const std::vector<std::vector<scalar_view>> & J) const {
+    int c = J[0][0].extent_int(0);  // number of cells
+    jac = scalar_view("jac",c, numDofs_, numDofs_);        
+    for (int i=0; i<numFields_; ++i) {
+      for (int j=0; j<numFields_; ++j) {
+        for (int k=0; k<c; ++k) {
+          for (int l=0; l<numFieldDofs_[i]; ++l) {
+            for (int m=0; m<numFieldDofs_[j]; ++m) {
+              jac(k,fieldPattern_[i][l],fieldPattern_[j][m]) = J[i][j](k,l,m);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  int numFields(void) const {
+    return numFields_;
+  }
+
+};
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/TOOLS/integralconstraintK.hpp b/packages/rol/example/PDE-OPT/TOOLS/integralconstraintK.hpp
new file mode 100644
index 000000000000..75d6019d7b27
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/TOOLS/integralconstraintK.hpp
@@ -0,0 +1,185 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+#ifndef PDE_INTEGRALCONSTRAINT_HPP
+#define PDE_INTEGRALCONSTRAINT_HPP
+
+#include "ROL_Constraint_SimOpt.hpp"
+#include "integralobjectiveK.hpp"
+
+template<class Real, class DeviceType>
+class IntegralConstraint : public ROL::Constraint_SimOpt<Real> {
+private:
+  const ROL::Ptr<QoI<Real,DeviceType>> qoi_;
+  const ROL::Ptr<Assembler<Real,DeviceType>> assembler_;
+  ROL::Ptr<IntegralObjective<Real,DeviceType>> obj_;
+  ROL::Ptr<ROL::Vector<Real>> dualUVector_;
+  ROL::Ptr<ROL::Vector<Real>> dualZVector_;
+  bool isUvecInitialized_;
+  bool isZvecInitialized_;
+
+public:
+  IntegralConstraint(const ROL::Ptr<QoI<Real,DeviceType>> &qoi,
+                     const ROL::Ptr<Assembler<Real,DeviceType>> &assembler)
+    : qoi_(qoi), assembler_(assembler),
+      isUvecInitialized_(false), isZvecInitialized_(false) {
+    obj_ = ROL::makePtr<IntegralObjective<Real,DeviceType>>(qoi,assembler);
+  }
+
+  void setParameter(const std::vector<Real> &param) {
+    ROL::Constraint_SimOpt<Real>::setParameter(param);
+    obj_->setParameter(param);
+  }
+
+  void value(ROL::Vector<Real> &c, const ROL::Vector<Real> &u, const ROL::Vector<Real> &z, Real &tol) {
+    ROL::Ptr<std::vector<Real>> cp =
+      (dynamic_cast<ROL::StdVector<Real>&>(c)).getVector();
+    (*cp)[0] = obj_->value(u,z,tol);
+  }
+
+  void applyJacobian_1(ROL::Vector<Real> &jv, const ROL::Vector<Real> &v,
+                 const ROL::Vector<Real> &u, const ROL::Vector<Real> &z, Real &tol ) {
+    if ( !isUvecInitialized_ ) {
+      dualUVector_ = u.dual().clone();
+      isUvecInitialized_ = true;
+    }
+    ROL::Ptr<std::vector<Real>> jvp =
+      (dynamic_cast<ROL::StdVector<Real>&>(jv)).getVector();
+    obj_->gradient_1(*dualUVector_,u,z,tol);
+    //(*jvp)[0] = v.dot(dualUVector_->dual());
+    (*jvp)[0] = v.apply(*dualUVector_);
+  }
+
+  void applyJacobian_2(ROL::Vector<Real> &jv, const ROL::Vector<Real> &v,
+                 const ROL::Vector<Real> &u, const ROL::Vector<Real> &z, Real &tol ) {
+    if ( !isZvecInitialized_ ) {
+      dualZVector_ = z.dual().clone();
+      isZvecInitialized_ = true;
+    }
+    ROL::Ptr<std::vector<Real>> jvp =
+      (dynamic_cast<ROL::StdVector<Real>&>(jv)).getVector();
+    obj_->gradient_2(*dualZVector_,u,z,tol);
+    //(*jvp)[0] = v.dot(dualZVector_->dual());
+    (*jvp)[0] = v.apply(*dualZVector_);
+  }
+
+  void applyAdjointJacobian_1(ROL::Vector<Real> &jv, const ROL::Vector<Real> &v,
+                        const ROL::Vector<Real> &u,  const ROL::Vector<Real> &z, Real &tol ) {
+    ROL::Ptr<const std::vector<Real>> vp =
+      (dynamic_cast<const ROL::StdVector<Real>&>(v)).getVector();
+    obj_->gradient_1(jv,u,z,tol);
+    jv.scale((*vp)[0]);
+  }
+
+  void applyAdjointJacobian_2(ROL::Vector<Real> &jv, const ROL::Vector<Real> &v,
+                        const ROL::Vector<Real> &u,  const ROL::Vector<Real> &z, Real &tol ) {
+    ROL::Ptr<const std::vector<Real>> vp =
+      (dynamic_cast<const ROL::StdVector<Real>&>(v)).getVector();
+    obj_->gradient_2(jv,u,z,tol);
+    jv.scale((*vp)[0]);
+  }
+
+  void applyAdjointHessian_11( ROL::Vector<Real> &ahwv,
+                         const ROL::Vector<Real> &w, const ROL::Vector<Real> &v, 
+                         const ROL::Vector<Real> &u, const ROL::Vector<Real> &z, Real &tol ) {
+    ROL::Ptr<const std::vector<Real>> wp =
+      (dynamic_cast<const ROL::StdVector<Real>&>(w)).getVector();
+    obj_->hessVec_11(ahwv,v,u,z,tol);
+    ahwv.scale((*wp)[0]);
+  }
+
+  void applyAdjointHessian_12( ROL::Vector<Real> &ahwv,
+                         const ROL::Vector<Real> &w, const ROL::Vector<Real> &v, 
+                         const ROL::Vector<Real> &u, const ROL::Vector<Real> &z, Real &tol ) {
+    ROL::Ptr<const std::vector<Real>> wp =
+      (dynamic_cast<const ROL::StdVector<Real>&>(w)).getVector();
+    obj_->hessVec_12(ahwv,v,u,z,tol);
+    ahwv.scale((*wp)[0]);
+  }
+
+  void applyAdjointHessian_21( ROL::Vector<Real> &ahwv,
+                         const ROL::Vector<Real> &w, const ROL::Vector<Real> &v, 
+                         const ROL::Vector<Real> &u, const ROL::Vector<Real> &z, Real &tol ) {
+    ROL::Ptr<const std::vector<Real>> wp =
+      (dynamic_cast<const ROL::StdVector<Real>&>(w)).getVector();
+    obj_->hessVec_21(ahwv,v,u,z,tol);
+    ahwv.scale((*wp)[0]);
+  }
+
+  void applyAdjointHessian_22( ROL::Vector<Real> &ahwv,
+                         const ROL::Vector<Real> &w, const ROL::Vector<Real> &v, 
+                         const ROL::Vector<Real> &u, const ROL::Vector<Real> &z, Real &tol ) {
+    ROL::Ptr<const std::vector<Real>> wp =
+      (dynamic_cast<const ROL::StdVector<Real>&>(w)).getVector();
+    obj_->hessVec_22(ahwv,v,u,z,tol);
+    ahwv.scale((*wp)[0]);
+  }
+}; // class IntegralConstraint
+
+
+template<class Real, class DeviceType>
+class IntegralOptConstraint : public ROL::Constraint<Real> {
+private:
+  const ROL::Ptr<QoI<Real,DeviceType>> qoi_;
+  const ROL::Ptr<Assembler<Real,DeviceType>> assembler_;
+  ROL::Ptr<IntegralOptObjective<Real,DeviceType>> obj_;
+  ROL::Ptr<ROL::Vector<Real>> dualZVector_;
+  bool isZvecInitialized_;
+
+public:
+  IntegralOptConstraint(const ROL::Ptr<QoI<Real,DeviceType>> &qoi,
+                        const ROL::Ptr<Assembler<Real,DeviceType>> &assembler)
+    : qoi_(qoi), assembler_(assembler), isZvecInitialized_(false) {
+    obj_ = ROL::makePtr<IntegralOptObjective<Real,DeviceType>>(qoi,assembler);
+  }
+
+  void setParameter(const std::vector<Real> &param) {
+    ROL::Constraint<Real>::setParameter(param);
+    obj_->setParameter(param);
+  }
+
+  void value(ROL::Vector<Real> &c, const ROL::Vector<Real> &z, Real &tol) {
+    ROL::Ptr<std::vector<Real>> cp =
+      (dynamic_cast<ROL::StdVector<Real>&>(c)).getVector();
+    (*cp)[0] = obj_->value(z,tol);
+  }
+
+  void applyJacobian(ROL::Vector<Real> &jv, const ROL::Vector<Real> &v,
+                     const ROL::Vector<Real> &z, Real &tol ) {
+    if ( !isZvecInitialized_ ) {
+      dualZVector_ = z.dual().clone();
+      isZvecInitialized_ = true;
+    }
+    ROL::Ptr<std::vector<Real>> jvp =
+      (dynamic_cast<ROL::StdVector<Real>&>(jv)).getVector();
+    obj_->gradient(*dualZVector_,z,tol);
+    //(*jvp)[0] = v.dot(dualZVector_->dual());
+    (*jvp)[0] = v.apply(*dualZVector_);
+  }
+
+  void applyAdjointJacobian(ROL::Vector<Real> &jv, const ROL::Vector<Real> &v,
+                            const ROL::Vector<Real> &z, Real &tol ) {
+    ROL::Ptr<const std::vector<Real>> vp =
+      (dynamic_cast<const ROL::StdVector<Real>&>(v)).getVector();
+    obj_->gradient(jv,z,tol);
+    jv.scale((*vp)[0]);
+  }
+
+  void applyAdjointHessian(ROL::Vector<Real> &ahwv,
+                           const ROL::Vector<Real> &w, const ROL::Vector<Real> &v, 
+                           const ROL::Vector<Real> &z, Real &tol ) {
+    ROL::Ptr<const std::vector<Real>> wp =
+      (dynamic_cast<const ROL::StdVector<Real>&>(w)).getVector();
+    obj_->hessVec(ahwv,v,z,tol);
+    ahwv.scale((*wp)[0]);
+  }
+
+}; // class IntegralOptConstraint
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/TOOLS/integralobjectiveK.hpp b/packages/rol/example/PDE-OPT/TOOLS/integralobjectiveK.hpp
new file mode 100644
index 000000000000..ceaa2013a3d8
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/TOOLS/integralobjectiveK.hpp
@@ -0,0 +1,1022 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+#ifndef PDE_INTEGRALOBJECTIVEK_HPP
+#define PDE_INTEGRALOBJECTIVEK_HPP
+
+#include "ROL_Objective_SimOpt.hpp"
+#include "qoiK.hpp"
+#include "assemblerK.hpp"
+#include "pdevectorK.hpp"
+
+// Do not instantiate the template in this translation unit.
+extern template class Assembler<double, Kokkos::HostSpace>;
+
+template<class Real, class DeviceType>
+class IntegralObjective : public ROL::Objective_SimOpt<Real> {
+public:
+  using QoI_type = QoI<Real, DeviceType>;
+  using Assembler_type = Assembler<Real, DeviceType>;
+private:
+  const ROL::Ptr<QoI_type> qoi_;
+  const ROL::Ptr<Assembler_type> assembler_;
+
+  ROL::Ptr<Tpetra::MultiVector<> > vecG1_;
+  ROL::Ptr<Tpetra::MultiVector<> > vecG2_;
+  ROL::Ptr<std::vector<Real> >     vecG3_;
+  ROL::Ptr<Tpetra::MultiVector<> > vecH11_;
+  ROL::Ptr<Tpetra::MultiVector<> > vecH12_;
+  ROL::Ptr<Tpetra::MultiVector<> > vecH13_;
+  ROL::Ptr<Tpetra::MultiVector<> > vecH21_;
+  ROL::Ptr<Tpetra::MultiVector<> > vecH22_;
+  ROL::Ptr<Tpetra::MultiVector<> > vecH23_;
+  ROL::Ptr<std::vector<Real> >     vecH31_;
+  ROL::Ptr<std::vector<Real> >     vecH32_;
+  ROL::Ptr<std::vector<Real> >     vecH33_;
+
+public:
+  IntegralObjective(const ROL::Ptr<QoI_type > &qoi,
+                    const ROL::Ptr<Assembler_type> &assembler)
+    : qoi_(qoi), assembler_(assembler) {}
+
+  void setParameter(const std::vector<Real> &param) {
+    ROL::Objective_SimOpt<Real>::setParameter(param);
+    qoi_->setParameter(param);
+  }
+
+  Real value(const ROL::Vector<Real> &u, const ROL::Vector<Real> &z, Real &tol) override {
+    ROL::Ptr<const Tpetra::MultiVector<> > uf = getConstField(u);
+    ROL::Ptr<const Tpetra::MultiVector<> > zf = getConstField(z);
+    ROL::Ptr<const std::vector<Real> >     zp = getConstParameter(z);
+    return assembler_->assembleQoIValue(qoi_,uf,zf,zp);
+  }
+
+  void gradient_1(ROL::Vector<Real> &g, const ROL::Vector<Real> &u,
+                  const ROL::Vector<Real> &z, Real &tol ) override {
+    g.zero();
+    try {
+      ROL::Ptr<Tpetra::MultiVector<> >       gf = getField(g);
+      ROL::Ptr<const Tpetra::MultiVector<> > uf = getConstField(u);
+      ROL::Ptr<const Tpetra::MultiVector<> > zf = getConstField(z);
+      ROL::Ptr<const std::vector<Real> >     zp = getConstParameter(z);
+      assembler_->assembleQoIGradient1(vecG1_,qoi_,uf,zf,zp);
+      gf->scale(static_cast<Real>(1),*vecG1_);
+    }
+    catch ( Exception::Zero & ez ) {
+      g.zero();
+    }
+    catch ( Exception::NotImplemented & eni ) {
+      ROL::Objective_SimOpt<Real>::gradient_1(g,u,z,tol);
+    }
+  }
+
+  void gradient_2(ROL::Vector<Real> &g, const ROL::Vector<Real> &u,
+                  const ROL::Vector<Real> &z, Real &tol ) override {
+    int NotImplemented(0), IsZero(0);
+    g.zero();
+    // Compute control field gradient
+    try {
+      // Get state and control vectors
+      ROL::Ptr<const Tpetra::MultiVector<> > uf = getConstField(u);
+      ROL::Ptr<const Tpetra::MultiVector<> > zf = getConstField(z);
+      ROL::Ptr<const std::vector<Real> >     zp = getConstParameter(z);
+      ROL::Ptr<Tpetra::MultiVector<> >       gf = getField(g);
+      assembler_->assembleQoIGradient2(vecG2_,qoi_,uf,zf,zp);
+      gf->scale(static_cast<Real>(1),*vecG2_);
+    }
+    catch ( Exception::Zero & ez ) {
+      IsZero++;
+    }
+    catch ( Exception::NotImplemented & eni ) {
+      NotImplemented++;
+    }
+    // Compute control parameter gradient
+    try {
+      // Get state and control vectors
+      ROL::Ptr<const Tpetra::MultiVector<> > uf = getConstField(u);
+      ROL::Ptr<const Tpetra::MultiVector<> > zf = getConstField(z);
+      ROL::Ptr<const std::vector<Real> >     zp = getConstParameter(z);
+      ROL::Ptr<std::vector<Real> >           gp = getParameter(g);
+      assembler_->assembleQoIGradient3(vecG3_,qoi_,uf,zf,zp);
+      gp->assign(vecG3_->begin(),vecG3_->end());
+    }
+    catch ( Exception::Zero & ez ) {
+      IsZero++;
+    }
+    catch ( Exception::NotImplemented & eni ) {
+      NotImplemented++;
+    }
+    // Zero gradient
+    if ( IsZero == 2 || (IsZero == 1 && NotImplemented == 1) ) {
+      g.zero();
+    }
+    // Not Implemented
+    if ( NotImplemented == 2 ) {
+      ROL::Objective_SimOpt<Real>::gradient_2(g,u,z,tol);
+    }
+  }
+
+  void hessVec_11( ROL::Vector<Real> &hv, const ROL::Vector<Real> &v, 
+             const ROL::Vector<Real> &u,  const ROL::Vector<Real> &z, Real &tol ) override {
+    hv.zero();
+    try {
+      ROL::Ptr<Tpetra::MultiVector<> >      hvf = getField(hv);
+      ROL::Ptr<const Tpetra::MultiVector<> > vf = getConstField(v);
+      ROL::Ptr<const Tpetra::MultiVector<> > uf = getConstField(u);
+      ROL::Ptr<const Tpetra::MultiVector<> > zf = getConstField(z);
+      ROL::Ptr<const std::vector<Real> >     zp = getConstParameter(z);
+      assembler_->assembleQoIHessVec11(vecH11_,qoi_,vf,uf,zf,zp);
+      hvf->scale(static_cast<Real>(1),*vecH11_);
+    }
+    catch (Exception::Zero &ez) {
+      hv.zero();
+    }
+    catch (Exception::NotImplemented &eni) {
+      ROL::Objective_SimOpt<Real>::hessVec_11(hv,v,u,z,tol);
+    }
+  }
+
+  void hessVec_12( ROL::Vector<Real> &hv, const ROL::Vector<Real> &v, 
+                   const ROL::Vector<Real> &u, const ROL::Vector<Real> &z, Real &tol ) override {
+    int NotImplemented(0), IsZero(0);
+    hv.zero();
+    // Compute state field/control field hessvec
+    try {
+      ROL::Ptr<Tpetra::MultiVector<> >      hvf = getField(hv);
+      ROL::Ptr<const Tpetra::MultiVector<> > vf = getConstField(v);
+      ROL::Ptr<const Tpetra::MultiVector<> > uf = getConstField(u);
+      ROL::Ptr<const Tpetra::MultiVector<> > zf = getConstField(z);
+      ROL::Ptr<const std::vector<Real> >     zp = getConstParameter(z);
+      assembler_->assembleQoIHessVec12(vecH12_,qoi_,vf,uf,zf,zp);
+      hvf->scale(static_cast<Real>(1),*vecH12_);
+    }
+    catch (Exception::Zero &ez) {
+      hv.zero();
+      IsZero++;
+    }
+    catch (Exception::NotImplemented &eni) {
+      hv.zero();
+      NotImplemented++;
+    }
+    // Compute state field/control parameter hessvec
+    try {
+      // Get state and control vectors
+      ROL::Ptr<Tpetra::MultiVector<> >      hvf = getField(hv);
+      ROL::Ptr<const std::vector<Real> >     vp = getConstParameter(v);
+      ROL::Ptr<const Tpetra::MultiVector<> > uf = getConstField(u);
+      ROL::Ptr<const Tpetra::MultiVector<> > zf = getConstField(z);
+      ROL::Ptr<const std::vector<Real> >     zp = getConstParameter(z);
+      assembler_->assembleQoIHessVec13(vecH13_,qoi_,vp,uf,zf,zp);
+      hvf->update(static_cast<Real>(1),*vecH13_,static_cast<Real>(1));
+    }
+    catch ( Exception::Zero & ez ) {
+      IsZero++;
+    }
+    catch ( Exception::NotImplemented & eni ) {
+      NotImplemented++;
+    }
+    // Zero hessvec
+    if ( IsZero == 2 || (IsZero == 1 && NotImplemented == 1) ) {
+      hv.zero();
+    }
+    // Not Implemented
+    if ( NotImplemented == 2 ) {
+      ROL::Objective_SimOpt<Real>::hessVec_12(hv,v,u,z,tol);
+    }
+  }
+
+  void hessVec_21( ROL::Vector<Real> &hv, const ROL::Vector<Real> &v, 
+                   const ROL::Vector<Real> &u, const ROL::Vector<Real> &z, Real &tol ) override {
+    int NotImplemented(0), IsZero(0);
+    hv.zero();
+    // Compute control field/state field hessvec
+    try {
+      // Get state and control vectors
+      ROL::Ptr<Tpetra::MultiVector<> >      hvf = getField(hv);
+      ROL::Ptr<const Tpetra::MultiVector<> > vf = getConstField(v);
+      ROL::Ptr<const Tpetra::MultiVector<> > uf = getConstField(u);
+      ROL::Ptr<const Tpetra::MultiVector<> > zf = getConstField(z);
+      ROL::Ptr<const std::vector<Real> >     zp = getConstParameter(z);
+      assembler_->assembleQoIHessVec21(vecH21_,qoi_,vf,uf,zf,zp);
+      hvf->scale(static_cast<Real>(1),*vecH21_);
+    }
+    catch ( Exception::Zero & ez ) {
+      hv.zero();
+      IsZero++;
+    }
+    catch ( Exception::NotImplemented & eni ) {
+      hv.zero();
+      NotImplemented++;
+    }
+    // Compute control parameter/state field hessvec
+    try {
+      // Get state and control vectors
+      ROL::Ptr<std::vector<Real> >          hvp = getParameter(hv);
+      ROL::Ptr<const Tpetra::MultiVector<> > vf = getConstField(v);
+      ROL::Ptr<const Tpetra::MultiVector<> > uf = getConstField(u);
+      ROL::Ptr<const Tpetra::MultiVector<> > zf = getConstField(z);
+      ROL::Ptr<const std::vector<Real> >     zp = getConstParameter(z);
+      assembler_->assembleQoIHessVec31(vecH31_,qoi_,vf,uf,zf,zp);
+      hvp->assign(vecH31_->begin(),vecH31_->end());
+    }
+    catch ( Exception::Zero & ez ) {
+      IsZero++;
+    }
+    catch ( Exception::NotImplemented & eni ) {
+      NotImplemented++;
+    }
+    // Zero hessvec
+    if ( IsZero == 2 || (IsZero == 1 && NotImplemented == 1) ) {
+      hv.zero();
+    }
+    // Not Implemented
+    if ( NotImplemented == 2 ) {
+      ROL::Objective_SimOpt<Real>::hessVec_21(hv,v,u,z,tol);
+    }
+  }
+
+  void hessVec_22( ROL::Vector<Real> &hv, const ROL::Vector<Real> &v, 
+             const ROL::Vector<Real> &u,  const ROL::Vector<Real> &z, Real &tol ) override {
+    int NotImplemented(0), IsZero(0);
+    hv.zero();
+    // Compute control field/field hessvec
+    try {
+      ROL::Ptr<Tpetra::MultiVector<> >      hvf = getField(hv);
+      ROL::Ptr<const Tpetra::MultiVector<> > vf = getConstField(v);
+      ROL::Ptr<const Tpetra::MultiVector<> > uf = getConstField(u);
+      ROL::Ptr<const Tpetra::MultiVector<> > zf = getConstField(z);
+      ROL::Ptr<const std::vector<Real> >     zp = getConstParameter(z);
+      assembler_->assembleQoIHessVec22(vecH22_,qoi_,vf,uf,zf,zp);
+      hvf->scale(static_cast<Real>(1),*vecH22_);
+    }
+    catch (Exception::Zero &ez) {
+      hv.zero();
+      IsZero++;
+    }
+    catch (Exception::NotImplemented &eni) {
+      hv.zero();
+      NotImplemented++;
+    }
+    // Compute control field/parameter hessvec
+    try {
+      // Get state and control vectors
+      ROL::Ptr<Tpetra::MultiVector<> >      hvf = getField(hv);
+      ROL::Ptr<const std::vector<Real> >     vp = getConstParameter(v);
+      ROL::Ptr<const Tpetra::MultiVector<> > uf = getConstField(u);
+      ROL::Ptr<const Tpetra::MultiVector<> > zf = getConstField(z);
+      ROL::Ptr<const std::vector<Real> >     zp = getConstParameter(z);
+      assembler_->assembleQoIHessVec23(vecH23_,qoi_,vp,uf,zf,zp);
+      hvf->update(static_cast<Real>(1),*vecH23_,static_cast<Real>(1));
+    }
+    catch ( Exception::Zero & ez ) {
+      IsZero++;
+    }
+    catch ( Exception::NotImplemented & eni ) {
+      NotImplemented++;
+    }
+    // Compute control parameter/field hessvec
+    try {
+      ROL::Ptr<std::vector<Real> >          hvp = getParameter(hv);
+      ROL::Ptr<const Tpetra::MultiVector<> > vf = getConstField(v);
+      ROL::Ptr<const Tpetra::MultiVector<> > uf = getConstField(u);
+      ROL::Ptr<const Tpetra::MultiVector<> > zf = getConstField(z);
+      ROL::Ptr<const std::vector<Real> >     zp = getConstParameter(z);
+      assembler_->assembleQoIHessVec32(vecH32_,qoi_,vf,uf,zf,zp);
+      hvp->assign(vecH32_->begin(),vecH32_->end());
+    }
+    catch (Exception::Zero &ez) {
+      ROL::Ptr<std::vector<Real> > hvp = getParameter(hv);
+      if ( hvp != ROL::nullPtr ) {
+        const int size = hvp->size();
+        hvp->assign(size,static_cast<Real>(0));
+      }
+      IsZero++;
+    }
+    catch (Exception::NotImplemented &eni) {
+      ROL::Ptr<std::vector<Real> > hvp = getParameter(hv);
+      if ( hvp != ROL::nullPtr ) {
+        const int size = hvp->size();
+        hvp->assign(size,static_cast<Real>(0));
+      }
+      NotImplemented++;
+    }
+    // Compute control parameter/parameter hessvec
+    try {
+      ROL::Ptr<std::vector<Real> >          hvp = getParameter(hv);
+      ROL::Ptr<const std::vector<Real> >     vp = getConstParameter(v);
+      ROL::Ptr<const Tpetra::MultiVector<> > uf = getConstField(u);
+      ROL::Ptr<const Tpetra::MultiVector<> > zf = getConstField(z);
+      ROL::Ptr<const std::vector<Real> >     zp = getConstParameter(z);
+      assembler_->assembleQoIHessVec33(vecH33_,qoi_,vp,uf,zf,zp);
+      const int size = hvp->size();
+      for (int i = 0; i < size; ++i) {
+        (*hvp)[i] += (*vecH33_)[i];
+      }
+    }
+    catch (Exception::Zero &ez) {
+      IsZero++;
+    }
+    catch (Exception::NotImplemented &eni) {
+      NotImplemented++;
+    }
+
+    // Zero hessvec
+    if ( IsZero > 0 && (IsZero + NotImplemented == 4) ) {
+      hv.zero();
+    }
+    // Not Implemented
+    if ( NotImplemented == 4 ) {
+      ROL::Objective_SimOpt<Real>::hessVec_22(hv,v,u,z,tol);
+    }
+  }
+
+private: // Vector accessor functions
+
+  ROL::Ptr<const Tpetra::MultiVector<> > getConstField(const ROL::Vector<Real> &x) const {
+    ROL::Ptr<const Tpetra::MultiVector<> > xp;
+    try {
+      xp = dynamic_cast<const ROL::TpetraMultiVector<Real>&>(x).getVector();
+    }
+    catch (std::exception &e) {
+      try {
+        ROL::Ptr<const ROL::TpetraMultiVector<Real> > xvec
+          = dynamic_cast<const PDE_OptVector<Real>&>(x).getField();
+        if (xvec == ROL::nullPtr) {
+          xp = ROL::nullPtr;
+        }
+        else {
+          xp = xvec->getVector();
+        }
+      }
+      catch (std::exception &ee) {
+        xp = ROL::nullPtr;
+      }
+    }
+    return xp;
+  }
+
+  ROL::Ptr<Tpetra::MultiVector<> > getField(ROL::Vector<Real> &x) const {
+    ROL::Ptr<Tpetra::MultiVector<> > xp;
+    try {
+      xp = dynamic_cast<ROL::TpetraMultiVector<Real>&>(x).getVector();
+    }
+    catch (std::exception &e) {
+      try {
+        ROL::Ptr<ROL::TpetraMultiVector<Real> > xvec
+          = dynamic_cast<PDE_OptVector<Real>&>(x).getField();
+        if ( xvec == ROL::nullPtr ) {
+          xp = ROL::nullPtr;
+        }
+        else {
+          xp = xvec->getVector();
+        }
+      }
+      catch (std::exception &ee) {
+        xp = ROL::nullPtr;
+      }
+    }
+    return xp;
+  }
+
+  ROL::Ptr<const std::vector<Real> > getConstParameter(const ROL::Vector<Real> &x) const {
+    ROL::Ptr<const std::vector<Real> > xp;
+    try {
+      xp = dynamic_cast<const ROL::StdVector<Real>&>(x).getVector();
+    }
+    catch (std::exception &e) {
+      try {
+        ROL::Ptr<const ROL::StdVector<Real> > xvec
+          = dynamic_cast<const PDE_OptVector<Real>&>(x).getParameter();
+        if ( xvec == ROL::nullPtr ) {
+          xp = ROL::nullPtr;
+        }
+        else {
+          xp = xvec->getVector();
+        }
+      }
+      catch (std::exception &ee) {
+        xp = ROL::nullPtr;
+      }
+    }
+    return xp;
+  }
+
+  ROL::Ptr<std::vector<Real> > getParameter(ROL::Vector<Real> &x) const {
+    ROL::Ptr<std::vector<Real> > xp;
+    try {
+      xp = dynamic_cast<ROL::StdVector<Real>&>(x).getVector();
+    }
+    catch (std::exception &e) {
+      try {
+        ROL::Ptr<ROL::StdVector<Real> > xvec
+          = dynamic_cast<PDE_OptVector<Real>&>(x).getParameter();
+        if ( xvec == ROL::nullPtr ) {
+          xp = ROL::nullPtr;
+        }
+        else {
+          xp = xvec->getVector();
+        }
+      }
+      catch (std::exception &ee) {
+        xp = ROL::nullPtr;
+      }
+    }
+    return xp;
+  }
+};
+
+
+template<class Real, class DeviceType>
+class IntegralSimObjective : public ROL::Objective<Real> {
+public:
+  using QoI_type = QoI<Real, DeviceType>;
+  using Assembler_type = Assembler<Real, DeviceType>;
+private:
+  const ROL::Ptr<QoI_type> qoi_;
+  const ROL::Ptr<Assembler_type> assembler_;
+
+  ROL::Ptr<Tpetra::MultiVector<>> vecG1_;
+  ROL::Ptr<Tpetra::MultiVector<>> vecH11_;
+
+public:
+  IntegralSimObjective(const ROL::Ptr<QoI_type> &qoi,
+                       const ROL::Ptr<Assembler_type> &assembler)
+    : qoi_(qoi), assembler_(assembler) {}
+
+  void setParameter(const std::vector<Real> &param) {
+    ROL::Objective<Real>::setParameter(param);
+    qoi_->setParameter(param);
+  }
+
+  Real value(const ROL::Vector<Real> &u, Real &tol) override {
+    ROL::Ptr<const Tpetra::MultiVector<>> uf = getConstField(u);
+    return assembler_->assembleQoIValue(qoi_,uf,ROL::nullPtr,ROL::nullPtr);
+  }
+
+  void gradient(ROL::Vector<Real> &g,
+                const ROL::Vector<Real> &u, Real &tol ) override {
+    int NotImplemented(0), IsZero(0);
+    g.zero();
+    // Compute control field gradient
+    try {
+      // Get state and control vectors
+      ROL::Ptr<const Tpetra::MultiVector<>> uf = getConstField(u);
+      ROL::Ptr<Tpetra::MultiVector<>>       gf = getField(g);
+      assembler_->assembleQoIGradient1(vecG1_,qoi_,uf,ROL::nullPtr,ROL::nullPtr);
+      gf->scale(static_cast<Real>(1),*vecG1_);
+    }
+    catch ( Exception::Zero & ez ) {
+      IsZero++;
+    }
+    catch ( Exception::NotImplemented & eni ) {
+      NotImplemented++;
+    }
+    // Zero gradient
+    if ( IsZero == 1 ) {
+      g.zero();
+    }
+    // Not Implemented
+    if ( NotImplemented == 1 ) {
+      ROL::Objective<Real>::gradient(g,u,tol);
+    }
+  }
+
+  void hessVec(ROL::Vector<Real> &hv, const ROL::Vector<Real> &v, 
+               const ROL::Vector<Real> &u, Real &tol ) override {
+    int NotImplemented(0), IsZero(0);
+    hv.zero();
+    // Compute control field/field hessvec
+    try {
+      ROL::Ptr<Tpetra::MultiVector<> >      hvf = getField(hv);
+      ROL::Ptr<const Tpetra::MultiVector<> > vf = getConstField(v);
+      ROL::Ptr<const Tpetra::MultiVector<> > uf = getConstField(u);
+      assembler_->assembleQoIHessVec11(vecH11_,qoi_,vf,uf,ROL::nullPtr,ROL::nullPtr);
+      hvf->scale(static_cast<Real>(1),*vecH11_);
+    }
+    catch (Exception::Zero &ez) {
+      hv.zero();
+      IsZero++;
+    }
+    catch (Exception::NotImplemented &eni) {
+      hv.zero();
+      NotImplemented++;
+    }
+    // Zero hessvec
+    if ( IsZero == 1 ) {
+      hv.zero();
+    }
+    // Not Implemented
+    if ( NotImplemented == 1 ) {
+      ROL::Objective<Real>::hessVec(hv,v,u,tol);
+    }
+  }
+
+private: // Vector accessor functions
+
+  ROL::Ptr<const Tpetra::MultiVector<> > getConstField(const ROL::Vector<Real> &x) const {
+    ROL::Ptr<const Tpetra::MultiVector<> > xp;
+    try {
+      xp = dynamic_cast<const ROL::TpetraMultiVector<Real>&>(x).getVector();
+    }
+    catch (std::exception &e) {
+      try {
+        ROL::Ptr<const ROL::TpetraMultiVector<Real> > xvec
+          = dynamic_cast<const PDE_OptVector<Real>&>(x).getField();
+        if (xvec == ROL::nullPtr) {
+          xp = ROL::nullPtr;
+        }
+        else {
+          xp = xvec->getVector();
+        }
+      }
+      catch (std::exception &ee) {
+        xp = ROL::nullPtr;
+      }
+    }
+    return xp;
+  }
+
+  ROL::Ptr<Tpetra::MultiVector<> > getField(ROL::Vector<Real> &x) const {
+    ROL::Ptr<Tpetra::MultiVector<> > xp;
+    try {
+      xp = dynamic_cast<ROL::TpetraMultiVector<Real>&>(x).getVector();
+    }
+    catch (std::exception &e) {
+      try {
+        ROL::Ptr<ROL::TpetraMultiVector<Real> > xvec
+          = dynamic_cast<PDE_OptVector<Real>&>(x).getField();
+        if ( xvec == ROL::nullPtr ) {
+          xp = ROL::nullPtr;
+        }
+        else {
+          xp = xvec->getVector();
+        }
+      }
+      catch (std::exception &ee) {
+        xp = ROL::nullPtr;
+      }
+    }
+    return xp;
+  }
+};
+
+template<class Real, class DeviceType>
+class IntegralOptObjective : public ROL::Objective<Real> {
+public:
+  using QoI_type = QoI<Real, DeviceType>;
+  using Assembler_type = Assembler<Real, DeviceType>;
+private:
+  const ROL::Ptr<QoI_type> qoi_;
+  const ROL::Ptr<Assembler_type> assembler_;
+
+  ROL::Ptr<Tpetra::MultiVector<> > vecG2_;
+  ROL::Ptr<std::vector<Real> >     vecG3_;
+  ROL::Ptr<Tpetra::MultiVector<> > vecH22_;
+  ROL::Ptr<Tpetra::MultiVector<> > vecH23_;
+  ROL::Ptr<std::vector<Real> >     vecH32_;
+  ROL::Ptr<std::vector<Real> >     vecH33_;
+
+public:
+  IntegralOptObjective(const ROL::Ptr<QoI_type > &qoi,
+                       const ROL::Ptr<Assembler_type> &assembler)
+    : qoi_(qoi), assembler_(assembler) {}
+
+  void setParameter(const std::vector<Real> &param) {
+    ROL::Objective<Real>::setParameter(param);
+    qoi_->setParameter(param);
+  }
+
+  Real value(const ROL::Vector<Real> &z, Real &tol) override {
+    ROL::Ptr<const Tpetra::MultiVector<> > zf = getConstField(z);
+    ROL::Ptr<const std::vector<Real> >     zp = getConstParameter(z);
+    return assembler_->assembleQoIValue(qoi_,ROL::nullPtr,zf,zp);
+  }
+
+  void gradient(ROL::Vector<Real> &g,
+                const ROL::Vector<Real> &z, Real &tol ) override {
+    int NotImplemented(0), IsZero(0);
+    g.zero();
+    // Compute control field gradient
+    try {
+      // Get state and control vectors
+      ROL::Ptr<const Tpetra::MultiVector<> > zf = getConstField(z);
+      ROL::Ptr<const std::vector<Real> >     zp = getConstParameter(z);
+      ROL::Ptr<Tpetra::MultiVector<> >       gf = getField(g);
+      assembler_->assembleQoIGradient2(vecG2_,qoi_,ROL::nullPtr,zf,zp);
+      gf->scale(static_cast<Real>(1),*vecG2_);
+    }
+    catch ( Exception::Zero & ez ) {
+      IsZero++;
+    }
+    catch ( Exception::NotImplemented & eni ) {
+      NotImplemented++;
+    }
+    // Compute control parameter gradient
+    try {
+      // Get state and control vectors
+      ROL::Ptr<const Tpetra::MultiVector<> > zf = getConstField(z);
+      ROL::Ptr<const std::vector<Real> >     zp = getConstParameter(z);
+      ROL::Ptr<std::vector<Real> >           gp = getParameter(g);
+      assembler_->assembleQoIGradient3(vecG3_,qoi_,ROL::nullPtr,zf,zp);
+      gp->assign(vecG3_->begin(),vecG3_->end());
+    }
+    catch ( Exception::Zero & ez ) {
+      IsZero++;
+    }
+    catch ( Exception::NotImplemented & eni ) {
+      NotImplemented++;
+    }
+    // Zero gradient
+    if ( IsZero == 2 || (IsZero == 1 && NotImplemented == 1) ) {
+      g.zero();
+    }
+    // Not Implemented
+    if ( NotImplemented == 2 ) {
+      ROL::Objective<Real>::gradient(g,z,tol);
+    }
+  }
+
+  void hessVec(ROL::Vector<Real> &hv, const ROL::Vector<Real> &v, 
+               const ROL::Vector<Real> &z, Real &tol ) override {
+    int NotImplemented(0), IsZero(0);
+    hv.zero();
+    // Compute control field/field hessvec
+    try {
+      ROL::Ptr<Tpetra::MultiVector<> >      hvf = getField(hv);
+      ROL::Ptr<const Tpetra::MultiVector<> > vf = getConstField(v);
+      ROL::Ptr<const Tpetra::MultiVector<> > zf = getConstField(z);
+      ROL::Ptr<const std::vector<Real> >     zp = getConstParameter(z);
+      assembler_->assembleQoIHessVec22(vecH22_,qoi_,vf,ROL::nullPtr,zf,zp);
+      hvf->scale(static_cast<Real>(1),*vecH22_);
+    }
+    catch (Exception::Zero &ez) {
+      hv.zero();
+      IsZero++;
+    }
+    catch (Exception::NotImplemented &eni) {
+      hv.zero();
+      NotImplemented++;
+    }
+    // Compute control field/parameter hessvec
+    try {
+      // Get state and control vectors
+      ROL::Ptr<Tpetra::MultiVector<> >      hvf = getField(hv);
+      ROL::Ptr<const std::vector<Real> >     vp = getConstParameter(v);
+      ROL::Ptr<const Tpetra::MultiVector<> > zf = getConstField(z);
+      ROL::Ptr<const std::vector<Real> >     zp = getConstParameter(z);
+      assembler_->assembleQoIHessVec23(vecH23_,qoi_,vp,ROL::nullPtr,zf,zp);
+      hvf->update(static_cast<Real>(1),*vecH23_,static_cast<Real>(1));
+    }
+    catch ( Exception::Zero & ez ) {
+      IsZero++;
+    }
+    catch ( Exception::NotImplemented & eni ) {
+      NotImplemented++;
+    }
+    // Compute control parameter/field hessvec
+    try {
+      ROL::Ptr<std::vector<Real> >          hvp = getParameter(hv);
+      ROL::Ptr<const Tpetra::MultiVector<> > vf = getConstField(v);
+      ROL::Ptr<const Tpetra::MultiVector<> > zf = getConstField(z);
+      ROL::Ptr<const std::vector<Real> >     zp = getConstParameter(z);
+      assembler_->assembleQoIHessVec32(vecH32_,qoi_,vf,ROL::nullPtr,zf,zp);
+      hvp->assign(vecH32_->begin(),vecH32_->end());
+    }
+    catch (Exception::Zero &ez) {
+      ROL::Ptr<std::vector<Real> > hvp = getParameter(hv);
+      if ( hvp != ROL::nullPtr ) {
+        const int size = hvp->size();
+        hvp->assign(size,static_cast<Real>(0));
+      }
+      IsZero++;
+    }
+    catch (Exception::NotImplemented &eni) {
+      ROL::Ptr<std::vector<Real> > hvp = getParameter(hv);
+      if ( hvp != ROL::nullPtr ) {
+        const int size = hvp->size();
+        hvp->assign(size,static_cast<Real>(0));
+      }
+      NotImplemented++;
+    }
+    // Compute control parameter/parameter hessvec
+    try {
+      ROL::Ptr<std::vector<Real> >          hvp = getParameter(hv);
+      ROL::Ptr<const std::vector<Real> >     vp = getConstParameter(v);
+      ROL::Ptr<const Tpetra::MultiVector<> > zf = getConstField(z);
+      ROL::Ptr<const std::vector<Real> >     zp = getConstParameter(z);
+      assembler_->assembleQoIHessVec33(vecH33_,qoi_,vp,ROL::nullPtr,zf,zp);
+      const int size = hvp->size();
+      for (int i = 0; i < size; ++i) {
+        (*hvp)[i] += (*vecH33_)[i];
+      }
+    }
+    catch (Exception::Zero &ez) {
+      IsZero++;
+    }
+    catch (Exception::NotImplemented &eni) {
+      NotImplemented++;
+    }
+
+    // Zero hessvec
+    if ( IsZero > 0 && (IsZero + NotImplemented == 4) ) {
+      hv.zero();
+    }
+    // Not Implemented
+    if ( NotImplemented == 4 ) {
+      ROL::Objective<Real>::hessVec(hv,v,z,tol);
+    }
+  }
+
+private: // Vector accessor functions
+
+  ROL::Ptr<const Tpetra::MultiVector<> > getConstField(const ROL::Vector<Real> &x) const {
+    ROL::Ptr<const Tpetra::MultiVector<> > xp;
+    try {
+      xp = dynamic_cast<const ROL::TpetraMultiVector<Real>&>(x).getVector();
+    }
+    catch (std::exception &e) {
+      try {
+        ROL::Ptr<const ROL::TpetraMultiVector<Real> > xvec
+          = dynamic_cast<const PDE_OptVector<Real>&>(x).getField();
+        if (xvec == ROL::nullPtr) {
+          xp = ROL::nullPtr;
+        }
+        else {
+          xp = xvec->getVector();
+        }
+      }
+      catch (std::exception &ee) {
+        xp = ROL::nullPtr;
+      }
+    }
+    return xp;
+  }
+
+  ROL::Ptr<Tpetra::MultiVector<> > getField(ROL::Vector<Real> &x) const {
+    ROL::Ptr<Tpetra::MultiVector<> > xp;
+    try {
+      xp = dynamic_cast<ROL::TpetraMultiVector<Real>&>(x).getVector();
+    }
+    catch (std::exception &e) {
+      try {
+        ROL::Ptr<ROL::TpetraMultiVector<Real> > xvec
+          = dynamic_cast<PDE_OptVector<Real>&>(x).getField();
+        if ( xvec == ROL::nullPtr ) {
+          xp = ROL::nullPtr;
+        }
+        else {
+          xp = xvec->getVector();
+        }
+      }
+      catch (std::exception &ee) {
+        xp = ROL::nullPtr;
+      }
+    }
+    return xp;
+  }
+
+  ROL::Ptr<const std::vector<Real> > getConstParameter(const ROL::Vector<Real> &x) const {
+    ROL::Ptr<const std::vector<Real> > xp;
+    try {
+      xp = dynamic_cast<const ROL::StdVector<Real>&>(x).getVector();
+    }
+    catch (std::exception &e) {
+      try {
+        ROL::Ptr<const ROL::StdVector<Real> > xvec
+          = dynamic_cast<const PDE_OptVector<Real>&>(x).getParameter();
+        if ( xvec == ROL::nullPtr ) {
+          xp = ROL::nullPtr;
+        }
+        else {
+          xp = xvec->getVector();
+        }
+      }
+      catch (std::exception &ee) {
+        xp = ROL::nullPtr;
+      }
+    }
+    return xp;
+  }
+
+  ROL::Ptr<std::vector<Real> > getParameter(ROL::Vector<Real> &x) const {
+    ROL::Ptr<std::vector<Real> > xp;
+    try {
+      xp = dynamic_cast<ROL::StdVector<Real>&>(x).getVector();
+    }
+    catch (std::exception &e) {
+      try {
+        ROL::Ptr<ROL::StdVector<Real> > xvec
+          = dynamic_cast<PDE_OptVector<Real>&>(x).getParameter();
+        if ( xvec == ROL::nullPtr ) {
+          xp = ROL::nullPtr;
+        }
+        else {
+          xp = xvec->getVector();
+        }
+      }
+      catch (std::exception &ee) {
+        xp = ROL::nullPtr;
+      }
+    }
+    return xp;
+  }
+};
+
+template<class Real, class DeviceType>
+class IntegralAffineSimObjective : public ROL::Objective_SimOpt<Real> {
+public:
+  using QoI_type = QoI<Real, DeviceType>;
+  using Assembler_type = Assembler<Real, DeviceType>;
+private:
+  const ROL::Ptr<QoI_type > qoi_;
+  const ROL::Ptr<Assembler_type> assembler_;
+
+  Real shift_;
+  ROL::Ptr<Tpetra::MultiVector<> > vecG1_;
+  ROL::Ptr<Tpetra::MultiVector<> > ufield_;
+  ROL::Ptr<Tpetra::MultiVector<> > zfield_;
+  ROL::Ptr<std::vector<Real> >     zparam_;
+
+  bool isAssembled_;
+
+  void assemble(const ROL::Ptr<const Tpetra::MultiVector<> > &zf,
+                const ROL::Ptr<const std::vector<Real> >     &zp) {
+    if ( !isAssembled_ ) {
+      ufield_ = assembler_->createStateVector();
+      ufield_->putScalar(static_cast<Real>(0));
+      if ( zf != ROL::nullPtr ) {
+        zfield_ = assembler_->createControlVector();
+        zfield_->putScalar(static_cast<Real>(0));
+      }
+      if ( zp != ROL::nullPtr ) {
+       zparam_ = ROL::makePtr<std::vector<Real>>(zp->size(),static_cast<Real>(0));
+      }
+      shift_ = assembler_->assembleQoIValue(qoi_,ufield_,zfield_,zparam_);
+      assembler_->assembleQoIGradient1(vecG1_,qoi_,ufield_,zfield_,zparam_);
+      isAssembled_ = true;
+    }
+  }
+
+public:
+  IntegralAffineSimObjective(const ROL::Ptr<QoI_type> &qoi,
+                             const ROL::Ptr<Assembler_type> &assembler)
+    : qoi_(qoi), assembler_(assembler), isAssembled_(false) {}
+
+  void setParameter(const std::vector<Real> &param) {
+    ROL::Objective_SimOpt<Real>::setParameter(param);
+    qoi_->setParameter(param);
+    isAssembled_ = false;
+  }
+
+  Real value(const ROL::Vector<Real> &u, const ROL::Vector<Real> &z, Real &tol) override {
+    ROL::Ptr<const Tpetra::MultiVector<> > uf = getConstField(u);
+    ROL::Ptr<const Tpetra::MultiVector<> > zf = getConstField(z);
+    ROL::Ptr<const std::vector<Real> >     zp = getConstParameter(z);
+
+    assemble(zf,zp);
+
+    const size_t n = uf->getNumVectors();
+    Teuchos::Array<Real> val(n,0);
+    vecG1_->dot(*uf, val.view(0,n));
+    
+    return val[0] + shift_;
+  }
+
+  void gradient_1(ROL::Vector<Real> &g, const ROL::Vector<Real> &u,
+                  const ROL::Vector<Real> &z, Real &tol ) override {
+    ROL::Ptr<Tpetra::MultiVector<> >       gf = getField(g);
+    ROL::Ptr<const Tpetra::MultiVector<> > zf = getConstField(z);
+    ROL::Ptr<const std::vector<Real> >     zp = getConstParameter(z);
+
+    assemble(zf,zp);
+
+    gf->scale(static_cast<Real>(1),*vecG1_);
+  }
+
+  void gradient_2(ROL::Vector<Real> &g, const ROL::Vector<Real> &u,
+                  const ROL::Vector<Real> &z, Real &tol ) override {
+    g.zero();
+  }
+
+  void hessVec_11( ROL::Vector<Real> &hv, const ROL::Vector<Real> &v, 
+             const ROL::Vector<Real> &u,  const ROL::Vector<Real> &z, Real &tol ) override {
+    hv.zero();
+  }
+
+  void hessVec_12( ROL::Vector<Real> &hv, const ROL::Vector<Real> &v, 
+                   const ROL::Vector<Real> &u, const ROL::Vector<Real> &z, Real &tol ) override {
+    hv.zero();
+  }
+
+  void hessVec_21( ROL::Vector<Real> &hv, const ROL::Vector<Real> &v, 
+                   const ROL::Vector<Real> &u, const ROL::Vector<Real> &z, Real &tol ) override {
+    hv.zero();
+  }
+
+  void hessVec_22( ROL::Vector<Real> &hv, const ROL::Vector<Real> &v, 
+             const ROL::Vector<Real> &u,  const ROL::Vector<Real> &z, Real &tol ) override {
+    hv.zero();
+  }
+
+private: // Vector accessor functions
+
+  ROL::Ptr<const Tpetra::MultiVector<> > getConstField(const ROL::Vector<Real> &x) const {
+    ROL::Ptr<const Tpetra::MultiVector<> > xp;
+    try {
+      xp = dynamic_cast<const ROL::TpetraMultiVector<Real>&>(x).getVector();
+    }
+    catch (std::exception &e) {
+      try {
+        ROL::Ptr<const ROL::TpetraMultiVector<Real> > xvec
+          = dynamic_cast<const PDE_OptVector<Real>&>(x).getField();
+        if (xvec == ROL::nullPtr) {
+          xp = ROL::nullPtr;
+        }
+        else {
+          xp = xvec->getVector();
+        }
+      }
+      catch (std::exception &ee) {
+        xp = ROL::nullPtr;
+      }
+    }
+    return xp;
+  }
+
+  ROL::Ptr<Tpetra::MultiVector<> > getField(ROL::Vector<Real> &x) const {
+    ROL::Ptr<Tpetra::MultiVector<> > xp;
+    try {
+      xp = dynamic_cast<ROL::TpetraMultiVector<Real>&>(x).getVector();
+    }
+    catch (std::exception &e) {
+      try {
+        ROL::Ptr<ROL::TpetraMultiVector<Real> > xvec
+          = dynamic_cast<PDE_OptVector<Real>&>(x).getField();
+        if ( xvec == ROL::nullPtr ) {
+          xp = ROL::nullPtr;
+        }
+        else {
+          xp = xvec->getVector();
+        }
+      }
+      catch (std::exception &ee) {
+        xp = ROL::nullPtr;
+      }
+    }
+    return xp;
+  }
+
+  ROL::Ptr<const std::vector<Real> > getConstParameter(const ROL::Vector<Real> &x) const {
+    ROL::Ptr<const std::vector<Real> > xp;
+    try {
+      xp = dynamic_cast<const ROL::StdVector<Real>&>(x).getVector();
+    }
+    catch (std::exception &e) {
+      try {
+        ROL::Ptr<const ROL::StdVector<Real> > xvec
+          = dynamic_cast<const PDE_OptVector<Real>&>(x).getParameter();
+        if ( xvec == ROL::nullPtr ) {
+          xp = ROL::nullPtr;
+        }
+        else {
+          xp = xvec->getVector();
+        }
+      }
+      catch (std::exception &ee) {
+        xp = ROL::nullPtr;
+      }
+    }
+    return xp;
+  }
+
+  ROL::Ptr<std::vector<Real> > getParameter(ROL::Vector<Real> &x) const {
+    ROL::Ptr<std::vector<Real> > xp;
+    try {
+      xp = dynamic_cast<ROL::StdVector<Real>&>(x).getVector();
+    }
+    catch (std::exception &e) {
+      try {
+        ROL::Ptr<ROL::StdVector<Real> > xvec
+          = dynamic_cast<PDE_OptVector<Real>&>(x).getParameter();
+        if ( xvec == ROL::nullPtr ) {
+          xp = ROL::nullPtr;
+        }
+        else {
+          xp = xvec->getVector();
+        }
+      }
+      catch (std::exception &ee) {
+        xp = ROL::nullPtr;
+      }
+    }
+    return xp;
+  }
+};
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/TOOLS/linearpdeconstraintK.hpp b/packages/rol/example/PDE-OPT/TOOLS/linearpdeconstraintK.hpp
new file mode 100644
index 000000000000..4703ef71923d
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/TOOLS/linearpdeconstraintK.hpp
@@ -0,0 +1,592 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  constraint.hpp
+    \brief Defines the SimOpt constraint for the 'poisson' example.
+*/
+
+#ifndef ROL_PDEOPT_LINEARPDECONSTRAINTK_H
+#define ROL_PDEOPT_LINEARPDECONSTRAINTK_H
+
+#include "ROL_Constraint_SimOpt.hpp"
+#include "pdeK.hpp"
+#include "assemblerK.hpp"
+#include "solver.hpp"
+#include "pdevectorK.hpp"
+
+// Do not instantiate the template in this translation unit.
+extern template class Assembler<double,Kokkos::HostSpace>;
+
+//// Global Timers.
+#ifdef ROL_TIMERS
+namespace ROL {
+  namespace PDEOPT {
+    ROL::Ptr<Teuchos::Time> LinearPDEConstraintSolverConstruct_Jacobian1        = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Linear PDE Constraint Solver Construction Time for Jacobian1");
+    ROL::Ptr<Teuchos::Time> LinearPDEConstraintSolverConstruct_AdjointJacobian1 = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Linear PDE Constraint Solver Construction Time for Adjoint Jacobian1");
+    ROL::Ptr<Teuchos::Time> LinearPDEConstraintSolverSolve_Jacobian1            = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Linear PDE Constraint Solver Solution Time for Jacobian1");
+    ROL::Ptr<Teuchos::Time> LinearPDEConstraintSolverSolve_AdjointJacobian1     = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Linear PDE Constraint Solver Solution Time for Adjoint Jacobian1");
+    ROL::Ptr<Teuchos::Time> LinearPDEConstraintApplyJacobian1                   = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Linear PDE Constraint Apply Jacobian1");
+    ROL::Ptr<Teuchos::Time> LinearPDEConstraintApplyJacobian2                   = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Linear PDE Constraint Apply Jacobian2");
+    ROL::Ptr<Teuchos::Time> LinearPDEConstraintApplyJacobian3                   = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Linear PDE Constraint Apply Jacobian3");
+    ROL::Ptr<Teuchos::Time> LinearPDEConstraintApplyAdjointJacobian1            = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Linear PDE Constraint Apply Adjoint Jacobian1");
+    ROL::Ptr<Teuchos::Time> LinearPDEConstraintApplyAdjointJacobian2            = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Linear PDE Constraint Apply Adjoint Jacobian2");
+    ROL::Ptr<Teuchos::Time> LinearPDEConstraintApplyAdjointJacobian3            = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: Linear PDE Constraint Apply Adjoint Jacobian3");
+  }
+}
+#endif
+
+
+template<class Real, class DeviceType>
+class Linear_PDE_Constraint : public ROL::Constraint_SimOpt<Real> {
+public:
+  using Assembler_type = Assembler<Real,DeviceType>;
+  using PDE_type = PDE<Real,DeviceType>;
+private:
+  const ROL::Ptr<PDE_type> pde_;
+  ROL::Ptr<Assembler_type> assembler_;
+  ROL::Ptr<Solver<Real> >    solver_;
+  bool initZvec_, initZpar_, isStochastic_;
+  bool assembleRHS_, assembleJ1_, assembleJ2_, assembleJ3_, setSolver_;
+
+  ROL::Ptr<Tpetra::MultiVector<> > cvec_;
+  ROL::Ptr<Tpetra::MultiVector<> > uvec_;
+  ROL::Ptr<Tpetra::MultiVector<> > zvec_;
+  ROL::Ptr<std::vector<Real> >     zpar_;
+
+  ROL::Ptr<Tpetra::MultiVector<> > vecR_;
+  ROL::Ptr<Tpetra::CrsMatrix<> >   matJ1_;
+  ROL::Ptr<Tpetra::CrsMatrix<> >   matJ2_;
+  ROL::Ptr<Tpetra::MultiVector<> > vecJ3_;
+
+  void assemble(const ROL::Ptr<const Tpetra::MultiVector<> > &zf,
+                const ROL::Ptr<const std::vector<Real> > &zp) {
+    // Initialize field component of z.
+    if (initZvec_ && zf != ROL::nullPtr) {
+      zvec_ = assembler_->createControlVector();
+      zvec_->putScalar(static_cast<Real>(0));
+    }
+    initZvec_ = false;
+    // Initialize parameter component of z.
+    if (initZpar_ && zp != ROL::nullPtr) {
+      zpar_ = ROL::makePtr<std::vector<Real>>(zp->size(),static_cast<Real>(0));
+    }
+    initZpar_ = false;
+    // Assemble affine term.
+    if (assembleRHS_) {
+      assembler_->assemblePDEResidual(vecR_,pde_,uvec_,zvec_,zpar_);
+    }
+    assembleRHS_ = false;
+    // Assemble jacobian_1.
+    if (assembleJ1_) {
+      assembler_->assemblePDEJacobian1(matJ1_,pde_,uvec_,zvec_,zpar_);
+    }
+    assembleJ1_ = false;
+    // Assemble jacobian_2.
+    if (assembleJ2_ && zf != ROL::nullPtr) {
+      assembler_->assemblePDEJacobian2(matJ2_,pde_,uvec_,zvec_,zpar_);
+    }
+    assembleJ2_ = false;
+    // Assemble jacobian_3.
+    if (assembleJ3_ && zp != ROL::nullPtr) {
+      assembler_->assemblePDEJacobian3(vecJ3_,pde_,uvec_,zvec_,zpar_);
+    }
+    assembleJ3_ = false;
+  }
+
+  void setSolver(void) {
+    solver_->setA(matJ1_);
+    setSolver_ = false;
+  }
+
+  void solveForward(ROL::Ptr<Tpetra::MultiVector<> > &x,
+                    const ROL::Ptr<const Tpetra::MultiVector<> > &b) {
+    if (setSolver_) {
+      #ifdef ROL_TIMERS
+        Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::LinearPDEConstraintSolverConstruct_Jacobian1);
+      #endif
+      setSolver();
+    }
+
+    #ifdef ROL_TIMERS
+      Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::LinearPDEConstraintSolverSolve_Jacobian1);
+    #endif
+    solver_->solve(x,b,false);
+  }
+
+  void solveAdjoint(ROL::Ptr<Tpetra::MultiVector<> > &x,
+                    const ROL::Ptr<const Tpetra::MultiVector<> > &b) {
+    if (setSolver_) {
+      #ifdef ROL_TIMERS
+        Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::LinearPDEConstraintSolverConstruct_AdjointJacobian1);
+      #endif
+      setSolver();
+    }
+
+    #ifdef ROL_TIMERS
+      Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::LinearPDEConstraintSolverSolve_AdjointJacobian1);
+    #endif
+    solver_->solve(x,b,true);
+  }
+
+  void applyJacobian1(const ROL::Ptr<Tpetra::MultiVector<> > &Jv,
+                      const ROL::Ptr<const Tpetra::MultiVector<> > &v) {
+    #ifdef ROL_TIMERS
+      Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::LinearPDEConstraintApplyJacobian1);
+    #endif
+    matJ1_->apply(*v,*Jv);
+  }
+
+  void applyAdjointJacobian1(const ROL::Ptr<Tpetra::MultiVector<> > &Jv,
+                             const ROL::Ptr<const Tpetra::MultiVector<> > &v) {
+    #ifdef ROL_TIMERS
+      Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::LinearPDEConstraintApplyAdjointJacobian1);
+    #endif
+    matJ1_->apply(*v,*Jv,Teuchos::TRANS);
+  }
+
+  void applyJacobian2(const ROL::Ptr<Tpetra::MultiVector<> > &Jv,
+                      const ROL::Ptr<const Tpetra::MultiVector<> > &v) {
+    #ifdef ROL_TIMERS
+      Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::LinearPDEConstraintApplyJacobian2);
+    #endif
+    matJ2_->apply(*v,*Jv);
+  }
+
+  void applyAdjointJacobian2(const ROL::Ptr<Tpetra::MultiVector<> > &Jv,
+                             const ROL::Ptr<const Tpetra::MultiVector<> > &v) {
+    #ifdef ROL_TIMERS
+      Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::LinearPDEConstraintApplyAdjointJacobian2);
+    #endif
+    matJ2_->apply(*v,*Jv,Teuchos::TRANS);
+  }
+
+  // Application routines
+  void applyJacobian3(const ROL::Ptr<Tpetra::MultiVector<> > &Jv,
+                      const ROL::Ptr<const std::vector<Real> > &v,
+                      const bool zeroOut = true) const {
+    #ifdef ROL_TIMERS
+      Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::LinearPDEConstraintApplyJacobian3);
+    #endif
+    if ( zeroOut ) {
+      Jv->putScalar(static_cast<Real>(0));
+    }
+    const size_t size = static_cast<size_t>(v->size());
+    for (size_t i = 0; i < size; ++i) {
+      Teuchos::ArrayView<const size_t> col(&i,1);
+      Jv->update((*v)[i],*(vecJ3_->subView(col)),static_cast<Real>(1));
+    }
+  }
+
+  void applyAdjointJacobian3(const ROL::Ptr<std::vector<Real> > &Jv,
+                             const ROL::Ptr<const Tpetra::MultiVector<> > &v) const {
+    #ifdef ROL_TIMERS
+      Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::LinearPDEConstraintApplyAdjointJacobian3);
+    #endif
+    Teuchos::Array<Real> val(1,0);
+    const size_t size = static_cast<size_t>(Jv->size());
+    for (size_t i = 0; i < size; ++i) {
+      Teuchos::ArrayView<const size_t> col(&i,1);
+      vecJ3_->subView(col)->dot(*v, val.view(0,1));
+      (*Jv)[i] = val[0];
+    }
+  }
+
+public:
+  virtual ~Linear_PDE_Constraint() {}
+
+  Linear_PDE_Constraint(const ROL::Ptr<PDE_type> &pde,
+                        const ROL::Ptr<MeshManager<Real,DeviceType> > &meshMgr,
+                        const ROL::Ptr<const Teuchos::Comm<int> > &comm,
+                        Teuchos::ParameterList &parlist,
+                        std::ostream &outStream = std::cout,
+                        const bool isStochastic = false)
+    : ROL::Constraint_SimOpt<Real>(), pde_(pde),
+      initZvec_(true), initZpar_(true), isStochastic_(isStochastic),
+      assembleRHS_(true), assembleJ1_(true), assembleJ2_(true), assembleJ3_(true), setSolver_(true) {
+    // Construct assembler.
+    assembler_ = ROL::makePtr<Assembler_type>(pde_->getFields(),meshMgr,comm,parlist,outStream);
+    assembler_->setCellNodes(*pde_);
+    // Construct solver.
+    solver_ = ROL::makePtr<Solver<Real>>(parlist.sublist("Solver"));
+    // Initialize zero vectors.
+    cvec_ = assembler_->createResidualVector();
+    uvec_ = assembler_->createStateVector();
+    uvec_->putScalar(static_cast<Real>(0));
+  }
+
+  void setParameter(const std::vector<Real> &param) {
+    if (isStochastic_) {
+      ROL::Constraint_SimOpt<Real>::setParameter(param);
+      pde_->setParameter(param);
+      assembleRHS_ = true;
+      assembleJ1_  = true;
+      assembleJ2_  = true;
+      assembleJ3_  = true;
+      setSolver_   = true;
+    }
+  }
+
+  const ROL::Ptr<Assembler_type> getAssembler(void) const {
+    return assembler_;
+  }
+
+  const ROL::Ptr<PDE_type> getPDE(void) const {
+    return pde_;
+  }
+
+  using ROL::Constraint_SimOpt<Real>::update_1;
+  void update_1(const ROL::Vector<Real> &u, bool flag = true, int iter = -1) {}
+
+  using ROL::Constraint_SimOpt<Real>::update_2;
+  void update_2(const ROL::Vector<Real> &z, bool flag = true, int iter = -1) {}
+
+  using ROL::Constraint_SimOpt<Real>::update;
+  void update(const ROL::Vector<Real> &u, const ROL::Vector<Real> &z, bool flag = true, int iter = -1) {
+    update_1(u,flag,iter);
+    update_2(z,flag,iter);
+  }
+
+  void update_1(const ROL::Vector<Real> &u, ROL::UpdateType type, int iter = -1) {}
+  void update_2(const ROL::Vector<Real> &z, ROL::UpdateType type, int iter = -1) {}
+  void update(const ROL::Vector<Real> &u, const ROL::Vector<Real> &z, ROL::UpdateType type, int iter = -1) {
+    update_1(u,type,iter);
+    update_2(z,type,iter);
+  }
+
+  using ROL::Constraint_SimOpt<Real>::value;
+  void value(ROL::Vector<Real> &c, const ROL::Vector<Real> &u, const ROL::Vector<Real> &z, Real &tol) {
+    ROL::Ptr<Tpetra::MultiVector<> >       cf = getField(c);
+    ROL::Ptr<const Tpetra::MultiVector<> > uf = getConstField(u);
+    ROL::Ptr<const Tpetra::MultiVector<> > zf = getConstField(z);
+    ROL::Ptr<const std::vector<Real> >     zp = getConstParameter(z);
+
+    assemble(zf,zp);
+    c.zero();
+    cvec_->putScalar(static_cast<Real>(0));
+
+    const Real one(1);
+    cf->scale(one,*vecR_);
+    if (zf != ROL::nullPtr) {
+      applyJacobian2(cvec_,zf);
+      cf->update(one,*cvec_,one);
+    }
+    if (zp != ROL::nullPtr) {
+      applyJacobian3(cvec_,zp,false);
+      cf->update(one,*cvec_,one);
+    }
+    applyJacobian1(cvec_,uf);
+    cf->update(one,*cvec_,one);
+  }
+
+  void solve(ROL::Vector<Real> &c, ROL::Vector<Real> &u, const ROL::Vector<Real> &z, Real &tol) {
+    ROL::Ptr<Tpetra::MultiVector<> >       cf = getField(c);
+    ROL::Ptr<Tpetra::MultiVector<> >       uf = getField(u);
+    ROL::Ptr<const Tpetra::MultiVector<> > zf = getConstField(z);
+    ROL::Ptr<const std::vector<Real> >     zp = getConstParameter(z);
+
+    assemble(zf,zp);
+    c.zero();
+    cvec_->putScalar(static_cast<Real>(0));
+
+    const Real one(1);
+    cf->scale(one,*vecR_);
+    if (zf != ROL::nullPtr) {
+      applyJacobian2(cvec_,zf);
+      cf->update(one,*cvec_,one);
+    }
+    if (zp != ROL::nullPtr) {
+      applyJacobian3(cvec_,zp,false);
+      cf->update(one,*cvec_,one);
+    }
+
+    solveForward(uf,cf);
+    uf->scale(-one);
+
+    applyJacobian1(cvec_,uf);
+    cf->update(one,*cvec_,one);
+  }
+
+  void applyJacobian_1(ROL::Vector<Real> &jv,
+                 const ROL::Vector<Real> &v,
+                 const ROL::Vector<Real> &u,
+                 const ROL::Vector<Real> &z, Real &tol) {
+    ROL::Ptr<Tpetra::MultiVector<> >      jvf = getField(jv);
+    ROL::Ptr<const Tpetra::MultiVector<> > vf = getConstField(v);
+    ROL::Ptr<const Tpetra::MultiVector<> > zf = getConstField(z);
+    ROL::Ptr<const std::vector<Real> >     zp = getConstParameter(z);
+
+    assemble(zf,zp);
+    applyJacobian1(jvf,vf);
+  }
+
+
+  void applyJacobian_2(ROL::Vector<Real> &jv,
+                 const ROL::Vector<Real> &v,
+                 const ROL::Vector<Real> &u,
+                 const ROL::Vector<Real> &z, Real &tol) {
+    jv.zero();
+    ROL::Ptr<Tpetra::MultiVector<> >      jvf = getField(jv);
+    ROL::Ptr<const Tpetra::MultiVector<> > zf = getConstField(z);
+    ROL::Ptr<const std::vector<Real> >     zp = getConstParameter(z);
+
+    assemble(zf,zp);
+
+    const Real one(1);
+    ROL::Ptr<const Tpetra::MultiVector<> > vf = getConstField(v);
+    if (vf != ROL::nullPtr) {
+      applyJacobian2(cvec_,vf);
+      jvf->update(one,*cvec_,one);
+    }
+    ROL::Ptr<const std::vector<Real> >     vp = getConstParameter(v);
+    bool zeroOut = (vf == ROL::nullPtr);
+    if (vp != ROL::nullPtr) {
+      applyJacobian3(cvec_,vp,zeroOut);
+      jvf->update(one,*cvec_,one);
+    }
+  }
+
+
+  void applyAdjointJacobian_1(ROL::Vector<Real> &ajv,
+                        const ROL::Vector<Real> &v,
+                        const ROL::Vector<Real> &u,
+                        const ROL::Vector<Real> &z, Real &tol) {
+    ROL::Ptr<Tpetra::MultiVector<> >     ajvf = getField(ajv);
+    ROL::Ptr<const Tpetra::MultiVector<> > vf = getConstField(v);
+    ROL::Ptr<const Tpetra::MultiVector<> > zf = getConstField(z);
+    ROL::Ptr<const std::vector<Real> >     zp = getConstParameter(z);
+
+    assemble(zf,zp);
+    applyAdjointJacobian1(ajvf,vf);
+  }
+
+
+  void applyAdjointJacobian_2(ROL::Vector<Real> &ajv,
+                        const ROL::Vector<Real> &v,
+                        const ROL::Vector<Real> &u,
+                        const ROL::Vector<Real> &z, Real &tol) {
+    ROL::Ptr<Tpetra::MultiVector<> >     ajvf = getField(ajv);
+    ROL::Ptr<std::vector<Real> >         ajvp = getParameter(ajv);
+    ROL::Ptr<const Tpetra::MultiVector<> > vf = getConstField(v);
+    ROL::Ptr<const Tpetra::MultiVector<> > zf = getConstField(z);
+    ROL::Ptr<const std::vector<Real> >     zp = getConstParameter(z);
+
+    assemble(zf,zp);
+
+    if (zf != ROL::nullPtr) {
+      applyAdjointJacobian2(ajvf,vf);
+    }
+    if (zp != ROL::nullPtr) {
+      applyAdjointJacobian3(ajvp,vf);
+    }
+  }
+
+
+  void applyInverseJacobian_1(ROL::Vector<Real> &ijv,
+                        const ROL::Vector<Real> &v,
+                        const ROL::Vector<Real> &u,
+                        const ROL::Vector<Real> &z, Real &tol) {
+    ROL::Ptr<Tpetra::MultiVector<> >     ijvf = getField(ijv);
+    ROL::Ptr<const Tpetra::MultiVector<> > vf = getConstField(v);
+    ROL::Ptr<const Tpetra::MultiVector<> > zf = getConstField(z);
+    ROL::Ptr<const std::vector<Real> >     zp = getConstParameter(z);
+
+    assemble(zf,zp);
+    solveForward(ijvf,vf);
+  }
+
+
+  void applyInverseAdjointJacobian_1(ROL::Vector<Real> &iajv,
+                               const ROL::Vector<Real> &v,
+                               const ROL::Vector<Real> &u,
+                               const ROL::Vector<Real> &z, Real &tol) {
+    ROL::Ptr<Tpetra::MultiVector<> >    iajvf = getField(iajv);
+    ROL::Ptr<const Tpetra::MultiVector<> > vf = getConstField(v);
+    ROL::Ptr<const Tpetra::MultiVector<> > zf = getConstField(z);
+    ROL::Ptr<const std::vector<Real> >     zp = getConstParameter(z);
+
+    assemble(zf,zp);
+    solveAdjoint(iajvf,vf);
+  }
+
+
+  void applyAdjointHessian_11(ROL::Vector<Real> &ahwv,
+                        const ROL::Vector<Real> &w,
+                        const ROL::Vector<Real> &v,
+                        const ROL::Vector<Real> &u,
+                        const ROL::Vector<Real> &z, Real &tol) {
+    ahwv.zero();
+  }
+
+
+  void applyAdjointHessian_12(ROL::Vector<Real> &ahwv,
+                        const ROL::Vector<Real> &w,
+                        const ROL::Vector<Real> &v,
+                        const ROL::Vector<Real> &u,
+                        const ROL::Vector<Real> &z, Real &tol) {
+    ahwv.zero();
+  }
+
+
+  void applyAdjointHessian_21(ROL::Vector<Real> &ahwv,
+                        const ROL::Vector<Real> &w,
+                        const ROL::Vector<Real> &v,
+                        const ROL::Vector<Real> &u,
+                        const ROL::Vector<Real> &z, Real &tol) {
+    ahwv.zero();
+  }
+
+
+  void applyAdjointHessian_22(ROL::Vector<Real> &ahwv,
+                        const ROL::Vector<Real> &w,
+                        const ROL::Vector<Real> &v,
+                        const ROL::Vector<Real> &u,
+                        const ROL::Vector<Real> &z, Real &tol) {
+    ahwv.zero();
+  }
+
+  /***************************************************************************/
+  /* Output routines.                                                        */
+  /***************************************************************************/
+  void printMeshData(std::ostream &outStream) const {
+    assembler_->printMeshData(outStream);
+  }
+
+  void outputTpetraData() const {
+    Tpetra::MatrixMarket::Writer< Tpetra::CrsMatrix<> > matWriter;
+    if (matJ1_ != ROL::nullPtr) {
+      matWriter.writeSparseFile("jacobian1.txt", matJ1_);
+    }
+    else {
+      std::ofstream emptyfile;
+      emptyfile.open("jacobian1.txt");
+      emptyfile.close();
+    }
+    if (matJ2_ != ROL::nullPtr) {
+      matWriter.writeSparseFile("jacobian2.txt", matJ2_);
+    }
+    else {
+      std::ofstream emptyfile;
+      emptyfile.open("jacobian2.txt");
+      emptyfile.close();
+    }
+    if (vecR_ != ROL::nullPtr) {
+      matWriter.writeDenseFile("residual.txt", vecR_);
+    }
+    else {
+      std::ofstream emptyfile;
+      emptyfile.open("residual.txt");
+      emptyfile.close();
+    }
+  }
+
+  void outputTpetraVector(const ROL::Ptr<const Tpetra::MultiVector<> > &vec,
+                          const std::string &filename) const {
+    Tpetra::MatrixMarket::Writer< Tpetra::CrsMatrix<> > vecWriter;
+    vecWriter.writeDenseFile(filename, vec);
+  }
+  /***************************************************************************/
+  /* End of output routines.                                                 */
+  /***************************************************************************/
+
+private: // Vector accessor functions
+
+  ROL::Ptr<const Tpetra::MultiVector<> > getConstField(const ROL::Vector<Real> &x) const {
+    ROL::Ptr<const Tpetra::MultiVector<> > xp;
+    try {
+      xp = dynamic_cast<const ROL::TpetraMultiVector<Real>&>(x).getVector();
+    }
+    catch (std::exception &e) {
+      try {
+        ROL::Ptr<const ROL::TpetraMultiVector<Real> > xvec
+          = dynamic_cast<const PDE_OptVector<Real>&>(x).getField();
+        if (xvec == ROL::nullPtr) {
+          xp = ROL::nullPtr;
+        }
+        else {
+          xp = xvec->getVector();
+        }
+      }
+      catch (std::exception &ee) {
+        xp = ROL::nullPtr;
+      }
+    }
+    return xp;
+  }
+
+  ROL::Ptr<Tpetra::MultiVector<> > getField(ROL::Vector<Real> &x) const {
+    ROL::Ptr<Tpetra::MultiVector<> > xp;
+    try {
+      xp = dynamic_cast<ROL::TpetraMultiVector<Real>&>(x).getVector();
+    }
+    catch (std::exception &e) {
+      try {
+        ROL::Ptr<ROL::TpetraMultiVector<Real> > xvec
+          = dynamic_cast<PDE_OptVector<Real>&>(x).getField();
+        if ( xvec == ROL::nullPtr ) {
+          xp = ROL::nullPtr;
+        }
+        else {
+          xp = xvec->getVector();
+        }
+      }
+      catch (std::exception &ee) {
+        xp = ROL::nullPtr;
+      }
+    }
+    return xp;
+  }
+
+  ROL::Ptr<const std::vector<Real> > getConstParameter(const ROL::Vector<Real> &x) const {
+    ROL::Ptr<const std::vector<Real> > xp;
+    try {
+      xp = dynamic_cast<const ROL::StdVector<Real>&>(x).getVector();
+    }
+    catch (std::exception &e) {
+      try {
+        ROL::Ptr<const ROL::StdVector<Real> > xvec
+          = dynamic_cast<const PDE_OptVector<Real>&>(x).getParameter();
+        if ( xvec == ROL::nullPtr ) {
+          xp = ROL::nullPtr;
+        }
+        else {
+          xp = xvec->getVector();
+        }
+      }
+      catch (std::exception &ee) {
+        xp = ROL::nullPtr;
+      }
+    }
+    return xp;
+  }
+
+  ROL::Ptr<std::vector<Real> > getParameter(ROL::Vector<Real> &x) const {
+    ROL::Ptr<std::vector<Real> > xp;
+    try {
+      xp = dynamic_cast<ROL::StdVector<Real>&>(x).getVector();
+    }
+    catch (std::exception &e) {
+      try {
+        ROL::Ptr<ROL::StdVector<Real> > xvec
+          = dynamic_cast<PDE_OptVector<Real>&>(x).getParameter();
+        if ( xvec == ROL::nullPtr ) {
+          xp = ROL::nullPtr;
+        }
+        else {
+          xp = xvec->getVector();
+        }
+      }
+      catch (std::exception &ee) {
+        xp = ROL::nullPtr;
+      }
+    }
+    return xp;
+  }
+};
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/TOOLS/meshmanagerK.hpp b/packages/rol/example/PDE-OPT/TOOLS/meshmanagerK.hpp
new file mode 100644
index 000000000000..13ce09f4ca71
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/TOOLS/meshmanagerK.hpp
@@ -0,0 +1,1147 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  MeshManager.hpp
+    \brief Defines the MeshManager classes.
+*/
+
+#ifndef MESHMANAGERK_HPP
+#define MESHMANAGERK_HPP
+
+#include "Kokkos_DynRankView.hpp"
+#include "ROL_ParameterList.hpp"
+#include "ROL_Ptr.hpp"
+
+/** \class  MeshManager
+    \brief  This is the pure virtual parent class for mesh construction
+            and management; it enables the generation of a few select
+            meshes and the setup of data structures for computing
+            cell/subcell adjacencies, for example, the cell-to-node
+            adjacency map and the cell-to-edge adjacency map.
+*/
+template <class Real, class DeviceType>
+class MeshManager {
+
+public:
+  using scalar_view = Kokkos::DynRankView<Real,DeviceType>;
+  using int_view = Kokkos::DynRankView<int,DeviceType>;
+
+
+  /** \brief  Destructor
+   */
+  virtual ~MeshManager() {}
+
+  /** \brief Returns node coordinates.
+             Format: number_of_nodes x 2 (Real)
+                     (node_index)  x, y coordinates
+  */
+  virtual scalar_view getNodes() const = 0;
+
+  /** \brief Returns cell to node adjacencies.
+             Format: number_of_cells x number_of_nodes_per_cell (int)
+                     (cell_index)  node_index1  node_index2  ...
+  */
+  virtual int_view getCellToNodeMap() const = 0;
+
+  /** \brief Returns cell to edge adjacencies.
+             Format: number_of_cells x number_of_edges_per_cell (int)
+                     (cell_index)  edge_index1  edge_index2  ...
+  */
+  virtual int_view getCellToEdgeMap() const {
+    return int_view(); // default due to lack of edges in 1D
+  }
+
+  /** \brief Returns cell to face adjacencies.
+             Format: number_of_cells x number_of_faces_per_cell (int)
+                     (cell_index)  face_index1  face_index2  ...
+  */
+  virtual int_view getCellToFaceMap() const {
+    return int_view(); // default due to lack of faces in 1D and 2D
+  }
+
+  /** \brief Returns cell IDs per processor.
+             Format: number_of_procs vectors, each with number_of_cells cell IDs (ints)
+  */
+  virtual ROL::Ptr<std::vector<std::vector<int>>> getProcCellIds() const {
+    return ROL::makePtr<std::vector<std::vector<int>>>();
+    // default due to support for in-line partitioning,
+    // where this functionality is bypassed
+  }
+
+  /** \brief Returns sideset information.
+             Format: The std::vector components are indexed by the local side number (0, 1, 2, ...);
+                     the FieldConTainer is a 1D array of cell indices.
+             Input:  Sideset number.  Its meaning is context-dependent.
+  */
+  //virtual ROL::Ptr<std::vector<std::vector<Intrepid::FieldContainer<int> > > > getSideSets(
+  //            std::ostream & outStream = std::cout,
+  //            const bool verbose = false) const = 0;
+  virtual ROL::Ptr<std::vector<std::vector<std::vector<int> > > > getSideSets(
+              const bool verbose = false,
+              std::ostream & outStream = std::cout) const = 0;
+
+  /** \brief Returns number of cells.
+  */
+  virtual int getNumCells() const = 0;
+
+  /** \brief Returns number of nodes.
+  */
+  virtual int getNumNodes() const = 0;
+
+  /** \brief Returns number of edges.
+  */
+  virtual int getNumEdges() const {
+    return 0; // default due to lack of edges in 1D
+  }
+
+  /** \brief Returns number of faces.
+  */
+  virtual int getNumFaces() const {
+    return 0; // default due to lack of faces in 1D and 2D
+  }
+
+}; // MeshManager
+
+
+
+/** \class  MeshManager_BackwardFacingStepChannel
+    \brief  Mesh construction and mesh management for the
+            backward-facing step channel geometry, on
+            quadrilateral grids.
+*/
+template <class Real, class DeviceType>
+class MeshManager_BackwardFacingStepChannel : public MeshManager<Real,DeviceType> {
+
+/* Backward-facing step geometry.
+
+  ***************************************************
+  *        *           *                            *
+  *   3    *     4     *             5              *
+  *        *           *                            *
+  ********** * * * * * * * * * * * * * * * * * * * **
+           *     1     *             2              *
+           *           *                            *
+           ******************************************
+
+*/
+
+public:
+  using scalar_view = typename MeshManager<Real,DeviceType>::scalar_view;
+  using int_view = typename MeshManager<Real,DeviceType>::int_view;
+
+private:
+  Real channelH_;   // channel height (height of regions 1+4)
+  Real channelW_;   // channel width (width of regions 3+4+5)
+  Real stepH_;      // step height (height of region 1)
+  Real stepW_;      // step width (width of region 3)
+  Real observeW_;   // width of observation region (width of region 1)
+  
+  int ref_;          // mesh refinement level
+
+  int nx1_;
+  int nx2_;
+  int nx3_;
+  int nx4_;
+  int nx5_;
+  int ny1_;
+  int ny2_;
+  int ny3_;
+  int ny4_;
+  int ny5_;
+
+  int numCells_;
+  int numNodes_;
+  int numEdges_;
+
+  scalar_view meshNodes_;
+  int_view    meshCellToNodeMap_;
+  int_view    meshCellToEdgeMap_;
+  ROL::Ptr<std::vector<std::vector<std::vector<int> > > >  meshSideSets_;
+
+public:
+
+  MeshManager_BackwardFacingStepChannel(ROL::ParameterList &parlist) {
+    // Geometry data.
+    channelH_ = parlist.sublist("Geometry").get(   "Channel height", 1.0);
+    channelW_ = parlist.sublist("Geometry").get(    "Channel width", 8.0);
+    stepH_    = parlist.sublist("Geometry").get(      "Step height", 0.5);
+    stepW_    = parlist.sublist("Geometry").get(       "Step width", 1.0);
+    observeW_ = parlist.sublist("Geometry").get("Observation width", 3.0);
+    // Mesh data.
+    ref_   = parlist.sublist("Geometry").get(      "Refinement level", 1);
+    /*nx1_   = parlist.sublist("Geometry").get( "Observation region NX", 3*2*ref_);
+    ny1_   = parlist.sublist("Geometry").get( "Observation region NY", 1*ref_);
+    nx2_   = parlist.sublist("Geometry").get("Laminar flow region NX", 4*2*ref_);
+    ny2_   = ny1_;
+    nx3_   = parlist.sublist("Geometry").get(      "Inflow region NX", 1*2*ref_);
+    ny3_   = parlist.sublist("Geometry").get(      "Inflow region NY", 1*ref_);*/
+    nx1_   = parlist.sublist("Geometry").get( "Observation region NX", 4*ref_);
+    ny1_   = parlist.sublist("Geometry").get( "Observation region NY", 5*ref_);
+    nx2_   = parlist.sublist("Geometry").get("Laminar flow region NX", 2*ref_);
+    ny2_   = ny1_;
+    nx3_   = parlist.sublist("Geometry").get(      "Inflow region NX", 1*ref_);
+    ny3_   = parlist.sublist("Geometry").get(      "Inflow region NY", 2*ref_);
+    nx4_   = nx1_;
+    ny4_   = ny3_;
+    nx5_   = nx2_;
+    ny5_   = ny3_;
+    numCells_ = (nx1_ + nx2_)*ny1_  +  (nx3_ + nx1_ + nx2_)*ny3_;
+    numNodes_ = (nx1_ + nx2_ + 1)*ny1_  +  (nx3_ + nx1_ + nx2_ + 1)*(ny3_ + 1);
+    numEdges_ = (2*(nx1_ + nx2_)+1)*ny1_ + (2*(nx3_ + nx1_ + nx2_)+1)*ny3_ + (nx3_ + nx1_ + nx2_);
+    // Compute mesh data structures.
+    computeNodes();
+    computeCellToNodeMap();
+    computeCellToEdgeMap();
+    computeSideSets();
+  }
+
+
+  scalar_view getNodes() const override {
+    return meshNodes_;
+  }
+
+
+  int_view getCellToNodeMap() const override {
+    return meshCellToNodeMap_;
+  }
+
+
+  int_view getCellToEdgeMap() const override {
+    return meshCellToEdgeMap_;
+  }
+
+
+  ROL::Ptr<std::vector<std::vector<std::vector<int> > > > getSideSets(
+              const bool verbose = false,
+              std::ostream & outStream = std::cout) const override { 
+    return meshSideSets_;
+  }
+
+
+  int getNumCells() const override {
+    return numCells_;
+  } // getNumCells
+
+
+  int getNumNodes() const override {
+    return numNodes_;
+  } // getNumNodes
+
+
+  int getNumEdges() const override {
+    return numEdges_;
+  } // getNumEdges
+
+
+private:
+
+  void computeNodes() {
+
+    meshNodes_ = scalar_view("meshNodes_", numNodes_, 2);
+    auto nodes = meshNodes_;
+
+    Real dy1 = stepH_ / ny1_;
+    Real dy3 = (channelH_ - stepH_) / ny3_;
+    Real dx1 = observeW_ / nx1_;
+    Real dx2 = (channelW_ - stepW_ - observeW_) / nx2_;
+    Real dx3 = stepW_ / nx3_;
+    int nodeCt = 0;
+
+    // bottom region
+    for (int j=0; j<ny1_; ++j) {
+      for (int i=0; i<=nx1_; ++i) {
+        nodes(nodeCt, 0) = stepW_ + i*dx1;
+        nodes(nodeCt, 1) = j*dy1;
+        ++nodeCt;
+      }
+      for (int i=0; i<nx2_; ++i) {
+        nodes(nodeCt, 0) = stepW_ + observeW_ + (i+1)*dx2;
+        nodes(nodeCt, 1) = j*dy1;
+        ++nodeCt;
+      }
+    }
+
+    // top region
+    for (int j=0; j<=ny3_; ++j) {
+      for (int i=0; i<=nx3_; ++i) {
+        nodes(nodeCt, 0) = i*dx3;
+        nodes(nodeCt, 1) = stepH_ + j*dy3;
+        ++nodeCt;
+      }
+      for (int i=0; i<nx1_; ++i) {
+        nodes(nodeCt, 0) = stepW_ + (i+1)*dx1;
+        nodes(nodeCt, 1) = stepH_ + j*dy3;
+        ++nodeCt;
+      }
+      for (int i=0; i<nx2_; ++i) {
+        nodes(nodeCt, 0) = stepW_ + observeW_ + (i+1)*dx2;
+        nodes(nodeCt, 1) = stepH_ + j*dy3;
+        ++nodeCt;
+      }
+    }
+
+  }  // computeNodes
+
+
+  void computeCellToNodeMap() {
+
+    meshCellToNodeMap_ = int_view("meshCellToNodeMap_", numCells_, 4);
+    auto ctn = meshCellToNodeMap_;
+
+    int cellCt = 0;
+
+    // bottom region
+    for (int j=0; j<ny1_-1; ++j) {
+      for (int i=0; i<nx1_+nx2_; ++i) {
+        ctn(cellCt, 0) = j*(nx1_+nx2_+1) + i;
+        ctn(cellCt, 1) = j*(nx1_+nx2_+1) + (i+1);
+        ctn(cellCt, 2) = (j+1)*(nx1_+nx2_+1) + (i+1);
+        ctn(cellCt, 3) = (j+1)*(nx1_+nx2_+1) + i;
+        ++cellCt;
+      }
+    }
+
+    // transition region
+    for (int i=0; i<nx1_+nx2_; ++i) {
+      ctn(cellCt, 0) = (ny1_-1)*(nx1_+nx2_+1) + i;
+      ctn(cellCt, 1) = (ny1_-1)*(nx1_+nx2_+1) + (i+1);
+      ctn(cellCt, 2) = ny1_*(nx1_+nx2_+1) + nx3_ + (i+1);
+      ctn(cellCt, 3) = ny1_*(nx1_+nx2_+1) + nx3_ + i;
+      ++cellCt;
+    }
+
+    // top region
+    for (int j=0; j<ny3_; ++j) {
+      for (int i=0; i<nx3_+nx1_+nx2_; ++i) {
+        ctn(cellCt, 0) = ny1_*(nx1_+nx2_+1) + j*(nx3_+nx1_+nx2_+1) + i;
+        ctn(cellCt, 1) = ny1_*(nx1_+nx2_+1) + j*(nx3_+nx1_+nx2_+1) + (i+1);
+        ctn(cellCt, 2) = ny1_*(nx1_+nx2_+1) + (j+1)*(nx3_+nx1_+nx2_+1) + (i+1);
+        ctn(cellCt, 3) = ny1_*(nx1_+nx2_+1) + (j+1)*(nx3_+nx1_+nx2_+1) + i;
+        ++cellCt;
+      }
+    }
+
+  } // computeCellToNodeMap
+
+
+  void computeCellToEdgeMap() {
+
+    meshCellToEdgeMap_ = int_view("meshCellToEdgeMap_", numCells_, 4);
+    auto cte = meshCellToEdgeMap_;
+
+    int cellCt = 0;
+
+    // bottom region
+    for (int j=0; j<ny1_-1; ++j) {
+      for (int i=0; i<nx1_+nx2_; ++i) {
+        cte(cellCt, 0) = j*(2*(nx1_+nx2_)+1) + i;
+        cte(cellCt, 1) = j*(2*(nx1_+nx2_)+1) + (nx1_+nx2_) + (i+1);
+        cte(cellCt, 2) = (j+1)*(2*(nx1_+nx2_)+1) + i;
+        cte(cellCt, 3) = j*(2*(nx1_+nx2_)+1) + (nx1_+nx2_) + i;
+        ++cellCt;
+      }
+    }
+
+    // transition region
+    for (int i=0; i<nx1_+nx2_; ++i) {
+      cte(cellCt, 0) = (ny1_-1)*(2*(nx1_+nx2_)+1) + i;
+      cte(cellCt, 1) = (ny1_-1)*(2*(nx1_+nx2_)+1) + (nx1_+nx2_) + (i+1);
+      cte(cellCt, 2) = ny1_*(2*(nx1_+nx2_)+1) + nx3_ + i;
+      cte(cellCt, 3) = (ny1_-1)*(2*(nx1_+nx2_)+1) + (nx1_+nx2_) + i;
+      ++cellCt;
+    }
+
+    // top region
+    for (int j=0; j<ny3_; ++j) {
+      for (int i=0; i<nx3_+nx1_+nx2_; ++i) {
+        cte(cellCt, 0) = ny1_*(2*(nx1_+nx2_)+1) + j*(2*(nx3_+nx1_+nx2_)+1) + i;
+        cte(cellCt, 1) = ny1_*(2*(nx1_+nx2_)+1) + j*(2*(nx3_+nx1_+nx2_)+1) + (nx3_+nx1_+nx2_) + (i+1);
+        cte(cellCt, 2) = ny1_*(2*(nx1_+nx2_)+1) + (j+1)*(2*(nx3_+nx1_+nx2_)+1) + i;
+        cte(cellCt, 3) = ny1_*(2*(nx1_+nx2_)+1) + j*(2*(nx3_+nx1_+nx2_)+1) + (nx3_+nx1_+nx2_) + i;
+        ++cellCt;
+      }
+    }
+
+  } // computeCellToEdgeMap
+
+
+  void setRefinementLevel(const int &refLevel) {
+    ref_ = refLevel;
+    computeNodes();
+    computeCellToNodeMap();
+    computeCellToEdgeMap();
+  } // setRefinementLevel
+
+
+  virtual void computeSideSets() {
+
+    meshSideSets_ = ROL::makePtr<std::vector<std::vector<std::vector<int> > >>(11);
+    int numSides = 4;
+    (*meshSideSets_)[0].resize(numSides); // bottom
+    (*meshSideSets_)[1].resize(numSides); // right lower
+    (*meshSideSets_)[2].resize(numSides); // right upper
+    (*meshSideSets_)[3].resize(numSides); // top
+    (*meshSideSets_)[4].resize(numSides); // left upper
+    (*meshSideSets_)[5].resize(numSides); // middle
+    (*meshSideSets_)[6].resize(numSides); // left lower
+    (*meshSideSets_)[7].resize(1); // L-corner cell
+    (*meshSideSets_)[8].resize(numSides); // top corner cell
+    (*meshSideSets_)[9].resize(1); // between side 2 and 3
+    (*meshSideSets_)[10].resize(numSides); // top corner cell
+    (*meshSideSets_)[0][0].resize(nx1_+nx2_);
+    (*meshSideSets_)[1][1].resize(ny2_);
+    (*meshSideSets_)[2][1].resize(ny5_);
+    (*meshSideSets_)[3][2].resize(nx3_+nx4_+nx5_);
+    (*meshSideSets_)[4][3].resize(ny3_);
+    (*meshSideSets_)[5][0].resize(nx3_);
+    (*meshSideSets_)[6][3].resize(ny1_-1);
+    (*meshSideSets_)[7][0].resize(1);
+    (*meshSideSets_)[8][3].resize(1);
+    (*meshSideSets_)[9][0].resize(1);
+    (*meshSideSets_)[10][3].resize(ny1_);
+
+    for (int i=0; i<nx1_+nx2_; ++i) {
+      (*meshSideSets_)[0][0][i] = i;
+    }
+    for (int i=0; i<ny2_; ++i) {
+      (*meshSideSets_)[1][1][i] = (i+1)*(nx1_+nx2_) - 1;
+    }
+    int offset = nx1_*ny1_+nx2_*ny2_;
+    for (int i=0; i<ny5_; ++i) {
+      (*meshSideSets_)[2][1][i] = offset + (i+1)*(nx3_+nx4_+nx5_) - 1;
+    }
+    for (int i=0; i<nx3_+nx4_+nx5_; ++i) {
+      (*meshSideSets_)[3][2][i] = offset + (ny3_-1)*(nx3_+nx4_+nx5_) + i;
+    }
+    for (int i=0; i<ny3_; ++i) {
+      (*meshSideSets_)[4][3][i] = offset + i*(nx3_+nx4_+nx5_);
+    }
+    for (int i=0; i<nx3_; ++i) {
+      (*meshSideSets_)[5][0][i] = offset + i;
+    }
+    for (int i=0; i<ny1_-1; ++i) {
+      (*meshSideSets_)[6][3][i] = i*(nx1_+nx2_);
+    }
+    (*meshSideSets_)[7][0][0] = offset + nx3_;
+    (*meshSideSets_)[8][3][0] = (ny1_-1)*(nx1_+nx2_);
+    (*meshSideSets_)[9][0][0] = offset + ny5_*(nx3_+nx4_+nx5_) - 1;
+    for (int i=0; i<ny1_; ++i) {
+      (*meshSideSets_)[10][3][i] = i*(nx1_+nx2_);
+    }
+
+  } // computeSideSets
+
+}; // MeshManager_BackwardFacingStepChannel
+
+
+
+/** \class  MeshManager_Rectangle
+    \brief  Mesh construction and mesh management for the
+            rectangle geometry, on quadrilateral grids.
+*/
+template <class Real, class DeviceType>
+class MeshManager_Rectangle : public MeshManager<Real, DeviceType> {
+
+/* Rectangle geometry.
+ 
+      ***********************
+      *                     *   :
+      *                     *   |
+      *                     * height
+      *                     *   |
+      *                     *   :
+      *                     *
+      ***********************
+  (X0,Y0)   :--width--:
+
+*/
+
+public:
+  using scalar_view = typename MeshManager<Real,DeviceType>::scalar_view;
+  using int_view = typename MeshManager<Real,DeviceType>::int_view;
+
+private:
+  Real width_;   // rectangle height
+  Real height_;  // rectangle width
+  Real X0_;      // x coordinate of bottom left corner
+  Real Y0_;      // y coordinate of bottom left corner
+
+  int nx_;
+  int ny_;
+
+  int numCells_;
+  int numNodes_;
+  int numEdges_;
+
+  scalar_view meshNodes_;
+  int_view    meshCellToNodeMap_;
+  int_view    meshCellToEdgeMap_;
+
+  ROL::Ptr<std::vector<std::vector<std::vector<int> > > >  meshSideSets_;
+
+public:
+
+  MeshManager_Rectangle(ROL::ParameterList &parlist) {
+    // Geometry data.
+    width_  = parlist.sublist("Geometry").get( "Width", 3.0);
+    height_ = parlist.sublist("Geometry").get("Height", 1.0);
+    X0_     = parlist.sublist("Geometry").get(    "X0", 0.0);
+    Y0_     = parlist.sublist("Geometry").get(    "Y0", 0.0);
+    // Mesh data.
+    nx_ = parlist.sublist("Geometry").get("NX", 3);
+    ny_ = parlist.sublist("Geometry").get("NY", 1);
+    numCells_ = nx_ * ny_;
+    numNodes_ = (nx_+1) * (ny_+1);
+    numEdges_ = (nx_+1)*ny_ + (ny_+1)*nx_;
+    // Compute and store mesh data structures.
+    computeNodes();
+    computeCellToNodeMap();
+    computeCellToEdgeMap();
+    computeSideSets();
+  }
+
+
+  scalar_view getNodes() const override {
+    return meshNodes_;
+  }
+
+
+  int_view getCellToNodeMap() const override {
+    return meshCellToNodeMap_;
+  }
+
+
+  int_view getCellToEdgeMap() const override {
+    return meshCellToEdgeMap_;
+  }
+
+
+  virtual ROL::Ptr<std::vector<std::vector<std::vector<int>>>> getSideSets(
+              const bool verbose = false,
+              std::ostream & outStream = std::cout) const override { 
+    return meshSideSets_;
+  }
+
+
+  int getNumCells() const override {
+    return numCells_;
+  } // getNumCells
+
+
+  int getNumNodes() const override {
+    return numNodes_;
+  } // getNumNodes
+
+
+  int getNumEdges() const override {
+    return numEdges_;
+  } // getNumEdges
+
+private:
+
+  void computeNodes() {
+
+    meshNodes_ = scalar_view("meshNodes_", numNodes_, 2);
+    auto nodes = meshNodes_;
+
+    Real dx = width_ / nx_;
+    Real dy = height_ / ny_;
+    int nodeCt = 0;
+
+    for (int j=0; j<=ny_; ++j) {
+      Real ycoord = Y0_ + j*dy;
+      for (int i=0; i<=nx_; ++i) {
+        nodes(nodeCt, 0) = X0_ + i*dx;
+        nodes(nodeCt, 1) = ycoord; 
+        ++nodeCt;
+      }
+    }
+
+  } // computeNodes
+
+
+  void computeCellToNodeMap() {
+
+    meshCellToNodeMap_ = int_view("meshCellToNodeMap_", numCells_, 4);
+    auto ctn = meshCellToNodeMap_;
+
+    int cellCt = 0;
+
+    for (int j=0; j<ny_; ++j) {
+      for (int i=0; i<nx_; ++i) {
+        ctn(cellCt, 0) = j*(nx_+1) + i;
+        ctn(cellCt, 1) = j*(nx_+1) + (i+1);
+        ctn(cellCt, 2) = (j+1)*(nx_+1) + (i+1);
+        ctn(cellCt, 3) = (j+1)*(nx_+1) + i;
+        ++cellCt;
+      }
+    }
+
+  } // computeCellToNodeMap
+
+
+  void computeCellToEdgeMap() {
+
+    meshCellToEdgeMap_ = int_view("meshCellToEdgeMap_", numCells_, 4);
+    auto cte = meshCellToEdgeMap_;
+
+    int cellCt = 0;
+
+    for (int j=0; j<ny_; ++j) {
+      for (int i=0; i<nx_; ++i) {
+        cte(cellCt, 0) = j*(2*nx_+1) + i;
+        cte(cellCt, 1) = j*(2*nx_+1) + nx_ + (i+1);
+        cte(cellCt, 2) = (j+1)*(2*nx_+1) + i;
+        cte(cellCt, 3) = j*(2*nx_+1) + nx_ + i;
+        ++cellCt;
+      }
+    }
+
+  } // computeCellToEdgeMap
+
+
+  virtual void computeSideSets() {
+
+    meshSideSets_ = ROL::makePtr<std::vector<std::vector<std::vector<int> > >>(1);
+    int numSides = 4;
+    (*meshSideSets_)[0].resize(numSides);
+    (*meshSideSets_)[0][0].resize(nx_);
+    (*meshSideSets_)[0][1].resize(ny_);
+    (*meshSideSets_)[0][2].resize(nx_);
+    (*meshSideSets_)[0][3].resize(ny_);
+
+    for (int i=0; i<nx_; ++i) {
+      (*meshSideSets_)[0][0][i] = i;
+    }
+    for (int i=0; i<ny_; ++i) {
+      (*meshSideSets_)[0][1][i] = (i+1)*nx_-1;
+    }
+    for (int i=0; i<nx_; ++i) {
+      (*meshSideSets_)[0][2][i] = i + nx_*(ny_-1);
+    }
+    for (int i=0; i<ny_; ++i) {
+      (*meshSideSets_)[0][3][i] = i*nx_;
+    }
+
+  } // computeSideSets
+
+
+}; // MeshManager_Rectangle
+
+
+template<class Real, class DeviceType>
+class MeshManager_Interval : public MeshManager<Real, DeviceType> {
+
+/* Interval geometry [X0,X0+width] */
+
+public:
+  using scalar_view = typename MeshManager<Real,DeviceType>::scalar_view;
+  using int_view = typename MeshManager<Real,DeviceType>::int_view;
+
+private:
+  Real width_;    // Interval width
+  Real X0_;       // x coordinate left corner
+
+  int nx_;
+
+  int numCells_;
+  int numNodes_;
+
+  scalar_view meshNodes_;
+  int_view meshCellToNodeMap_;
+
+  ROL::Ptr<std::vector<std::vector<std::vector<int> > > > meshSideSets_;
+
+public:
+
+  MeshManager_Interval(ROL::ParameterList &parlist) {
+
+    // Geometry data
+    width_    = parlist.sublist("Geometry").get("Width", 1.0);
+    X0_       = parlist.sublist("Geometry").get("X0", 0.0);
+
+    // Mesh data
+    nx_       = parlist.sublist("Geometry").get("NX",10);
+
+    numCells_ = nx_;
+    numNodes_ = nx_+1;
+
+    // Compute and store mesh data structures
+    computeNodes();
+    computeCellToNodeMap();
+    computeSideSets();
+
+  }
+
+  scalar_view getNodes() const override {
+    return meshNodes_;
+  }
+
+  int_view getCellToNodeMap() const override {
+    return meshCellToNodeMap_;
+  }
+
+  ROL::Ptr<std::vector<std::vector<std::vector<int> > > > getSideSets(
+              const bool verbose = false,
+              std::ostream & outStream = std::cout) const override {
+    return meshSideSets_;
+  }
+
+  int getNumCells() const override {
+    return numCells_;
+  }
+
+  int getNumNodes() const override {
+    return numNodes_;
+  } // getNumNodes
+
+
+private:
+
+  void computeNodes() {
+
+    meshNodes_ = scalar_view("meshNodes_", numNodes_,1);
+    auto nodes = meshNodes_;
+
+    Real dx = width_ / nx_;
+
+    for( int i=0; i<nx_+1; ++i ) {
+      nodes(i, 0) = X0_ + i*dx;
+    }
+  } // computeNodes
+
+
+  void computeCellToNodeMap() {
+
+    meshCellToNodeMap_ = int_view("meshCellToNodeMap_", numCells_,2);
+    auto ctn = meshCellToNodeMap_;
+
+    for( int i=0; i<nx_; ++i ) {
+      ctn(i,0) = i;
+      ctn(i,1) = i+1;
+    }
+  } // computeCellToNodeMap
+
+
+  virtual void computeSideSets() {
+    int numSideSets = 2;
+    int numSides = 2;
+    meshSideSets_
+      = ROL::makePtr<std::vector<std::vector<std::vector<int> > >>(numSideSets);
+
+    (*meshSideSets_)[0].resize(numSides);
+    (*meshSideSets_)[0][0].resize(1);
+    (*meshSideSets_)[0][0][0] = 0;
+    (*meshSideSets_)[0][1].resize(0);
+
+    (*meshSideSets_)[1].resize(numSides);
+    (*meshSideSets_)[1][0].resize(0);
+    (*meshSideSets_)[1][1].resize(1);
+    (*meshSideSets_)[1][1][0] = numCells_-1;
+
+  } // computeSideSets
+
+}; // MeshManager_Interval
+
+
+template<class Real, class DeviceType>
+class MeshManager_Fractional_Cylinder : public MeshManager<Real, DeviceType> {
+
+/* Line geometry [X0,X0+width] */
+
+public:
+  using scalar_view = typename MeshManager<Real,DeviceType>::scalar_view;
+  using int_view = typename MeshManager<Real,DeviceType>::int_view;
+
+private:
+  Real width_;    // Interval width
+  Real X0_;       // x coordinate left corner
+
+  int nx_;
+
+  Real gamma_;
+
+  int numCells_;
+  int numNodes_;
+  int numEdges_;
+
+  scalar_view meshNodes_;
+  int_view    meshCellToNodeMap_;
+  int_view    meshCellToEdgeMap_;
+
+  ROL::Ptr<std::vector<std::vector<std::vector<int> > > > meshSideSets_;
+
+public:
+
+  MeshManager_Fractional_Cylinder(ROL::ParameterList &parlist) : X0_(0) {
+    // Mesh data
+    gamma_ = parlist.sublist("Geometry").sublist("Cylinder").get("Grading Parameter",0.5);
+    nx_    = parlist.sublist("Geometry").sublist("Cylinder").get("NI",10);
+    width_ = parlist.sublist("Geometry").sublist("Cylinder").get("Height",2.0);
+
+    std::cout << "GAMMA: " << gamma_ << "  NX: " << nx_ << "  WIDTH: " << width_ << std::endl;
+
+    numCells_ = nx_;
+    numNodes_ = nx_+1;
+    numEdges_ = 2;
+
+    std::cout << "NUMCELLS: " << numCells_ << "  NUMNODES: " << numNodes_ << "  NUMEDGES: " << numEdges_ << std::endl;
+
+    // Compute and store mesh data structures
+    computeNodes();
+    computeCellToNodeMap();
+    computeCellToEdgeMap();
+    computeSideSets();
+  }
+
+  scalar_view getNodes() const override {
+    return meshNodes_;
+  }
+
+  int_view getCellToNodeMap() const override {
+    return meshCellToNodeMap_; 
+  }
+
+  int_view getCellToEdgeMap() const override {
+    return meshCellToEdgeMap_;
+  }
+
+  ROL::Ptr<std::vector<std::vector<std::vector<int> > > > getSideSets(
+              const bool verbose = false,
+              std::ostream & outStream = std::cout) const override { 
+    return meshSideSets_;
+  }
+
+  int getNumCells() const override {
+    return numCells_;
+  }
+
+  int getNumNodes() const override {
+    return numNodes_;
+  } // getNumNodes
+
+
+  int getNumEdges() const override {
+    return numEdges_;
+  } // getNumEdges
+
+
+private:
+
+  void computeNodes() {
+    meshNodes_ = scalar_view("meshNodes_", numNodes_,1);
+
+    for( int i=0; i<nx_+1; ++i ) {
+      meshNodes_(i, 0) = X0_ + std::pow(static_cast<Real>(i)/nx_,gamma_) * width_;
+    }
+  } // computeNodes
+
+
+  void computeCellToNodeMap() {
+    meshCellToNodeMap_ = int_view("meshCellToNodeMap_", numCells_,2);
+
+    for( int i=0; i<nx_; ++i ) {
+      meshCellToNodeMap_(i,0) = i;
+      meshCellToNodeMap_(i,1) = i+1;
+    }
+  } // computeCellToNodeMap
+
+
+  void computeCellToEdgeMap() {
+    meshCellToEdgeMap_ = int_view("meshCellToEdgeMap_", numCells_,1);
+
+    for( int i=0; i<nx_; ++i ) {
+      meshCellToEdgeMap_(i,0) = i;
+    }
+  } // computeCellToEdgeMap
+
+
+  virtual void computeSideSets() {
+    int numSideSets = 2;
+    int numSides = 2;
+    meshSideSets_
+      = ROL::makePtr<std::vector<std::vector<std::vector<int> > >>(numSideSets);
+
+    (*meshSideSets_)[0].resize(numSides);
+    (*meshSideSets_)[0][0].resize(1);
+    (*meshSideSets_)[0][0][0] = 0;
+    (*meshSideSets_)[0][1].resize(0);
+
+    (*meshSideSets_)[1].resize(numSides);
+    (*meshSideSets_)[1][0].resize(0);
+    (*meshSideSets_)[1][1].resize(1);
+    (*meshSideSets_)[1][1][0] = numCells_-1;
+
+  } // computeSideSets
+
+}; // MeshManager_Fractional_Cylinder
+
+
+/** \class  MeshManager_Brick
+    \brief  Mesh construction and mesh management for the
+            brick geometry, on hexahedral grids.
+*/
+template <class Real, class DeviceType>
+class MeshManager_Brick : public MeshManager<Real, DeviceType> {
+
+/* Brick geometry.
+
+                   ***********************
+                 *                     * *
+  :--depth--:  *                     *   *
+             *                     *     *
+           ***********************       *
+           *                     *   :   *
+           *                     *   |   *
+           *                     * height
+           *                     *   |
+           *                     *   :
+           *                     * *
+           ***********************
+      (X0,Y0,Z0) :--width--:
+
+  (X0,Y0,Z0) denote the *smallest* values of each coordinate.
+
+*/
+
+public:
+  using scalar_view = typename MeshManager<Real,DeviceType>::scalar_view;
+  using int_view = typename MeshManager<Real,DeviceType>::int_view;
+
+private:
+  Real width_;   // rectangle width
+  Real depth_;   // rectangle depth
+  Real height_;  // rectangle height
+  Real X0_;      // x coordinate of bottom left front corner
+  Real Y0_;      // y coordinate of bottom left front corner
+  Real Z0_;      // z coordinate of bottom left front corner
+
+  int nx_;
+  int ny_;
+  int nz_;
+
+  int numCells_;
+  int numNodes_;
+  int numEdges_;
+
+  scalar_view meshNodes_;
+  int_view  meshCellToNodeMap_;
+  int_view  meshCellToEdgeMap_;
+
+  ROL::Ptr<std::vector<std::vector<std::vector<int> > > >  meshSideSets_;
+
+public:
+
+  MeshManager_Brick(ROL::ParameterList &parlist) {
+    // Geometry data.
+    width_  = parlist.sublist("Geometry").get( "Width", 3.0);
+    depth_  = parlist.sublist("Geometry").get( "Depth", 2.0);
+    height_ = parlist.sublist("Geometry").get("Height", 1.0);
+    X0_     = parlist.sublist("Geometry").get(    "X0", 0.0);
+    Y0_     = parlist.sublist("Geometry").get(    "Y0", 0.0);
+    Z0_     = parlist.sublist("Geometry").get(    "Z0", 0.0);
+    // Mesh data.
+    nx_ = parlist.sublist("Geometry").get("NX", 3);
+    ny_ = parlist.sublist("Geometry").get("NY", 2);
+    nz_ = parlist.sublist("Geometry").get("NZ", 1);
+    numCells_ = nx_ * ny_ * nz_;
+    numNodes_ = (nx_+1) * (ny_+1) * (nz_+1);
+    numEdges_ = ((nx_+1)*ny_ + (ny_+1)*nx_)*(nz_+1) + (nx_+1)*(ny_+1)*nz_;
+    // Compute and store mesh data structures.
+    computeNodes(); 
+    computeCellToNodeMap(); 
+    computeCellToEdgeMap();
+    computeSideSets();
+  }
+
+
+  scalar_view getNodes() const override {
+    return meshNodes_;
+  }
+
+
+  int_view getCellToNodeMap() const override {
+    return meshCellToNodeMap_;
+  }
+
+
+  int_view getCellToEdgeMap() const override {
+    return meshCellToEdgeMap_;
+  }
+
+
+  ROL::Ptr<std::vector<std::vector<std::vector<int> > > > getSideSets(
+              const bool verbose = false,
+              std::ostream & outStream = std::cout) const override { 
+    return meshSideSets_;
+  }
+
+
+  int getNumCells() const override {
+    return numCells_;
+  } // getNumCells
+
+
+  int getNumNodes() const override {
+    return numNodes_;
+  } // getNumNodes
+
+
+  int getNumEdges() const override {
+    return numEdges_;
+  } // getNumEdges
+
+private:
+
+  void computeNodes() {
+
+    meshNodes_ = scalar_view("meshNodes_", numNodes_, 3);
+
+    Real dx = width_ / nx_;
+    Real dy = depth_ / ny_;
+    Real dz = height_ / nz_;
+    int nodeCt = 0;
+
+    for (int k=0; k<=nz_; ++k) {
+      Real zcoord = Z0_ + k*dz;
+      for (int j=0; j<=ny_; ++j) {
+        Real ycoord = Y0_ + j*dy;
+        for (int i=0; i<=nx_; ++i) {
+          meshNodes_(nodeCt, 0) = X0_ + i*dx;
+          meshNodes_(nodeCt, 1) = ycoord; 
+          meshNodes_(nodeCt, 2) = zcoord; 
+          ++nodeCt;
+        }
+      }
+    }
+
+  } // computeNodes
+
+
+  void computeCellToNodeMap() {
+
+    meshCellToNodeMap_ = int_view("meshCellToNodeMap_", numCells_, 8);
+    auto ctn = meshCellToNodeMap_;
+
+    int cellCt = 0;
+
+    int numNodesXY = (nx_+1)*(ny_+1);
+
+    for (int k=0; k<nz_; ++k) {
+      for (int j=0; j<ny_; ++j) {
+        for (int i=0; i<nx_; ++i) {
+          //
+          ctn(cellCt, 0) = k*numNodesXY + j*(nx_+1) + i;
+          ctn(cellCt, 1) = k*numNodesXY + j*(nx_+1) + (i+1);
+          ctn(cellCt, 2) = k*numNodesXY + (j+1)*(nx_+1) + (i+1);
+          ctn(cellCt, 3) = k*numNodesXY + (j+1)*(nx_+1) + i;
+          //
+          ctn(cellCt, 4) = (k+1)*numNodesXY + j*(nx_+1) + i;
+          ctn(cellCt, 5) = (k+1)*numNodesXY + j*(nx_+1) + (i+1);
+          ctn(cellCt, 6) = (k+1)*numNodesXY + (j+1)*(nx_+1) + (i+1);
+          ctn(cellCt, 7) = (k+1)*numNodesXY + (j+1)*(nx_+1) + i;
+          //
+          ++cellCt;
+        }
+      }
+    }
+
+  } // computeCellToNodeMap
+
+
+  void computeCellToEdgeMap() {
+
+    meshCellToEdgeMap_ = int_view("meshCellToEdgeMap_", numCells_, 12);
+    auto cte = meshCellToEdgeMap_;
+
+    int cellCt = 0;
+
+    int numEdgesXY  = ((nx_+1)*ny_ + (ny_+1)*nx_);
+    int numEdgesZ   = (nx_+1)*(ny_+1);
+    int numEdgesXYZ = numEdgesXY + numEdgesZ;
+
+    for (int k=0; k<nz_; ++k) {
+      for (int j=0; j<ny_; ++j) {
+        for (int i=0; i<nx_; ++i) {
+          // bottom
+          cte(cellCt,  0) = k*numEdgesXYZ + j*(2*nx_+1) + i;
+          cte(cellCt,  1) = k*numEdgesXYZ + j*(2*nx_+1) + nx_ + (i+1);
+          cte(cellCt,  2) = k*numEdgesXYZ + (j+1)*(2*nx_+1) + i;
+          cte(cellCt,  3) = k*numEdgesXYZ + j*(2*nx_+1) + nx_ + i;
+          // verticals
+          cte(cellCt,  8) = k*numEdgesXYZ + numEdgesXY + j*(nx_+1) + i;
+          cte(cellCt,  9) = k*numEdgesXYZ + numEdgesXY + j*(nx_+1) + (i+1);
+          cte(cellCt, 10) = k*numEdgesXYZ + numEdgesXY + (j+1)*(nx_+1) + (i+1);
+          cte(cellCt, 11) = k*numEdgesXYZ + numEdgesXY + (j+1)*(nx_+1) + i;
+          // top
+          cte(cellCt,  4) = (k+1)*numEdgesXYZ + j*(2*nx_+1) + i;
+          cte(cellCt,  5) = (k+1)*numEdgesXYZ + j*(2*nx_+1) + nx_ + (i+1);
+          cte(cellCt,  6) = (k+1)*numEdgesXYZ + (j+1)*(2*nx_+1) + i;
+          cte(cellCt,  7) = (k+1)*numEdgesXYZ + j*(2*nx_+1) + nx_ + i;
+          ++cellCt;
+        }
+      }
+    }
+
+  } // computeCellToEdgeMap
+
+
+  virtual void computeSideSets() {
+
+    // single sideset (all of the boundary)
+    meshSideSets_ = ROL::makePtr<std::vector<std::vector<std::vector<int> > >>(1);
+    // the sideset has six sides with local side ids from 0 to 5
+    int numSides = 6;
+    (*meshSideSets_)[0].resize(numSides);
+    (*meshSideSets_)[0][0].resize(nx_*nz_);
+    (*meshSideSets_)[0][1].resize(ny_*nz_);
+    (*meshSideSets_)[0][2].resize(nx_*nz_);
+    (*meshSideSets_)[0][3].resize(ny_*nz_);
+    (*meshSideSets_)[0][4].resize(nx_*ny_);
+    (*meshSideSets_)[0][5].resize(nx_*ny_);
+
+    int nxny = nx_*ny_;
+
+    for (int j=0; j<nz_; ++j) {
+      for (int i=0; i<nx_; ++i) {
+        (*meshSideSets_)[0][0][i+nx_*j] = i+nxny*j;
+      }
+    }
+    for (int j=0; j<nz_; ++j) {
+      for (int i=0; i<ny_; ++i) {
+        (*meshSideSets_)[0][1][i+ny_*j] = (i+1)*nx_-1 + nxny*j;
+      }
+    }
+    for (int j=0; j<nz_; ++j) {
+      for (int i=0; i<nx_; ++i) {
+        (*meshSideSets_)[0][2][i+nx_*j] = nx_*(ny_-1) + i + nxny*j;
+      }
+    }
+    for (int j=0; j<nz_; ++j) {
+      for (int i=0; i<ny_; ++i) {
+        (*meshSideSets_)[0][3][i+ny_*j] = i*nx_ + nxny*j;
+      }
+    }
+    for (int j=0; j<ny_; ++j) {
+      for (int i=0; i<nx_; ++i) {
+        (*meshSideSets_)[0][4][i+nx_*j] = i + nx_*j;
+      }
+    }
+    for (int j=0; j<ny_; ++j) {
+      for (int i=0; i<nx_; ++i) {
+        (*meshSideSets_)[0][5][i+nx_*j] = i + nx_*j + nxny*(nz_-1);
+      }
+    }
+
+  } // computeSideSets
+
+
+}; // MeshManager_Brick
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/TOOLS/meshreaderK.hpp b/packages/rol/example/PDE-OPT/TOOLS/meshreaderK.hpp
new file mode 100644
index 000000000000..56c35fa010e1
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/TOOLS/meshreaderK.hpp
@@ -0,0 +1,617 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  meshreaderK.hpp
+    \brief Defines the MeshReader class.
+*/
+
+#ifndef MESHREADERK_HPP
+#define MESHREADERK_HPP
+
+#include "Shards_CellTopology.hpp"
+#include "ROL_Types.hpp"
+
+#include <iostream>
+#include <fstream>
+#include <cstring>
+#include "meshmanagerK.hpp"
+
+
+/** \class  MeshReader
+    \brief  This class implements the pure virtual MeshManager interface,
+            through methods that read in text (ASCII) files generated
+            from Exodus mesh files using the 'ncdump' tool.
+*/
+template <class Real, class DeviceType>
+class MeshReader : public MeshManager<Real, DeviceType> {
+
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using int_view = Kokkos::DynRankView<int, DeviceType>;
+
+
+private:
+
+  Teuchos::ParameterList parlist_;
+
+  int spaceDim_;
+  int numNodes_;
+  int numCells_;
+  int numEdges_;
+  int numFaces_;
+  int numSideSets_;
+  int numNodesPerCell_;
+  int numEdgesPerCell_;
+  int numFacesPerCell_;
+  int numNodesPerFace_;
+  int numProcs_;
+
+  ROL::Ptr<shards::CellTopology> cellTopo_;
+
+  scalar_view meshNodes_;
+  int_view  meshCellToNodeMap_;
+  int_view  meshCellToEdgeMap_;
+  int_view  meshCellToFaceMap_;
+
+  ROL::Ptr<std::vector<std::vector<std::vector<int>>>>  meshSideSets_;
+
+  ROL::Ptr<std::vector<std::vector<int>>>  procCellIds_;
+
+public:
+
+  /** \brief Constructor.  Parses the mesh file, and fills all data structures.
+  */
+  MeshReader(Teuchos::ParameterList & parlist, int numProcs = 0) : parlist_(parlist),
+      spaceDim_(0), numNodes_(0), numCells_(0), numEdges_(0), numFaces_(0),
+      numSideSets_(0), numNodesPerCell_(0), numEdgesPerCell_(0), numFacesPerCell_(0),
+      numNodesPerFace_(0) {
+    std::string   filename = parlist.sublist("Mesh").get("File Name", "mesh.txt");
+    std::ifstream inputfile;
+    std::string   line;
+    std::string   token;
+    std::string   shape;
+
+    inputfile.open(filename);
+
+    // Check if file readable.
+    if (!inputfile.good()) {
+      throw std::runtime_error("\nMeshReader: Could not open mesh file!\n");
+    }
+
+    // Parse file header.
+    bool processHeader = true;
+    while (processHeader) {
+
+      std::getline(inputfile, line);    // consider: while (getline(inputfile, line).good())
+      std::stringstream ssline(line);
+
+      while (ssline >> token) {
+
+        // Get space dimension.
+        if (token == "num_dim") {
+          ssline >> token;  // skip "="
+          ssline >> spaceDim_;
+          break;
+        }
+
+        // Get number of nodes.
+        if (token == "num_nodes") {
+          ssline >> token;  // skip "="
+          ssline >> numNodes_;
+          break;
+        }
+
+        // Get number of cells.
+        if (token == "num_elem") {
+          ssline >> token;  // skip "="
+          ssline >> numCells_;
+          break;
+        }
+
+        // Get number of sidesets.
+        if (token == "num_side_sets") {
+          ssline >> token;  // skip "="
+          ssline >> numSideSets_;
+          break;
+        }
+
+        // Get cell shape.
+        if (token == "connect1:elem_type") {
+          ssline >> token;  // skip "="
+          ssline >> shape;
+          processHeader = false;
+          break;
+        }
+
+      } // end tokenizing
+
+    } // end parse header
+
+    if (shape.find("TRI") != std::string::npos) {
+      cellTopo_ = ROL::makePtr<shards::CellTopology>( shards::getCellTopologyData<shards::Triangle<3>>() );
+    }
+    else if (shape.find("QUAD") != std::string::npos) {
+      cellTopo_ = ROL::makePtr<shards::CellTopology>( shards::getCellTopologyData<shards::Quadrilateral<4>>() );
+    }
+    else if (shape.find("TET") != std::string::npos) {
+      cellTopo_ = ROL::makePtr<shards::CellTopology>( shards::getCellTopologyData<shards::Tetrahedron<4>>() );
+    }
+    else if (shape.find("HEX") != std::string::npos) {
+      cellTopo_ = ROL::makePtr<shards::CellTopology>( shards::getCellTopologyData<shards::Hexahedron<8>>() );
+    }
+    else {
+      std::cout << shape << std::endl;
+      throw ROL::Exception::NotImplemented(">>> Cell shape not recognized!");
+    }
+
+    // Set up internal storage.
+    numNodesPerCell_ = cellTopo_->getVertexCount();
+    numFacesPerCell_ = cellTopo_->getFaceCount();
+    numEdgesPerCell_ = cellTopo_->getEdgeCount();
+    numNodesPerFace_ = cellTopo_->getVertexCount(2,0);
+    meshNodes_ = scalar_view("meshNodes_", numNodes_, spaceDim_);
+    meshCellToNodeMap_ = int_view("meshCellToNodeMap_", numCells_, numNodesPerCell_);
+
+    // Parse node coordinates.
+    std::vector<int> coordCount(3, 0);
+    std::vector<std::string> coordLabel(3);
+    coordLabel[0] = "coordx";
+    coordLabel[1] = "coordy";
+    coordLabel[2] = "coordz";
+    char semicolon = ';';
+
+    for (int d=0; d<spaceDim_; ++d) {
+
+      bool processCoordsHeader = true;
+      while (processCoordsHeader) {
+
+        std::getline(inputfile, line);
+        std::stringstream ssline(line);
+
+        while (ssline >> token) {
+
+          // Get node coordinates.
+          if (token == coordLabel[d]) {
+            ssline >> token;  // skip "="
+            while (std::getline(ssline, token, ',')) {
+              if (token != " ") {
+                meshNodes_(coordCount[d]++, d) = atof(token.c_str());
+              }
+            }
+            processCoordsHeader = false;
+          }
+        }
+
+      }
+
+      bool processCoords = true;
+      if (token.find(semicolon) != std::string::npos) {
+        processCoords = false;
+      }
+      while (processCoords) {
+
+        std::getline(inputfile, line);
+        std::stringstream ssline(line);
+
+        while (std::getline(ssline, token, ',')) {
+          // Get node coordinates.
+          if (token.find(semicolon) == std::string::npos) {
+            if (token != " ") {
+              meshNodes_(coordCount[d]++, d) = atof(token.c_str());
+            }
+          }
+          else { // encountered "some-number ;"
+            std::string onlyNumber = token.substr(0, token.size()-1);
+            meshNodes_(coordCount[d]++, d) = atof(onlyNumber.c_str());
+            processCoords = false;
+          }
+        }
+      } // end process coords
+
+    } // end dimension loop
+
+
+    // Find connectivity information.
+    bool searchConnect = true;
+    while (searchConnect) {
+
+      std::getline(inputfile, line);
+      std::stringstream ssline(line);
+
+      while (ssline >> token) {
+        if (token == "connect1") { // found connectivity, cell-to-node map
+          searchConnect = false;
+          break;
+        }
+      }
+    }
+
+    // Process connectivity information.
+    bool processConnectivity = true;
+    int  cellCount = 0;
+    int  nodeId = 0;
+    while (processConnectivity) {
+
+      std::getline(inputfile, line);
+      std::stringstream ssline(line);
+
+      while (std::getline(ssline, token, ',')) {
+        // Get node ids.
+        if (token.find(semicolon) == std::string::npos) {
+          if (token != " ") {
+            meshCellToNodeMap_(cellCount, nodeId++) = atoi(token.c_str())-1;
+            if (nodeId == numNodesPerCell_) {
+              nodeId = 0;
+              cellCount++;
+            }
+          }
+        }
+        else { // encountered "some-number ;"
+          std::string onlyNumber = token.substr(0, token.size()-1);
+          meshCellToNodeMap_(cellCount, numNodesPerCell_-1) = atoi(onlyNumber.c_str())-1;
+          processConnectivity = false;
+        }
+      }
+    } // end process connectivity
+
+
+    // Parse side sets.
+    meshSideSets_ = ROL::makePtr<std::vector<std::vector<std::vector<int>>>>(numSideSets_);
+    for (int ss=0; ss<numSideSets_; ++ss) {
+      if (spaceDim_ == 2) {
+        (*meshSideSets_)[ss].resize(numEdgesPerCell_);
+      }
+      else if (spaceDim_ == 3) {
+        (*meshSideSets_)[ss].resize(numFacesPerCell_);
+      }
+    }
+    std::vector<int> ssCellIds;
+
+    for (int ss=0; ss<numSideSets_; ++ss) {
+
+      ssCellIds.clear();
+
+      bool processSSHeaderCell = true;
+      while (processSSHeaderCell) {
+
+        std::getline(inputfile, line);
+        std::stringstream ssline(line);
+
+        while (ssline >> token) {
+
+          // Get cell ids.
+          if (token.substr(0,7) == "elem_ss") {
+            ssline >> token;  // skip "="
+            while (std::getline(ssline, token, ',')) {
+              if (token != " ") {
+                ssCellIds.push_back(atoi(token.c_str())-1);
+              }
+            }
+            processSSHeaderCell = false;
+          }
+        }
+
+      }
+
+      bool processSSCell = true;
+      if (token.find(semicolon) != std::string::npos) {
+        processSSCell = false;
+      }
+      while (processSSCell) {
+
+        std::getline(inputfile, line);
+        std::stringstream ssline(line);
+
+        while (std::getline(ssline, token, ',')) {
+          // Get cell ids.
+          if (token.find(semicolon) == std::string::npos) {
+            if (token != " ") {
+              ssCellIds.push_back(atoi(token.c_str())-1);
+            }
+          }
+          else { // encountered "some-number ;"
+            std::string onlyNumber = token.substr(0, token.size()-1);
+            ssCellIds.push_back(atoi(token.c_str())-1);
+            processSSCell = false;
+          }
+        }
+      } // end process side set cells
+
+      bool processSSHeaderSide = true;
+      int cellIdx = 0;
+      while (processSSHeaderSide) {
+
+        std::getline(inputfile, line);
+        std::stringstream ssline(line);
+
+        while (ssline >> token) {
+
+          // Get local side ids.
+          if (token.substr(0,7) == "side_ss") {
+            ssline >> token;  // skip "="
+            while (std::getline(ssline, token, ',')) {
+              if (token != " ") {
+                (*meshSideSets_)[ss][atoi(token.c_str())-1].push_back(ssCellIds[cellIdx]);
+                cellIdx++;
+              }
+            }
+            processSSHeaderSide = false;
+          }
+        }
+
+      }
+
+      bool processSSSide = true;
+      if (token.find(semicolon) != std::string::npos) {
+        processSSSide = false;
+      }
+      while (processSSSide) {
+
+        std::getline(inputfile, line);
+        std::stringstream ssline(line);
+
+        while (std::getline(ssline, token, ',')) {
+          // Get cell ids.
+          if (token.find(semicolon) == std::string::npos) {
+            if (token != " ") {
+              (*meshSideSets_)[ss][atoi(token.c_str())-1].push_back(ssCellIds[cellIdx]);
+              cellIdx++;
+            }
+          }
+          else { // encountered "some-number ;"
+            std::string onlyNumber = token.substr(0, token.size()-1);
+            (*meshSideSets_)[ss][atoi(token.c_str())-1].push_back(ssCellIds[cellIdx]);
+            ssCellIds.clear();
+            processSSSide = false;
+          }
+        }
+      } // end process side set sides
+
+
+    } // end dimension loop
+
+    inputfile.close();
+
+    computeCellToEdgeMap();
+
+    computeCellToFaceMap();
+
+
+    // Parse parallel partitions.
+
+    if ((numProcs != 0) && (numProcs < 2)) {
+      throw std::runtime_error("\n>>> The number of processors is either zero (serial execution, by convention) or at least 2 (parallel execution)!\n");
+    }
+    numProcs_ = numProcs;
+    procCellIds_ = ROL::makePtr<std::vector<std::vector<int>>>(numProcs_);
+
+    for (int proc=0; proc<numProcs_; ++proc) {
+      std::string procfile = filename + "." + std::to_string(numProcs_) + "." + std::to_string(proc);
+      inputfile.open(procfile);
+      int procNumCells;
+      int cellCt = 0;
+
+      // Check if file readable.
+      if (!inputfile.good()) {
+        throw std::runtime_error("\nMeshReader: Could not open mesh file!\n");
+      }
+
+      // Parse file header.
+      bool processPartitionHeader = true;
+      while (processPartitionHeader) {
+
+        std::getline(inputfile, line);    // consider: while (getline(inputfile, line).good())
+        std::stringstream ssline(line);
+
+        while (ssline >> token) {
+
+          // Get number of cells.
+          if (token == "num_elem") {
+            ssline >> token;  // skip "="
+            ssline >> procNumCells;
+            processPartitionHeader = false;
+            break;
+          }
+
+        } // end tokenizing
+
+      } // end parse header
+
+      (*procCellIds_)[proc].resize(procNumCells, 0);
+
+      bool processCellsHeader = true;
+      while (processCellsHeader) {
+
+        std::getline(inputfile, line);
+        std::stringstream ssline(line);
+
+        while (ssline >> token) {
+
+          // Get cell ids.
+          if (token == "elem_num_map") {
+            ssline >> token;  // skip "="
+            while (std::getline(ssline, token, ',')) {
+              if (token != " ") {
+                (*procCellIds_)[proc][cellCt++] = atoi(token.c_str())-1;
+              }
+            }
+            processCellsHeader = false;
+          }
+        }
+
+      }
+
+      bool processCells = true;
+      if (token.find(semicolon) != std::string::npos) {
+        processCells = false;
+      }
+      while (processCells) {
+
+        std::getline(inputfile, line);
+        std::stringstream ssline(line);
+
+        while (std::getline(ssline, token, ',')) {
+          // Get cell ids.
+          if (token.find(semicolon) == std::string::npos) {
+            if (token != " ") {
+              (*procCellIds_)[proc][cellCt++] = atoi(token.c_str())-1;
+            }
+          }
+          else { // encountered "some-number ;"
+            std::string onlyNumber = token.substr(0, token.size()-1);
+            (*procCellIds_)[proc][cellCt++] = atoi(onlyNumber.c_str())-1;
+            processCells = false;
+          }
+        }
+      } // end process cells
+
+      inputfile.close();
+    }
+
+  }
+
+
+  scalar_view getNodes() const override {
+    return meshNodes_;
+  }
+
+
+  int_view getCellToNodeMap() const override {
+    return meshCellToNodeMap_;
+  }
+
+
+  int_view getCellToEdgeMap() const override {
+    return meshCellToEdgeMap_;
+  }
+
+
+  int_view getCellToFaceMap() const override {
+    return meshCellToFaceMap_;
+  }
+
+
+  ROL::Ptr<std::vector<std::vector<std::vector<int> > > > getSideSets (
+      const bool verbose = false,
+      std::ostream & outStream = std::cout) const override {
+    if (verbose) {
+      for (int i=0; i<numSideSets_; ++i) {
+        outStream << "\nSideset " << i << std::endl;
+        for (int j=0; j<numFacesPerCell_; ++j) {
+          outStream << "    Local side " << j << ":";
+          for (int k=0; k<static_cast<int>((*meshSideSets_)[i][j].size()); ++k) {
+            outStream << " " << (*meshSideSets_)[i][j][k];
+          }
+          outStream << std::endl;
+        }
+      }
+    }
+    return meshSideSets_;
+  }
+
+
+  ROL::Ptr<std::vector<std::vector<int>>> getProcCellIds() const override {
+    return procCellIds_;
+  }
+
+
+  int getNumCells() const override {
+    return numCells_;
+  } // getNumCells
+
+
+  int getNumNodes() const override {
+    return numNodes_;
+  } // getNumNodes
+
+
+  int getNumEdges() const override {
+    return numEdges_;
+  } // getNumEdges
+
+  int getNumFaces() const override {
+    return numFaces_;
+  } // getNumEdges
+
+  int getNumSideSets() const {
+    return numSideSets_;
+  } // getNumSideSets
+
+  void computeCellToEdgeMap() {
+    meshCellToEdgeMap_ = int_view("meshCellToEdgeMap_", numCells_, numEdgesPerCell_);
+    std::set<int> edgenodes;
+    std::map<std::set<int>,int> edgemap;
+    int edgeCt = 0;
+
+    /* Build edge map. */
+    for (int i=0; i<numCells_; ++i) { // loop over cells
+      for (int j=0; j<numEdgesPerCell_; ++j) { // loop over local edges
+        edgenodes.insert(meshCellToNodeMap_(i, cellTopo_->getNodeMap(1, j, 0)));
+        edgenodes.insert(meshCellToNodeMap_(i, cellTopo_->getNodeMap(1, j, 1)));
+        //std::pair<std::map<std::set<int>,int>::iterator,bool> ret;
+        auto ret = edgemap.insert(std::pair<std::set<int>,int>(edgenodes, edgeCt++));
+        meshCellToEdgeMap_(i, j) = edgemap[edgenodes];
+        if (ret.second == false) {
+          edgeCt--;
+        }
+        edgenodes.clear();
+      }
+    }
+
+    numEdges_ = edgemap.size();
+
+    /* Print edges and cell-to-edge map, for debugging. */
+    /*for (auto it_em=edgemap.begin(); it_em != edgemap.end(); ++it_em) {
+      for (auto it_ns=(it_em->first).begin(); it_ns != (it_em->first).end(); ++it_ns) {
+        std::cout << *it_ns << " , ";
+      }
+      std::cout << " => " << it_em->second << std::endl;
+    }
+    std::cout << cte;*/
+
+  }
+
+  void computeCellToFaceMap() {
+    meshCellToFaceMap_ = int_view("meshCellToFaceMap_", numCells_, numFacesPerCell_);
+    std::set<int> facenodes;
+    std::map<std::set<int>,int> facemap;
+    int faceCt = 0;
+
+    /* Build face map. */
+    for (int i=0; i<numCells_; ++i) { // loop over cells
+      for (int j=0; j<numFacesPerCell_; ++j) { // loop over local faces
+        for (int k=0; k<numNodesPerFace_; ++k) { // loop over nodes to insert in sorted order
+          facenodes.insert(meshCellToNodeMap_(i, cellTopo_->getNodeMap(2, j, k)));
+        }
+        //std::pair<std::map<std::set<int>,int>::iterator,bool> ret;
+        auto ret = facemap.insert(std::pair<std::set<int>,int>(facenodes, faceCt++));
+        meshCellToFaceMap_(i, j) = facemap[facenodes];
+        if (ret.second == false) {
+          faceCt--;
+        }
+        facenodes.clear();
+      }
+    }
+
+    numFaces_ = facemap.size();
+
+    /* Print faces and cell-to-face map, for debugging. */
+    /*for (auto it_fm=facemap.begin(); it_fm != facemap.end(); ++it_fm) {
+      for (auto it_ns=(it_fm->first).begin(); it_ns != (it_fm->first).end(); ++it_ns) {
+        std::cout << *it_ns << " , ";
+      }
+      std::cout << " => " << it_fm->second << std::endl;
+    }
+    std::cout << ctf;*/
+
+  }
+
+}; // MeshReader
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/TOOLS/pdeK.hpp b/packages/rol/example/PDE-OPT/TOOLS/pdeK.hpp
new file mode 100644
index 000000000000..23ab832294c3
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/TOOLS/pdeK.hpp
@@ -0,0 +1,265 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  pde.hpp
+    \brief Provides the interface for local (cell-based) PDE residual computations.
+*/
+
+#ifndef PDEOPT_PDEK_HPP
+#define PDEOPT_PDEK_HPP
+
+#include "Intrepid2_Basis.hpp"
+#include "ROL_Ptr.hpp"
+
+namespace Exception {
+
+  class NotImplemented : public Teuchos::ExceptionBase {
+    public:
+      NotImplemented(const std::string & what_arg) : Teuchos::ExceptionBase(what_arg) {}
+  }; // NotImplemented
+
+  class Zero : public Teuchos::ExceptionBase {
+    public:
+      Zero(const std::string & what_arg) : Teuchos::ExceptionBase(what_arg) {}
+  }; // Zero
+
+} // Exception
+
+template <class Real, class DeviceType>
+class PDE {
+public:
+  using scalar_view = Kokkos::DynRankView<Real,DeviceType>;
+  using basis_ptr = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+
+  virtual ~PDE() {}
+
+  virtual void residual(scalar_view & res,
+                        const scalar_view u_coeff,
+                        const scalar_view z_coeff = scalar_view(),
+                        const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) = 0;
+
+  virtual void Jacobian_1(scalar_view & jac,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> Jacobian_1 not implemented.");
+  }
+
+  virtual void Jacobian_2(scalar_view & jac,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> Jacobian_2 not implemented.");
+  }
+
+  virtual void Jacobian_3(std::vector<scalar_view > & jac,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> Jacobian_3 not implemented.");
+  }
+
+  virtual void applyJacobian_1(scalar_view &jv,
+                               const scalar_view v_coeff,
+                               const scalar_view u_coeff,
+                               const scalar_view z_coeff,
+                               const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> applyJacobian_1 not implemented.");
+  }
+
+  virtual void applyAdjointJacobian_1(scalar_view &jv,
+                                      const scalar_view v_coeff,
+                                      const scalar_view u_coeff,
+                                      const scalar_view z_coeff,
+                                      const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> applyAdjointJacobian_1 not implemented.");
+  }
+
+  virtual void applyJacobian_2(scalar_view &jv,
+                               const scalar_view v_coeff,
+                               const scalar_view u_coeff,
+                               const scalar_view z_coeff,
+                               const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> applyJacobian_2 not implemented.");
+  }
+
+  virtual void applyAdjointJacobian_2(scalar_view &jv,
+                                      const scalar_view v_coeff,
+                                      const scalar_view u_coeff,
+                                      const scalar_view z_coeff,
+                                      const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> applyAdjointJacobian_2 not implemented.");
+  }
+
+  virtual void Hessian_11(scalar_view & hess,
+                          const scalar_view l_coeff,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> Hessian_11 not implemented.");
+  }
+
+  virtual void Hessian_12(scalar_view & hess,
+                          const scalar_view l_coeff,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> Hessian_12 not implemented.");
+  }
+
+  virtual void Hessian_13(std::vector<scalar_view > & hess,
+                          const scalar_view l_coeff,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> Hessian_13 not implemented.");
+  }
+
+  virtual void Hessian_21(scalar_view & hess,
+                          const scalar_view l_coeff,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> Hessian_21 not implemented.");
+  }
+
+  virtual void Hessian_22(scalar_view & hess,
+                          const scalar_view l_coeff,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> Hessian_22 not implemented.");
+  }
+
+  virtual void Hessian_23(std::vector<scalar_view > & hess,
+                          const scalar_view l_coeff,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> Hessian_23 not implemented.");
+  }
+
+  virtual void Hessian_31(std::vector<scalar_view > & hess,
+                          const scalar_view l_coeff,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> Hessian_31 not implemented.");
+  }
+
+  virtual void Hessian_32(std::vector<scalar_view > & hess,
+                          const scalar_view l_coeff,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> Hessian_32 not implemented.");
+  }
+
+  virtual void Hessian_33(std::vector<std::vector<scalar_view > > & hess,
+                          const scalar_view l_coeff,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> Hessian_33 not implemented.");
+  }
+
+  virtual void applyHessian_11(scalar_view & hess,
+                               const scalar_view v_coeff,
+                               const scalar_view l_coeff,
+                               const scalar_view u_coeff,
+                               const scalar_view z_coeff = scalar_view(),
+                               const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> applyHessian_11 not implemented.");
+  }
+
+  virtual void applyHessian_12(scalar_view & hess,
+                               const scalar_view v_coeff,
+                               const scalar_view l_coeff,
+                               const scalar_view u_coeff,
+                               const scalar_view z_coeff = scalar_view(),
+                               const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> applyHessian_12 not implemented.");
+  }
+
+  virtual void applyHessian_21(scalar_view & hess,
+                               const scalar_view v_coeff,
+                               const scalar_view l_coeff,
+                               const scalar_view u_coeff,
+                               const scalar_view z_coeff = scalar_view(),
+                               const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> applyHessian_21 not implemented.");
+  }
+
+  virtual void applyHessian_22(scalar_view & hess,
+                               const scalar_view v_coeff,
+                               const scalar_view l_coeff,
+                               const scalar_view u_coeff,
+                               const scalar_view z_coeff = scalar_view(),
+                               const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> applyHessian_22 not implemented.");
+  }
+
+  virtual void RieszMap_1(scalar_view &riesz) {
+    throw Exception::NotImplemented(">>> RieszMap_1 not implemented.");
+  }
+
+  virtual void RieszMap_2(scalar_view &riesz) {
+    throw Exception::NotImplemented(">>> RieszMap_2 not implemented.");
+  }
+
+  virtual std::vector<basis_ptr> getFields() = 0;
+  virtual std::vector<basis_ptr> getFields2() {
+    return getFields();
+  }
+
+  virtual void setCellNodes(const scalar_view &cellNodes,
+                            const std::vector<std::vector<scalar_view>> &bdryCellNodes,
+                            const std::vector<std::vector<std::vector<int>>> &bdryCellLocIds) = 0;
+
+  virtual void setFieldPattern(const std::vector<std::vector<int>> &fieldPattern) {}
+  virtual void setFieldPattern(const std::vector<std::vector<int>> &fieldPattern1,
+                               const std::vector<std::vector<int>> &fieldPattern2) {
+    setFieldPattern(fieldPattern1);
+  }
+
+  virtual void printData(std::string tag,
+                         const scalar_view u_coeff,
+                         const scalar_view z_coeff = scalar_view(),
+                         const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {}
+
+  virtual void printCellAverages(std::string tag,
+                                 const scalar_view u_coeff,
+                                 const scalar_view z_coeff = scalar_view(),
+                                 const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {}
+
+private:
+  Real time_;
+  std::vector<Real> param_;
+
+protected:
+  Real getTime(void) const {
+    return time_;
+  }
+
+  std::vector<Real> getParameter(void) const {
+    return param_;
+  }
+
+public:
+  void setTime(const Real time) {
+    time_ = time;
+  }
+
+  void setParameter(const std::vector<Real> &param) {
+    param_.assign(param.begin(),param.end());
+  }
+
+}; // PDE
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/TOOLS/pdeconstraintK.hpp b/packages/rol/example/PDE-OPT/TOOLS/pdeconstraintK.hpp
new file mode 100644
index 000000000000..6a9a311852bc
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/TOOLS/pdeconstraintK.hpp
@@ -0,0 +1,1149 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  pdeconstraint.hpp
+    \brief Defines the SimOpt constraint for the PDE-OPT examples.
+*/
+
+#ifndef ROL_PDEOPT_PDECONSTRAINTK_H
+#define ROL_PDEOPT_PDECONSTRAINTK_H
+
+#include "ROL_Constraint_SimOpt.hpp"
+#include "pdeK.hpp"
+#include "assemblerK.hpp"
+#include "solver.hpp"
+#include "pdevectorK.hpp"
+
+// Do not instantiate the template in this translation unit.
+extern template class Assembler<double,Kokkos::HostSpace>;
+
+//// Global Timers.
+#ifdef ROL_TIMERS
+namespace ROL {
+  namespace PDEOPT {
+    ROL::Ptr<Teuchos::Time> PDEConstraintSolverConstruct_Jacobian1        = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: PDE Constraint Solver Construction Time for Jacobian1");
+    ROL::Ptr<Teuchos::Time> PDEConstraintSolverConstruct_AdjointJacobian1 = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: PDE Constraint Solver Construction Time for Adjoint Jacobian1");
+    ROL::Ptr<Teuchos::Time> PDEConstraintSolverSolve_Jacobian1            = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: PDE Constraint Solver Solution Time for Jacobian1");
+    ROL::Ptr<Teuchos::Time> PDEConstraintSolverSolve_AdjointJacobian1     = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: PDE Constraint Solver Solution Time for Adjoint Jacobian1");
+    ROL::Ptr<Teuchos::Time> PDEConstraintApplyJacobian1                   = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: PDE Constraint Apply Jacobian1");
+    ROL::Ptr<Teuchos::Time> PDEConstraintApplyJacobian2                   = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: PDE Constraint Apply Jacobian2");
+    ROL::Ptr<Teuchos::Time> PDEConstraintApplyJacobian3                   = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: PDE Constraint Apply Jacobian3");
+    ROL::Ptr<Teuchos::Time> PDEConstraintApplyAdjointJacobian1            = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: PDE Constraint Apply Adjoint Jacobian1");
+    ROL::Ptr<Teuchos::Time> PDEConstraintApplyAdjointJacobian2            = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: PDE Constraint Apply Adjoint Jacobian2");
+    ROL::Ptr<Teuchos::Time> PDEConstraintApplyAdjointJacobian3            = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: PDE Constraint Apply Adjoint Jacobian3");
+    ROL::Ptr<Teuchos::Time> PDEConstraintApplyHessian11                   = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: PDE Constraint Apply Hessian11");
+    ROL::Ptr<Teuchos::Time> PDEConstraintApplyHessian12                   = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: PDE Constraint Apply Hessian12");
+    ROL::Ptr<Teuchos::Time> PDEConstraintApplyHessian13                   = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: PDE Constraint Apply Hessian13");
+    ROL::Ptr<Teuchos::Time> PDEConstraintApplyHessian21                   = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: PDE Constraint Apply Hessian21");
+    ROL::Ptr<Teuchos::Time> PDEConstraintApplyHessian22                   = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: PDE Constraint Apply Hessian22");
+    ROL::Ptr<Teuchos::Time> PDEConstraintApplyHessian23                   = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: PDE Constraint Apply Hessian23");
+    ROL::Ptr<Teuchos::Time> PDEConstraintApplyHessian31                   = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: PDE Constraint Apply Hessian31");
+    ROL::Ptr<Teuchos::Time> PDEConstraintApplyHessian32                   = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: PDE Constraint Apply Hessian32");
+    ROL::Ptr<Teuchos::Time> PDEConstraintApplyHessian33                   = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: PDE Constraint Apply Hessian33");
+  }
+}
+#endif
+
+
+template<class Real, class DeviceType>
+class PDE_Constraint : public ROL::Constraint_SimOpt<Real> {
+private:
+  const ROL::Ptr<PDE<Real,DeviceType>> pde_;
+  ROL::Ptr<Assembler<Real,DeviceType>> assembler_;
+  ROL::Ptr<Solver<Real>> solver_;
+
+  ROL::Ptr<Tpetra::MultiVector<>>           vecR_;
+  ROL::Ptr<Tpetra::CrsMatrix<>>             matJ1_;
+  ROL::Ptr<Tpetra::CrsMatrix<>>             matJ2_;
+  ROL::Ptr<Tpetra::MultiVector<>>           vecJ3_;
+  ROL::Ptr<Tpetra::CrsMatrix<>>             matH11_;
+  ROL::Ptr<Tpetra::CrsMatrix<>>             matH12_;
+  ROL::Ptr<Tpetra::MultiVector<>>           vecH13_;
+  ROL::Ptr<Tpetra::CrsMatrix<>>             matH21_;
+  ROL::Ptr<Tpetra::CrsMatrix<>>             matH22_;
+  ROL::Ptr<Tpetra::MultiVector<>>           vecH23_;
+  ROL::Ptr<Tpetra::MultiVector<>>           vecH31_;
+  ROL::Ptr<Tpetra::MultiVector<>>           vecH32_;
+  ROL::Ptr<std::vector<std::vector<Real>> > matH33_;
+
+  bool computeJ1_,  computeJ2_,  computeJ3_;
+  bool setSolver_;
+  bool isJ1zero_, isJ1notImplemented_;
+  bool isJ2zero_, isJ2notImplemented_;
+  bool isJ3zero_, isJ3notImplemented_;
+  bool isH11zero_, isH11notImplemented_;
+  bool isH12zero_, isH12notImplemented_;
+  bool isH13zero_, isH13notImplemented_;
+  bool isH21zero_, isH21notImplemented_;
+  bool isH22zero_, isH22notImplemented_;
+  bool isH23zero_, isH23notImplemented_;
+  bool isH31zero_, isH31notImplemented_;
+  bool isH32zero_, isH32notImplemented_;
+  bool isH33zero_, isH33notImplemented_;
+
+  // Assemble the PDE Jacobian.
+  void assembleJ1(const ROL::Vector<Real> &u, const ROL::Vector<Real> &z) {
+    if ( !isJ1zero_ && !isJ1notImplemented_ ) {
+      try {
+        if (computeJ1_) {
+          ROL::Ptr<const Tpetra::MultiVector<>> uf = getConstField(u);
+          ROL::Ptr<const Tpetra::MultiVector<>> zf = getConstField(z);
+          ROL::Ptr<const std::vector<Real>>     zp = getConstParameter(z);
+
+          assembler_->assemblePDEJacobian1(matJ1_,pde_,uf,zf,zp);
+          computeJ1_ = false;
+          setSolver_ = true;
+        }
+      }
+      catch ( Exception::Zero & ez ) {
+        isJ1zero_ = true;
+      }
+      catch ( Exception::NotImplemented & eni ) {
+        isJ1notImplemented_ = true;
+      }
+    }
+  }
+
+  // Assemble the PDE Jacobian.
+  void assembleJ2(const ROL::Vector<Real> &u, const ROL::Vector<Real> &z) {
+    if ( !isJ2zero_ && !isJ2notImplemented_ ) {
+      try {
+        if (computeJ2_) {
+          ROL::Ptr<const Tpetra::MultiVector<>> uf = getConstField(u);
+          ROL::Ptr<const Tpetra::MultiVector<>> zf = getConstField(z);
+          ROL::Ptr<const std::vector<Real>>     zp = getConstParameter(z);
+
+          assembler_->assemblePDEJacobian2(matJ2_,pde_,uf,zf,zp);
+          computeJ2_ = false;
+        }
+      }
+      catch ( Exception::Zero & ez ) {
+        isJ2zero_ = true;
+      }
+      catch ( Exception::NotImplemented & eni ) {
+        isJ2notImplemented_ = true;
+      }
+    }
+  }
+
+  // Assemble the PDE Jacobian.
+  void assembleJ3(const ROL::Vector<Real> &u, const ROL::Vector<Real> &z) {
+    if ( !isJ3zero_ && !isJ3notImplemented_ ) {
+      try {
+        if (computeJ3_) {
+          ROL::Ptr<const Tpetra::MultiVector<>> uf = getConstField(u);
+          ROL::Ptr<const Tpetra::MultiVector<>> zf = getConstField(z);
+          ROL::Ptr<const std::vector<Real>>     zp = getConstParameter(z);
+
+          assembler_->assemblePDEJacobian3(vecJ3_,pde_,uf,zf,zp);
+          computeJ3_ = false;
+        }
+      }
+      catch ( Exception::Zero & ez ) {
+        isJ3zero_ = true;
+      }
+      catch ( Exception::NotImplemented & eni ) {
+        isJ3notImplemented_ = true;
+      }
+    }
+  }
+
+  // Assemble the PDE adjoint Hessian.
+  void assembleH11(const ROL::Vector<Real> &w, const ROL::Vector<Real> &u, const ROL::Vector<Real> &z) {
+    if ( !isH11zero_ && !isH11notImplemented_ ) {
+      try {
+        ROL::Ptr<const Tpetra::MultiVector<>> wf = getConstField(w);
+        ROL::Ptr<const Tpetra::MultiVector<>> uf = getConstField(u);
+        ROL::Ptr<const Tpetra::MultiVector<>> zf = getConstField(z);
+        ROL::Ptr<const std::vector<Real>>     zp = getConstParameter(z);
+
+        assembler_->assemblePDEHessian11(matH11_,pde_,wf,uf,zf,zp);
+      }
+      catch ( Exception::Zero & ez ) {
+        isH11zero_ = true;
+      }
+      catch ( Exception::NotImplemented & eni ) {
+        isH11notImplemented_ = true;
+      }
+    }
+  }
+
+  // Assemble the PDE adjoint Hessian.
+  void assembleH12(const ROL::Vector<Real> &w, const ROL::Vector<Real> &u, const ROL::Vector<Real> &z) {
+    if ( !isH12zero_ && !isH12notImplemented_ ) {
+      try {
+        ROL::Ptr<const Tpetra::MultiVector<>> wf = getConstField(w);
+        ROL::Ptr<const Tpetra::MultiVector<>> uf = getConstField(u);
+        ROL::Ptr<const Tpetra::MultiVector<>> zf = getConstField(z);
+        ROL::Ptr<const std::vector<Real>>     zp = getConstParameter(z);
+
+        assembler_->assemblePDEHessian12(matH12_,pde_,wf,uf,zf,zp);
+      }
+      catch ( Exception::Zero & ez ) {
+        isH12zero_ = true;
+      }
+      catch ( Exception::NotImplemented & eni ) {
+        isH12notImplemented_ = true;
+      }
+    }
+  }
+
+  // Assemble the PDE adjoint Hessian.
+  void assembleH13(const ROL::Vector<Real> &w, const ROL::Vector<Real> &u, const ROL::Vector<Real> &z) {
+    if ( !isH13zero_ && !isH13notImplemented_ ) {
+      try {
+        ROL::Ptr<const Tpetra::MultiVector<>> wf = getConstField(w);
+        ROL::Ptr<const Tpetra::MultiVector<>> uf = getConstField(u);
+        ROL::Ptr<const Tpetra::MultiVector<>> zf = getConstField(z);
+        ROL::Ptr<const std::vector<Real>>     zp = getConstParameter(z);
+
+        assembler_->assemblePDEHessian13(vecH13_,pde_,wf,uf,zf,zp);
+      }
+      catch ( Exception::Zero & ez ) {
+        isH13zero_ = true;
+      }
+      catch ( Exception::NotImplemented & eni ) {
+        isH13notImplemented_ = true;
+      }
+    }
+  }
+
+  // Assemble the PDE adjoint Hessian.
+  void assembleH21(const ROL::Vector<Real> &w, const ROL::Vector<Real> &u, const ROL::Vector<Real> &z) {
+    if ( !isH21zero_ && !isH21notImplemented_ ) {
+      try {
+        ROL::Ptr<const Tpetra::MultiVector<>> wf = getConstField(w);
+        ROL::Ptr<const Tpetra::MultiVector<>> uf = getConstField(u);
+        ROL::Ptr<const Tpetra::MultiVector<>> zf = getConstField(z);
+        ROL::Ptr<const std::vector<Real>>     zp = getConstParameter(z);
+
+        assembler_->assemblePDEHessian21(matH21_,pde_,wf,uf,zf,zp);
+      }
+      catch ( Exception::Zero & ez ) {
+        isH21zero_ = true;
+      }
+      catch ( Exception::NotImplemented & eni ) {
+        isH21notImplemented_ = true;
+      }
+    }
+  }
+
+  // Assemble the PDE adjoint Hessian.
+  void assembleH31(const ROL::Vector<Real> &w, const ROL::Vector<Real> &u, const ROL::Vector<Real> &z) {
+    if ( !isH31zero_ && !isH31notImplemented_ ) {
+      try {
+        ROL::Ptr<const Tpetra::MultiVector<>> wf = getConstField(w);
+        ROL::Ptr<const Tpetra::MultiVector<>> uf = getConstField(u);
+        ROL::Ptr<const Tpetra::MultiVector<>> zf = getConstField(z);
+        ROL::Ptr<const std::vector<Real>>     zp = getConstParameter(z);
+
+        assembler_->assemblePDEHessian31(vecH31_,pde_,wf,uf,zf,zp);
+      }
+      catch ( Exception::Zero & ez ) {
+        isH31zero_ = true;
+      }
+      catch ( Exception::NotImplemented & eni ) {
+        isH31notImplemented_ = true;
+      }
+    }
+  }
+
+  // Assemble the PDE adjoint Hessian.
+  void assembleH22(const ROL::Vector<Real> &w, const ROL::Vector<Real> &u, const ROL::Vector<Real> &z) {
+    if ( !isH22zero_ && !isH22notImplemented_ ) {
+      try {
+        ROL::Ptr<const Tpetra::MultiVector<>> wf = getConstField(w);
+        ROL::Ptr<const Tpetra::MultiVector<>> uf = getConstField(u);
+        ROL::Ptr<const Tpetra::MultiVector<>> zf = getConstField(z);
+        ROL::Ptr<const std::vector<Real>>     zp = getConstParameter(z);
+
+        assembler_->assemblePDEHessian22(matH22_,pde_,wf,uf,zf,zp);
+      }
+      catch ( Exception::Zero & ez ) {
+        isH22zero_ = true;
+      }
+      catch ( Exception::NotImplemented & eni ) {
+        isH22notImplemented_ = true;
+      }
+    }
+  }
+
+  // Assemble the PDE adjoint Hessian.
+  void assembleH23(const ROL::Vector<Real> &w, const ROL::Vector<Real> &u, const ROL::Vector<Real> &z) {
+    if ( !isH23zero_ && !isH23notImplemented_ ) {
+      try {
+        ROL::Ptr<const Tpetra::MultiVector<>> wf = getConstField(w);
+        ROL::Ptr<const Tpetra::MultiVector<>> uf = getConstField(u);
+        ROL::Ptr<const Tpetra::MultiVector<>> zf = getConstField(z);
+        ROL::Ptr<const std::vector<Real>>     zp = getConstParameter(z);
+
+        assembler_->assemblePDEHessian23(vecH23_,pde_,wf,uf,zf,zp);
+      }
+      catch ( Exception::Zero & ez ) {
+        isH23zero_ = true;
+      }
+      catch ( Exception::NotImplemented & eni ) {
+        isH23notImplemented_ = true;
+      }
+    }
+  }
+
+  // Assemble the PDE adjoint Hessian.
+  void assembleH32(const ROL::Vector<Real> &w, const ROL::Vector<Real> &u, const ROL::Vector<Real> &z) {
+    if ( !isH32zero_ && !isH32notImplemented_ ) {
+      try {
+        ROL::Ptr<const Tpetra::MultiVector<>> wf = getConstField(w);
+        ROL::Ptr<const Tpetra::MultiVector<>> uf = getConstField(u);
+        ROL::Ptr<const Tpetra::MultiVector<>> zf = getConstField(z);
+        ROL::Ptr<const std::vector<Real>>     zp = getConstParameter(z);
+
+        assembler_->assemblePDEHessian32(vecH32_,pde_,wf,uf,zf,zp);
+      }
+      catch ( Exception::Zero & ez ) {
+        isH32zero_ = true;
+      }
+      catch ( Exception::NotImplemented & eni ) {
+        isH32notImplemented_ = true;
+      }
+    }
+  }
+
+  // Assemble the PDE adjoint Hessian.
+  void assembleH33(const ROL::Vector<Real> &w, const ROL::Vector<Real> &u, const ROL::Vector<Real> &z) {
+    if ( !isH33zero_ && !isH33notImplemented_ ) {
+      try {
+        ROL::Ptr<const Tpetra::MultiVector<>> wf = getConstField(w);
+        ROL::Ptr<const Tpetra::MultiVector<>> uf = getConstField(u);
+        ROL::Ptr<const Tpetra::MultiVector<>> zf = getConstField(z);
+        ROL::Ptr<const std::vector<Real>>     zp = getConstParameter(z);
+
+        assembler_->assemblePDEHessian33(matH33_,pde_,wf,uf,zf,zp);
+      }
+      catch ( Exception::Zero & ez ) {
+        isH33zero_ = true;
+      }
+      catch ( Exception::NotImplemented & eni ) {
+        isH33notImplemented_ = true;
+      }
+    }
+  }
+
+  // Application routines
+  void applyJacobian1(const ROL::Ptr<Tpetra::MultiVector<>> &Jv,
+                      const ROL::Ptr<const Tpetra::MultiVector<>> &v) {
+    #ifdef ROL_TIMERS
+      Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::PDEConstraintApplyJacobian1);
+    #endif
+    if (!isJ1notImplemented_) {
+      if (isJ1zero_) {
+        Jv->putScalar(static_cast<Real>(0));
+      }
+      else {
+        matJ1_->apply(*v,*Jv);
+      }
+    }
+  }
+
+  void applyAdjointJacobian1(const ROL::Ptr<Tpetra::MultiVector<>> &Jv,
+                             const ROL::Ptr<const Tpetra::MultiVector<>> &v) {
+    #ifdef ROL_TIMERS
+      Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::PDEConstraintApplyAdjointJacobian1);
+    #endif
+    if ( !isJ1notImplemented_ ) {
+      if ( isJ1zero_ ) {
+        Jv->putScalar(static_cast<Real>(0));
+      }
+      else {
+        matJ1_->apply(*v,*Jv,Teuchos::TRANS);
+      }
+    }
+  }
+
+  void applyJacobian2(const ROL::Ptr<Tpetra::MultiVector<>> &Jv,
+                      const ROL::Ptr<const Tpetra::MultiVector<>> &v) {
+    #ifdef ROL_TIMERS
+      Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::PDEConstraintApplyJacobian2);
+    #endif
+    if (v != ROL::nullPtr && !isJ2notImplemented_) {
+      if (isJ2zero_) {
+        Jv->putScalar(static_cast<Real>(0));
+      }
+      else {
+        matJ2_->apply(*v,*Jv);
+      }
+    }
+  }
+
+  void applyAdjointJacobian2(const ROL::Ptr<Tpetra::MultiVector<>> &Jv,
+                             const ROL::Ptr<const Tpetra::MultiVector<>> &v) {
+    #ifdef ROL_TIMERS
+      Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::PDEConstraintApplyAdjointJacobian2);
+    #endif
+    if ( Jv != ROL::nullPtr && !isJ2notImplemented_ ) {
+      if ( isJ2zero_ ) {
+        Jv->putScalar(static_cast<Real>(0));
+      }
+      else {
+        matJ2_->apply(*v,*Jv,Teuchos::TRANS);
+      }
+    }
+  }
+
+  void applyJacobian3(const ROL::Ptr<Tpetra::MultiVector<>> &Jv,
+                      const ROL::Ptr<const std::vector<Real>> &v,
+                      const bool zeroOut = true) const {
+    #ifdef ROL_TIMERS
+      Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::PDEConstraintApplyJacobian3);
+    #endif
+    if ( v != ROL::nullPtr && !isJ3notImplemented_ ) {
+      const size_t size = static_cast<size_t>(v->size());
+      if ( zeroOut ) { // || (isJ3zero_ && !zeroOut) ) {
+        Jv->putScalar(static_cast<Real>(0));
+      }
+      if ( !isJ3zero_ ) {
+        for (size_t i = 0; i < size; ++i) {
+          Teuchos::ArrayView<const size_t> col(&i,1);
+          Jv->update((*v)[i],*(vecJ3_->subView(col)),static_cast<Real>(1));
+        }
+      }
+    }
+  }
+
+  void applyAdjointJacobian3(const ROL::Ptr<std::vector<Real>> &Jv,
+                             const ROL::Ptr<const Tpetra::MultiVector<>> &v) const {
+    #ifdef ROL_TIMERS
+      Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::PDEConstraintApplyAdjointJacobian3);
+    #endif
+    if ( Jv != ROL::nullPtr && !isJ3notImplemented_ ) {
+      const size_t size = static_cast<size_t>(Jv->size());
+      if ( isJ3zero_ ) {
+        Jv->assign(size,static_cast<Real>(0));
+      }
+      else {
+        Teuchos::Array<Real> val(1,0);
+        for (size_t i = 0; i < size; ++i) {
+          Teuchos::ArrayView<const size_t> col(&i,1);
+          vecJ3_->subView(col)->dot(*v, val.view(0,1));
+          (*Jv)[i] = val[0];
+        }
+      }
+    }
+  }
+
+  void applyHessian11(const ROL::Ptr<Tpetra::MultiVector<>> &Hv,
+                      const ROL::Ptr<const Tpetra::MultiVector<>> &v) const {
+    #ifdef ROL_TIMERS
+      Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::PDEConstraintApplyHessian11);
+    #endif
+    if (!isH11notImplemented_) {
+      if ( isH11zero_ ) {
+        Hv->putScalar(static_cast<Real>(0));
+      }
+      else {
+        matH11_->apply(*v,*Hv);
+      }
+    }
+  }
+
+  void applyHessian12(const ROL::Ptr<Tpetra::MultiVector<>> &Hv,
+                      const ROL::Ptr<const Tpetra::MultiVector<>> &v) const {
+    #ifdef ROL_TIMERS
+      Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::PDEConstraintApplyHessian12);
+    #endif
+    if ( Hv != ROL::nullPtr && !isH12notImplemented_ ) {
+      if ( isH12zero_ ) {
+        Hv->putScalar(static_cast<Real>(0));
+      }
+      else {
+        matH12_->apply(*v,*Hv);
+      }
+    }
+  }
+
+  void applyHessian13(const ROL::Ptr<std::vector<Real>> &Hv,
+                      const ROL::Ptr<const Tpetra::MultiVector<>> &v) {
+    #ifdef ROL_TIMERS
+      Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::PDEConstraintApplyHessian13);
+    #endif
+    if ( Hv != ROL::nullPtr && !isH13notImplemented_ ) {
+      const size_t size = static_cast<size_t>(Hv->size());
+      if ( isH13zero_ ) {
+        Hv->assign(size,static_cast<Real>(0));
+      }
+      else {
+        Teuchos::Array<Real> val(1,0);
+        for (size_t i = 0; i < size; ++i) {
+          Teuchos::ArrayView<const size_t> col(&i,1);
+          vecH13_->subView(col)->dot(*v, val.view(0,1));
+          (*Hv)[i] += val[0];
+        }
+      }
+    }
+  }
+
+  void applyHessian21(const ROL::Ptr<Tpetra::MultiVector<>> &Hv,
+                      const ROL::Ptr<const Tpetra::MultiVector<>> &v) const {
+    #ifdef ROL_TIMERS
+      Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::PDEConstraintApplyHessian21);
+    #endif
+    if ( v != ROL::nullPtr && !isH21notImplemented_ ) {
+      if ( isH21zero_ ) {
+        Hv->putScalar(static_cast<Real>(0));
+      }
+      else {
+        matH21_->apply(*v,*Hv);
+      }
+    }
+  }
+
+  void applyHessian31(const ROL::Ptr<Tpetra::MultiVector<>> &Hv,
+                      const ROL::Ptr<const std::vector<Real>> &v,
+                      const bool zeroOut = true) const {
+    #ifdef ROL_TIMERS
+      Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::PDEConstraintApplyHessian31);
+    #endif
+    if ( v != ROL::nullPtr && !isH31notImplemented_ ) {
+      const size_t size = static_cast<size_t>(v->size());
+      if ( zeroOut ) { //|| (isH31zero_ && !zeroOut) ) {
+        Hv->putScalar(static_cast<Real>(0));
+      }
+      if ( !isH31zero_ ) {
+        for (size_t i = 0; i < size; ++i) {
+          Teuchos::ArrayView<const size_t> col(&i,1);
+          Hv->update((*v)[i],*(vecH31_->subView(col)),static_cast<Real>(1));
+        }
+      }
+    }
+  }
+
+  void applyHessian22(const ROL::Ptr<Tpetra::MultiVector<>> &Hv,
+                      const ROL::Ptr<const Tpetra::MultiVector<>> &v) const {
+    #ifdef ROL_TIMERS
+      Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::PDEConstraintApplyHessian22);
+    #endif
+    if ( v != ROL::nullPtr && !isH22notImplemented_ ) {
+      if ( isH22zero_ ) {
+        Hv->putScalar(static_cast<Real>(0));
+      }
+      else {
+        matH22_->apply(*v,*Hv);
+      }
+    }
+  }
+
+  void applyHessian23(const ROL::Ptr<std::vector<Real>> &Hv,
+                      const ROL::Ptr<const Tpetra::MultiVector<>> &v,
+                      const bool zeroOut = true) const {
+    #ifdef ROL_TIMERS
+      Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::PDEConstraintApplyHessian23);
+    #endif
+    if ( Hv != ROL::nullPtr && v != ROL::nullPtr && !isH23notImplemented_ ) {
+      const size_t size = static_cast<size_t>(Hv->size());
+      if ( zeroOut ) { // || (isH23zero_ && !zeroOut) ) {
+        Hv->assign(size,static_cast<Real>(0));
+      }
+      if ( !isH23zero_ ) {
+        Teuchos::Array<Real> val(1,0);
+        for (size_t i = 0; i < size; ++i) {
+          Teuchos::ArrayView<const size_t> col(&i,1);
+          vecH23_->subView(col)->dot(*v, val.view(0,1));
+          (*Hv)[i] += val[0];
+        }
+      }
+    }
+  }
+
+  void applyHessian32(const ROL::Ptr<Tpetra::MultiVector<>> &Hv,
+                      const ROL::Ptr<const std::vector<Real>> &v,
+                      const bool zeroOut = true) const {
+    #ifdef ROL_TIMERS
+      Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::PDEConstraintApplyHessian32);
+    #endif
+    if ( Hv != ROL::nullPtr && v != ROL::nullPtr && !isH32notImplemented_ ) {
+      const size_t size = static_cast<size_t>(v->size());
+      if ( zeroOut ) { // || (isH32zero_ && !zeroOut) ) {
+        Hv->putScalar(static_cast<Real>(0));
+      }
+      if ( !isH32zero_ ) {
+        for (size_t i = 0; i < size; ++i) {
+          Teuchos::ArrayView<const size_t> col(&i,1);
+          Hv->update((*v)[i],*(vecH32_->subView(col)),static_cast<Real>(1));
+        }
+      }
+    }
+  }
+
+  void applyHessian33(const ROL::Ptr<std::vector<Real>> &Hv,
+                      const ROL::Ptr<const std::vector<Real>> &v,
+                      const bool zeroOut = true ) const {
+    #ifdef ROL_TIMERS
+      Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::PDEConstraintApplyHessian33);
+    #endif
+    if ( Hv != ROL::nullPtr && !isH33notImplemented_ ) {
+      const size_t size = static_cast<size_t>(Hv->size());
+      if ( zeroOut ) { // || (isH33zero_ && !zeroOut) ) {
+        Hv->assign(size,static_cast<Real>(0));
+      }
+      if ( !isH33zero_ ) {
+        for (size_t i = 0; i < size; ++i) {
+          for (size_t j = 0; j < size; ++j) {
+            (*Hv)[i] += (*matH33_)[i][j]*(*v)[j];
+          }
+        }
+      }
+    }
+  }
+
+  // Set the Jacobian matrix in the solver object.
+  // Assumes assembleJ1 has already been called.
+  void setSolver(void) {
+    solver_->setA(matJ1_);
+    setSolver_= false;
+  }
+
+  // Solve using the Jacobian.
+  // Assumes assembleJ1 has already been called.
+  void solveForward(ROL::Ptr<Tpetra::MultiVector<>> &x,
+                    const ROL::Ptr<const Tpetra::MultiVector<>> &b) {
+    if (setSolver_) {
+      #ifdef ROL_TIMERS
+        Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::PDEConstraintSolverConstruct_Jacobian1);
+      #endif
+      setSolver();
+    }
+
+    #ifdef ROL_TIMERS
+      Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::PDEConstraintSolverSolve_Jacobian1);
+    #endif
+    solver_->solve(x,b,false);
+  }
+
+  // Solve using the adjoint Jacobian.
+  // Assumes assembleJ1 has already been called.
+  void solveAdjoint(ROL::Ptr<Tpetra::MultiVector<>> &x,
+                    const ROL::Ptr<const Tpetra::MultiVector<>> &b) {
+    if (setSolver_) {
+      #ifdef ROL_TIMERS
+        Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::PDEConstraintSolverConstruct_AdjointJacobian1);
+      #endif
+      setSolver();
+    }
+
+    #ifdef ROL_TIMERS
+      Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::PDEConstraintSolverSolve_AdjointJacobian1);
+    #endif
+    solver_->solve(x,b,true);
+  }
+
+public:
+  PDE_Constraint(const ROL::Ptr<PDE<Real,DeviceType>> &pde,
+                 const ROL::Ptr<MeshManager<Real,DeviceType>> &meshMgr,
+                 const ROL::Ptr<const Teuchos::Comm<int>> &comm,
+                 Teuchos::ParameterList &parlist,
+                 std::ostream &outStream = std::cout)
+    : ROL::Constraint_SimOpt<Real>(),
+      pde_(pde),
+      computeJ1_(true),  computeJ2_(true),  computeJ3_(true),
+      setSolver_(true),
+      isJ1zero_(false),  isJ1notImplemented_(false),
+      isJ2zero_(false),  isJ2notImplemented_(false),
+      isJ3zero_(false),  isJ3notImplemented_(false),
+      isH11zero_(false),  isH11notImplemented_(false),
+      isH12zero_(false),  isH12notImplemented_(false),
+      isH13zero_(false),  isH13notImplemented_(false),
+      isH21zero_(false),  isH21notImplemented_(false),
+      isH22zero_(false),  isH22notImplemented_(false),
+      isH23zero_(false),  isH23notImplemented_(false),
+      isH31zero_(false),  isH31notImplemented_(false),
+      isH32zero_(false),  isH32notImplemented_(false),
+      isH33zero_(false),  isH33notImplemented_(false) {
+    assembler_ = ROL::makePtr<Assembler<Real,DeviceType>>(pde_->getFields(),pde_->getFields2(),meshMgr,comm,parlist,outStream);
+    assembler_->setCellNodes(*pde_);
+    solver_ = ROL::makePtr<Solver<Real>>(parlist.sublist("Solver"));
+  }
+
+  PDE_Constraint(const ROL::Ptr<PDE<Real,DeviceType>>       &pde,
+                 const ROL::Ptr<Assembler<Real,DeviceType>> &assembler,
+                 const ROL::Ptr<Solver<Real>>    &solver)
+    : ROL::Constraint_SimOpt<Real>(),
+      pde_(pde), assembler_(assembler), solver_(solver),
+      computeJ1_(true),  computeJ2_(true),  computeJ3_(true),
+      setSolver_(true),
+      isJ1zero_(false),  isJ1notImplemented_(false),
+      isJ2zero_(false),  isJ2notImplemented_(false),
+      isJ3zero_(false),  isJ3notImplemented_(false),
+      isH11zero_(false),  isH11notImplemented_(false),
+      isH12zero_(false),  isH12notImplemented_(false),
+      isH13zero_(false),  isH13notImplemented_(false),
+      isH21zero_(false),  isH21notImplemented_(false),
+      isH22zero_(false),  isH22notImplemented_(false),
+      isH23zero_(false),  isH23notImplemented_(false),
+      isH31zero_(false),  isH31notImplemented_(false),
+      isH32zero_(false),  isH32notImplemented_(false),
+      isH33zero_(false),  isH33notImplemented_(false) {
+    assembler_->setCellNodes(*pde_);
+  }
+
+  void setParameter(const std::vector<Real> &param) {
+    computeJ1_  = true; computeJ2_  = true; computeJ3_  = true;
+    setSolver_  = true;
+    ROL::Constraint_SimOpt<Real>::setParameter(param);
+    pde_->setParameter(param);
+  }
+
+  const ROL::Ptr<Assembler<Real,DeviceType>> getAssembler(void) const {
+    return assembler_;
+  }
+
+  const ROL::Ptr<PDE<Real,DeviceType>> getPDE(void) const {
+    return pde_;
+  }
+
+  using ROL::Constraint_SimOpt<Real>::update_1;
+  void update_1(const ROL::Vector<Real> &u, bool flag = true, int iter = -1) {
+    computeJ1_ = (flag ? true : computeJ1_);
+    computeJ2_ = (flag ? true : computeJ2_);
+    computeJ3_ = (flag ? true : computeJ3_);
+  }
+
+  using ROL::Constraint_SimOpt<Real>::update_2;
+  void update_2(const ROL::Vector<Real> &z, bool flag = true, int iter = -1) {
+    computeJ1_ = (flag ? true : computeJ1_);
+    computeJ2_ = (flag ? true : computeJ2_);
+    computeJ3_ = (flag ? true : computeJ3_);
+  }
+
+  using ROL::Constraint_SimOpt<Real>::update;
+  void update(const ROL::Vector<Real> &u, const ROL::Vector<Real> &z, bool flag = true, int iter = -1) {
+    update_1(u,flag,iter);
+    update_2(z,flag,iter);
+  }
+
+  // Do something smarter here
+  void update_1(const ROL::Vector<Real> &u, ROL::UpdateType type, int iter = -1) {
+    computeJ1_ = true;
+    computeJ2_ = true;
+    computeJ3_ = true;
+  }
+
+  void update_2(const ROL::Vector<Real> &z, ROL::UpdateType type, int iter = -1) {
+    computeJ1_ = true;
+    computeJ2_ = true;
+    computeJ3_ = true;
+  }
+
+  void update(const ROL::Vector<Real> &u, const ROL::Vector<Real> &z, ROL::UpdateType type, int iter = -1) {
+    update_1(u,type,iter);
+    update_2(z,type,iter);
+  }
+
+  void solve_update(const ROL::Vector<Real> &u, const ROL::Vector<Real> &z, ROL::UpdateType type, int iter = -1) {
+    update(u,z,type,iter);
+  }
+
+  void solve(ROL::Vector<Real> &c, ROL::Vector<Real> &u, const ROL::Vector<Real> &z, Real &tol) {
+    ROL::Constraint_SimOpt<Real>::solve(c,u,z,tol);
+  }
+
+  using ROL::Constraint_SimOpt<Real>::value;
+  void value(ROL::Vector<Real> &c, const ROL::Vector<Real> &u, const ROL::Vector<Real> &z, Real &tol) {
+    ROL::Ptr<Tpetra::MultiVector<>>       cf = getField(c);
+    ROL::Ptr<const Tpetra::MultiVector<>> uf = getConstField(u);
+    ROL::Ptr<const Tpetra::MultiVector<>> zf = getConstField(z);
+    ROL::Ptr<const std::vector<Real>>     zp = getConstParameter(z);
+
+    assembler_->assemblePDEResidual(vecR_,pde_,uf,zf,zp);
+    cf->scale(static_cast<Real>(1),*vecR_);
+  }
+
+  void applyJacobian_1(ROL::Vector<Real> &jv,
+                 const ROL::Vector<Real> &v,
+                 const ROL::Vector<Real> &u,
+                 const ROL::Vector<Real> &z, Real &tol) {
+    assembleJ1(u,z);
+    if (isJ1notImplemented_) {
+      ROL::Constraint_SimOpt<Real>::applyJacobian_1(jv,v,u,z,tol);
+    }
+    else {
+      ROL::Ptr<Tpetra::MultiVector<>>      jvf = getField(jv);
+      ROL::Ptr<const Tpetra::MultiVector<>> vf = getConstField(v);
+      applyJacobian1(jvf,vf);
+    }
+  }
+
+
+  void applyJacobian_2(ROL::Vector<Real> &jv,
+                 const ROL::Vector<Real> &v,
+                 const ROL::Vector<Real> &u,
+                 const ROL::Vector<Real> &z, Real &tol) {
+    assembleJ2(u,z);
+    assembleJ3(u,z);
+    ROL::Ptr<const Tpetra::MultiVector<>> vf = getConstField(v);
+    ROL::Ptr<const std::vector<Real>>     vp = getConstParameter(v);
+    bool useFD2 = (isJ2notImplemented_ && vf != ROL::nullPtr);
+    bool useFD3 = (isJ3notImplemented_ && vp != ROL::nullPtr);
+    jv.zero();
+    if (useFD2 || useFD3) {
+      ROL::Constraint_SimOpt<Real>::applyJacobian_2(jv,v,u,z,tol);
+    }
+    else {
+      ROL::Ptr<Tpetra::MultiVector<>> jvf = getField(jv);
+      bool zeroOut = true;
+      if (!useFD2) {
+        applyJacobian2(jvf,vf);
+        zeroOut = (vf == ROL::nullPtr);
+      }
+      if (!useFD3) {
+        applyJacobian3(jvf,vp,zeroOut);
+      }
+    }
+  }
+
+
+  void applyAdjointJacobian_1(ROL::Vector<Real> &ajv,
+                        const ROL::Vector<Real> &v,
+                        const ROL::Vector<Real> &u,
+                        const ROL::Vector<Real> &z, Real &tol) {
+    assembleJ1(u,z);
+    if (isJ1notImplemented_) {
+      ROL::Constraint_SimOpt<Real>::applyAdjointJacobian_1(ajv,v,u,z,tol);
+    }
+    else {
+      ROL::Ptr<Tpetra::MultiVector<>>     ajvf = getField(ajv);
+      ROL::Ptr<const Tpetra::MultiVector<>> vf = getConstField(v);
+      applyAdjointJacobian1(ajvf,vf);
+    }
+  }
+
+
+  void applyAdjointJacobian_2(ROL::Vector<Real> &ajv,
+                        const ROL::Vector<Real> &v,
+                        const ROL::Vector<Real> &u,
+                        const ROL::Vector<Real> &z, Real &tol) {
+    assembleJ2(u,z);
+    assembleJ3(u,z);
+    ROL::Ptr<Tpetra::MultiVector<>> ajvf = getField(ajv);
+    ROL::Ptr<std::vector<Real>>     ajvp = getParameter(ajv);
+    bool useFD2 = (isJ2notImplemented_ && ajvf != ROL::nullPtr);
+    bool useFD3 = (isJ3notImplemented_ && ajvp != ROL::nullPtr);
+    if (useFD2 || useFD3) {
+      ROL::Constraint_SimOpt<Real>::applyAdjointJacobian_2(ajv,v,u,z,tol);
+    }
+    else {
+      ROL::Ptr<const Tpetra::MultiVector<>> vf = getConstField(v);
+      if (!useFD2) {
+        applyAdjointJacobian2(ajvf,vf);
+      }
+      if (!useFD3) {
+        applyAdjointJacobian3(ajvp,vf);
+      }
+    }
+  }
+
+
+  void applyAdjointHessian_11(ROL::Vector<Real> &ahwv,
+                        const ROL::Vector<Real> &w,
+                        const ROL::Vector<Real> &v,
+                        const ROL::Vector<Real> &u,
+                        const ROL::Vector<Real> &z, Real &tol) {
+    assembleH11(w,u,z);
+    if (isH11notImplemented_) {
+      ROL::Constraint_SimOpt<Real>::applyAdjointHessian_11(ahwv,w,v,u,z,tol);
+    }
+    else {
+      ROL::Ptr<Tpetra::MultiVector<>>    ahwvf = getField(ahwv);
+      ROL::Ptr<const Tpetra::MultiVector<>> vf = getConstField(v);
+      applyHessian11(ahwvf,vf);
+    }
+  }
+
+
+  void applyAdjointHessian_12(ROL::Vector<Real> &ahwv,
+                        const ROL::Vector<Real> &w,
+                        const ROL::Vector<Real> &v,
+                        const ROL::Vector<Real> &u,
+                        const ROL::Vector<Real> &z, Real &tol) {
+    assembleH12(w,u,z);
+    assembleH13(w,u,z);
+    ROL::Ptr<std::vector<Real>>     ahwvp = getParameter(ahwv);
+    ROL::Ptr<Tpetra::MultiVector<>> ahwvf = getField(ahwv);
+    bool useFD2 = (isH12notImplemented_ && ahwvf != ROL::nullPtr);
+    bool useFD3 = (isH13notImplemented_ && ahwvp != ROL::nullPtr);
+    if (useFD2 || useFD3) {
+      ROL::Constraint_SimOpt<Real>::applyAdjointHessian_12(ahwv,w,v,u,z,tol);
+    }
+    else {
+      ROL::Ptr<const Tpetra::MultiVector<>> vf = getConstField(v);
+      if (!useFD2) {
+        applyHessian12(ahwvf,vf);
+      }
+      if (!useFD3) {
+        applyHessian13(ahwvp,vf);
+      }
+    }
+  }
+
+
+  void applyAdjointHessian_21(ROL::Vector<Real> &ahwv,
+                        const ROL::Vector<Real> &w,
+                        const ROL::Vector<Real> &v,
+                        const ROL::Vector<Real> &u,
+                        const ROL::Vector<Real> &z, Real &tol) {
+    assembleH21(w,u,z);
+    assembleH31(w,u,z);
+    ROL::Ptr<const Tpetra::MultiVector<>> vf = getConstField(v);
+    ROL::Ptr<const std::vector<Real>>     vp = getConstParameter(v);
+    bool useFD2 = (isH21notImplemented_ && vf != ROL::nullPtr);
+    bool useFD3 = (isH31notImplemented_ && vp != ROL::nullPtr);
+    ahwv.zero();
+    if (useFD2 || useFD3) {
+      ROL::Constraint_SimOpt<Real>::applyAdjointHessian_21(ahwv,w,v,u,z,tol);
+    }
+    else {
+      ROL::Ptr<Tpetra::MultiVector<>> ahwvf = getField(ahwv);
+      bool zeroOut = true;
+      if (!useFD2) {
+        applyHessian21(ahwvf,vf);
+        zeroOut = (vf == ROL::nullPtr);
+      }
+      if (!useFD3) {
+        applyHessian31(ahwvf,vp,zeroOut);
+      }
+    }
+  }
+
+
+  void applyAdjointHessian_22(ROL::Vector<Real> &ahwv,
+                        const ROL::Vector<Real> &w,
+                        const ROL::Vector<Real> &v,
+                        const ROL::Vector<Real> &u,
+                        const ROL::Vector<Real> &z, Real &tol) {
+    assembleH22(w,u,z);
+    assembleH23(w,u,z);
+    assembleH32(w,u,z);
+    assembleH33(w,u,z);
+    ahwv.zero();
+
+    ROL::Ptr<Tpetra::MultiVector<>>    ahwvf = getField(ahwv);
+    ROL::Ptr<std::vector<Real>>        ahwvp = getParameter(ahwv);
+    ROL::Ptr<const Tpetra::MultiVector<>> vf = getConstField(v);
+    ROL::Ptr<const std::vector<Real>>     vp = getConstParameter(v);
+    bool useFD22 = (isH22notImplemented_ && vf != ROL::nullPtr);
+    bool useFD23 = (isH23notImplemented_ && vp != ROL::nullPtr);
+    bool useFD32 = (isH32notImplemented_ && vp != ROL::nullPtr);
+    bool useFD33 = (isH33notImplemented_ && vp != ROL::nullPtr);
+    if (useFD22 || useFD23 || useFD32 || useFD33) {
+      ROL::Constraint_SimOpt<Real>::applyAdjointHessian_22(ahwv,w,v,u,z,tol);
+    }
+    else {
+      bool zeroOut = true;
+      if (!useFD22) {
+        applyHessian22(ahwvf,vf);
+        zeroOut = (vf == ROL::nullPtr);
+      }
+      if (!useFD23) {
+        applyHessian23(ahwvp,vf,zeroOut);
+      }
+      if (!useFD32) {
+        applyHessian32(ahwvf,vp,zeroOut);
+        zeroOut = (vf == ROL::nullPtr);
+      }
+      if (!useFD33) {
+        applyHessian33(ahwvp,vp,zeroOut);
+      }
+    }
+  }
+
+
+  void applyInverseJacobian_1(ROL::Vector<Real> &ijv,
+                        const ROL::Vector<Real> &v,
+                        const ROL::Vector<Real> &u,
+                        const ROL::Vector<Real> &z, Real &tol) {
+    assembleJ1(u,z);
+    if (isJ1notImplemented_) {
+      ROL::Constraint_SimOpt<Real>::applyInverseJacobian_1(ijv,v,u,z,tol);
+    }
+    else {
+      if (isJ1zero_) {
+        throw Exception::Zero(">>> (PDE_Constraint::applyInverseJacobian_1): Jacobian is zero!");
+      }
+      else {
+        ROL::Ptr<Tpetra::MultiVector<>>     ijvf = getField(ijv);
+        ROL::Ptr<const Tpetra::MultiVector<>> vf = getConstField(v);
+        solveForward(ijvf,vf);
+      }
+    }
+  }
+
+
+  void applyInverseAdjointJacobian_1(ROL::Vector<Real> &iajv,
+                               const ROL::Vector<Real> &v,
+                               const ROL::Vector<Real> &u,
+                               const ROL::Vector<Real> &z, Real &tol) {
+    assembleJ1(u,z);
+    if (isJ1notImplemented_) {
+      ROL::Constraint_SimOpt<Real>::applyInverseAdjointJacobian_1(iajv,v,u,z,tol);
+    }
+    else {
+      if (isJ1zero_) {
+        throw Exception::Zero(">>> (PDE_Constraint::applyAdjointInverseJacobian_1): Jacobian is zero!");
+      }
+      else {
+        ROL::Ptr<Tpetra::MultiVector<>>    iajvf = getField(iajv);
+        ROL::Ptr<const Tpetra::MultiVector<>> vf = getConstField(v);
+        solveAdjoint(iajvf,vf);
+      }
+    }
+  }
+
+
+  /***************************************************************************/
+  /* Output routines.                                                        */
+  /***************************************************************************/
+  void printMeshData(std::ostream &outStream) const {
+    assembler_->printMeshData(outStream);
+  }
+
+  void outputTpetraData() const {
+    Tpetra::MatrixMarket::Writer< Tpetra::CrsMatrix<>> matWriter;
+    if (matJ1_ != ROL::nullPtr) {
+      matWriter.writeSparseFile("jacobian1.txt", matJ1_);
+    }
+    else {
+      std::ofstream emptyfile;
+      emptyfile.open("jacobian1.txt");
+      emptyfile.close();
+    }
+    if (matJ2_ != ROL::nullPtr) {
+      matWriter.writeSparseFile("jacobian2.txt", matJ2_);
+    }
+    else {
+      std::ofstream emptyfile;
+      emptyfile.open("jacobian2.txt");
+      emptyfile.close();
+    }
+    if (vecR_ != ROL::nullPtr) {
+      matWriter.writeDenseFile("residual.txt", vecR_);
+    }
+    else {
+      std::ofstream emptyfile;
+      emptyfile.open("residual.txt");
+      emptyfile.close();
+    }
+  }
+
+  void outputTpetraVector(const ROL::Ptr<const Tpetra::MultiVector<>> &vec,
+                          const std::string &filename) const {
+    assembler_->outputTpetraVector(vec, filename);
+  }
+
+  void inputTpetraVector(ROL::Ptr<Tpetra::MultiVector<>> &vec,
+                         const std::string &filename) const {
+    assembler_->inputTpetraVector(vec, filename);
+  }
+  /***************************************************************************/
+  /* End of output routines.                                                 */
+  /***************************************************************************/
+
+private: // Vector accessor functions
+
+  ROL::Ptr<const Tpetra::MultiVector<>> getConstField(const ROL::Vector<Real> &x) const {
+    ROL::Ptr<const Tpetra::MultiVector<>> xp;
+    try {
+      xp = dynamic_cast<const ROL::TpetraMultiVector<Real>&>(x).getVector();
+    }
+    catch (std::exception &e) {
+      try {
+        ROL::Ptr<const ROL::TpetraMultiVector<Real>> xvec
+          = dynamic_cast<const PDE_OptVector<Real>&>(x).getField();
+        if (xvec == ROL::nullPtr) {
+          xp = ROL::nullPtr;
+        }
+        else {
+          xp = xvec->getVector();
+        }
+      }
+      catch (std::exception &ee) {
+        xp = ROL::nullPtr;
+      }
+    }
+    return xp;
+  }
+
+  ROL::Ptr<Tpetra::MultiVector<>> getField(ROL::Vector<Real> &x) const {
+    ROL::Ptr<Tpetra::MultiVector<>> xp;
+    try {
+      xp = dynamic_cast<ROL::TpetraMultiVector<Real>&>(x).getVector();
+    }
+    catch (std::exception &e) {
+      try {
+        ROL::Ptr<ROL::TpetraMultiVector<Real>> xvec
+          = dynamic_cast<PDE_OptVector<Real>&>(x).getField();
+        if ( xvec == ROL::nullPtr ) {
+          xp = ROL::nullPtr;
+        }
+        else {
+          xp = xvec->getVector();
+        }
+      }
+      catch (std::exception &ee) {
+        xp = ROL::nullPtr;
+      }
+    }
+    return xp;
+  }
+
+  ROL::Ptr<const std::vector<Real>> getConstParameter(const ROL::Vector<Real> &x) const {
+    ROL::Ptr<const std::vector<Real>> xp;
+    try {
+      xp = dynamic_cast<const ROL::StdVector<Real>&>(x).getVector();
+    }
+    catch (std::exception &e) {
+      try {
+        ROL::Ptr<const ROL::StdVector<Real>> xvec
+          = dynamic_cast<const PDE_OptVector<Real>&>(x).getParameter();
+        if ( xvec == ROL::nullPtr ) {
+          xp = ROL::nullPtr;
+        }
+        else {
+          xp = xvec->getVector();
+        }
+      }
+      catch (std::exception &ee) {
+        xp = ROL::nullPtr;
+      }
+    }
+    return xp;
+  }
+
+  ROL::Ptr<std::vector<Real>> getParameter(ROL::Vector<Real> &x) const {
+    ROL::Ptr<std::vector<Real>> xp;
+    try {
+      xp = dynamic_cast<ROL::StdVector<Real>&>(x).getVector();
+    }
+    catch (std::exception &e) {
+      try {
+        ROL::Ptr<ROL::StdVector<Real>> xvec
+          = dynamic_cast<PDE_OptVector<Real>&>(x).getParameter();
+        if ( xvec == ROL::nullPtr ) {
+          xp = ROL::nullPtr;
+        }
+        else {
+          xp = xvec->getVector();
+        }
+      }
+      catch (std::exception &ee) {
+        xp = ROL::nullPtr;
+      }
+    }
+    return xp;
+  }
+};
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/TOOLS/pdeobjectiveK.hpp b/packages/rol/example/PDE-OPT/TOOLS/pdeobjectiveK.hpp
new file mode 100644
index 000000000000..8cf742b6ab87
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/TOOLS/pdeobjectiveK.hpp
@@ -0,0 +1,128 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+#ifndef PDE_PDEOBJECTIVE_HPP
+#define PDE_PDEOBJECTIVE_HPP
+
+#include "ROL_Objective_SimOpt.hpp"
+#include "ROL_CompositeObjective_SimOpt.hpp"
+#include "ROL_LinearCombinationObjective_SimOpt.hpp"
+#include "ROL_StdObjective.hpp"
+#include "integralobjectiveK.hpp"
+#include "qoiK.hpp"
+#include "assemblerK.hpp"
+
+// Do not instantiate the template in this translation unit.
+extern template class Assembler<double,Kokkos::HostSpace>;
+
+template <class Real, class DeviceType>
+class PDE_Objective : public ROL::Objective_SimOpt<Real> {
+public:
+  using QoI_type = QoI<Real, DeviceType>;
+  using Assembler_type = Assembler<Real, DeviceType>;
+  using IntegralObjective_type = IntegralObjective<Real,DeviceType>;
+private:
+  std::vector<ROL::Ptr<QoI_type> > qoi_vec_;
+  const ROL::Ptr<ROL::StdObjective<Real> > std_obj_;
+  const ROL::Ptr<Assembler_type> assembler_;
+
+  std::vector<ROL::Ptr<ROL::Objective_SimOpt<Real> > > obj_vec_;
+  ROL::Ptr<ROL::Objective_SimOpt<Real> > obj_;
+
+public:
+  PDE_Objective(const std::vector<ROL::Ptr<QoI_type> > &qoi_vec,
+                const ROL::Ptr<ROL::StdObjective<Real> > &std_obj,
+                const ROL::Ptr<Assembler_type> &assembler)
+    : qoi_vec_(qoi_vec), std_obj_(std_obj), assembler_(assembler) {
+    int size = qoi_vec_.size();
+    obj_vec_.clear(); obj_vec_.resize(size,ROL::nullPtr);
+    for (int i = 0; i < size; ++i) {
+      obj_vec_[i] = ROL::makePtr<IntegralObjective_type>(qoi_vec[i],assembler);
+    }
+    obj_ = ROL::makePtr<ROL::CompositeObjective_SimOpt<Real>>(obj_vec_,std_obj_);
+  }
+
+  PDE_Objective(const std::vector<ROL::Ptr<QoI_type> > &qoi_vec,
+                const ROL::Ptr<Assembler_type> &assembler)
+    : qoi_vec_(qoi_vec), std_obj_(ROL::nullPtr), assembler_(assembler) {
+    int size = qoi_vec_.size();
+    obj_vec_.clear(); obj_vec_.resize(size,ROL::nullPtr);
+    for (int i = 0; i < size; ++i) {
+      obj_vec_[i] = ROL::makePtr<IntegralObjective_type>(qoi_vec[i],assembler);
+    }
+    obj_ = ROL::makePtr<ROL::LinearCombinationObjective_SimOpt<Real>>(obj_vec_);
+  }
+
+  PDE_Objective(const std::vector<ROL::Ptr<QoI_type> > &qoi_vec,
+                const std::vector<Real> &weights,
+                const ROL::Ptr<Assembler_type> &assembler)
+    : qoi_vec_(qoi_vec), std_obj_(ROL::nullPtr), assembler_(assembler) {
+    int size = qoi_vec_.size();
+    obj_vec_.clear(); obj_vec_.resize(size,ROL::nullPtr);
+    for (int i = 0; i < size; ++i) {
+      obj_vec_[i] = ROL::makePtr<IntegralObjective_type>(qoi_vec[i],assembler);
+    }
+    obj_ = ROL::makePtr<ROL::LinearCombinationObjective_SimOpt<Real>>(weights,obj_vec_);
+  }
+
+  PDE_Objective(const ROL::Ptr<QoI_type> &qoi,
+                const ROL::Ptr<Assembler_type> &assembler)
+    : std_obj_(ROL::nullPtr), assembler_(assembler) {
+    int size = 1;
+    qoi_vec_.clear(); qoi_vec_.resize(size,ROL::nullPtr);
+    obj_vec_.clear(); obj_vec_.resize(size,ROL::nullPtr);
+    qoi_vec_[0] = qoi;
+    obj_vec_[0] = ROL::makePtr<IntegralObjective_type>(qoi_vec_[0],assembler);
+    obj_ = obj_vec_[0];
+  }
+
+  void setParameter(const std::vector<Real> &param) {
+    ROL::Objective_SimOpt<Real>::setParameter(param);
+    obj_->setParameter(param);
+  }
+
+  void update( const ROL::Vector<Real> &u, const ROL::Vector<Real> &z, bool flag = true, int iter = -1 ) override {
+    obj_->update(u,z,flag,iter);
+  }
+
+  void update( const ROL::Vector<Real> &u, const ROL::Vector<Real> &z, ROL::UpdateType type, int iter = -1 ) override {
+    obj_->update(u,z,type,iter);
+  }
+
+  Real value( const ROL::Vector<Real> &u, const ROL::Vector<Real> &z, Real &tol ) override {
+    return obj_->value(u,z,tol);
+  }
+
+  void gradient_1( ROL::Vector<Real> &g, const ROL::Vector<Real> &u, const ROL::Vector<Real> &z, Real &tol ) override {
+    obj_->gradient_1(g,u,z,tol);
+  }
+
+  void gradient_2( ROL::Vector<Real> &g, const ROL::Vector<Real> &u, const ROL::Vector<Real> &z, Real &tol ) override {
+    obj_->gradient_2(g,u,z,tol);
+  }
+
+  void hessVec_11( ROL::Vector<Real> &hv, const ROL::Vector<Real> &v, const ROL::Vector<Real> &u, const ROL::Vector<Real> &z, Real &tol ) override {
+    obj_->hessVec_11(hv,v,u,z,tol);
+  }
+
+  void hessVec_12( ROL::Vector<Real> &hv, const ROL::Vector<Real> &v, const ROL::Vector<Real> &u, const ROL::Vector<Real> &z, Real &tol ) override {
+    obj_->hessVec_12(hv,v,u,z,tol);
+  }
+
+  void hessVec_21( ROL::Vector<Real> &hv, const ROL::Vector<Real> &v, const ROL::Vector<Real> &u, const ROL::Vector<Real> &z, Real &tol ) override {
+    obj_->hessVec_21(hv,v,u,z,tol);
+  }
+
+  void hessVec_22( ROL::Vector<Real> &hv, const ROL::Vector<Real> &v, const ROL::Vector<Real> &u, const ROL::Vector<Real> &z, Real &tol ) override {
+    obj_->hessVec_22(hv,v,u,z,tol);
+  }
+
+}; // class PDE_Objective
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/TOOLS/pdevector.hpp b/packages/rol/example/PDE-OPT/TOOLS/pdevector.hpp
index ba812c9843d0..365bbc1e8aec 100644
--- a/packages/rol/example/PDE-OPT/TOOLS/pdevector.hpp
+++ b/packages/rol/example/PDE-OPT/TOOLS/pdevector.hpp
@@ -927,7 +927,7 @@ class PDE_OptVector : public ROL::Vector<Real> {
         e1 = vec1_->basis(i);
       }
       else {
-        e1->zero();
+        e1 = vec1_->clone(); e1->zero();
       }
       e = ROL::makePtr<PDE_OptVector>(
         ROL::dynamicPtrCast<ROL::TpetraMultiVector<Real> >(e1));
@@ -939,7 +939,7 @@ class PDE_OptVector : public ROL::Vector<Real> {
         e2 = vec2_->basis(i);
       }
       else {
-        e2->zero();
+        e2 = vec2_->clone(); e2->zero();
       }
       e = ROL::makePtr<PDE_OptVector>(
         ROL::dynamicPtrCast<ROL::StdVector<Real> >(e2));
diff --git a/packages/rol/example/PDE-OPT/TOOLS/pdevectorK.hpp b/packages/rol/example/PDE-OPT/TOOLS/pdevectorK.hpp
new file mode 100644
index 000000000000..1fb8fe0ccf8d
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/TOOLS/pdevectorK.hpp
@@ -0,0 +1,1055 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+#ifndef ROL_PDEOPT_PDEVECTORK_HPP
+#define ROL_PDEOPT_PDEVECTORK_HPP
+
+#include "ROL_TpetraMultiVector.hpp"
+#include "ROL_StdVector.hpp"
+
+#include "assemblerK.hpp"
+#include "solver.hpp"
+
+// Do not instantiate the template in this translation unit.
+extern template class Assembler<double, Kokkos::HostSpace>;
+
+//// Global Timers.
+#ifdef ROL_TIMERS
+namespace ROL {
+  namespace PDEOPT {
+    ROL::Ptr<Teuchos::Time> PDEVectorSimRieszConstruct    = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: PDE Vector Sim Riesz Construction Time");
+    ROL::Ptr<Teuchos::Time> PDEVectorSimRieszApply        = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: PDE Vector Sim Riesz Application Time");
+    ROL::Ptr<Teuchos::Time> PDEVectorSimRieszSolve        = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: PDE Vector Sim Riesz Solver Solution Time");
+    ROL::Ptr<Teuchos::Time> PDEVectorOptRieszConstruct    = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: PDE Vector Opt Riesz Construction Time");
+    ROL::Ptr<Teuchos::Time> PDEVectorOptRieszApply        = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: PDE Vector Opt Riesz Application Time");
+    ROL::Ptr<Teuchos::Time> PDEVectorOptRieszSolve        = Teuchos::TimeMonitor::getNewCounter("ROL::PDEOPT: PDE Vector Opt Riesz Solver Solution Time");
+  }
+}
+#endif
+
+
+template <class Real, class DeviceType,
+          class LO=Tpetra::Map<>::local_ordinal_type, 
+          class GO=Tpetra::Map<>::global_ordinal_type,
+          class Node=Tpetra::Map<>::node_type >
+class PDE_PrimalSimVector;
+
+template <class Real, class DeviceType,
+          class LO=Tpetra::Map<>::local_ordinal_type, 
+          class GO=Tpetra::Map<>::global_ordinal_type,
+          class Node=Tpetra::Map<>::node_type >
+class PDE_DualSimVector;
+
+template <class Real, class DeviceType, class LO, class GO, class Node>
+class PDE_PrimalSimVector : public ROL::TpetraMultiVector<Real,LO,GO,Node> {
+  private:
+    ROL::Ptr<Tpetra::CrsMatrix<> > RieszMap_;
+    ROL::Ptr<Tpetra::MultiVector<> > lumpedRiesz_;
+    ROL::Ptr<Solver<Real> > solver_;
+
+    bool useRiesz_;
+    bool useLumpedRiesz_;
+
+    mutable ROL::Ptr<PDE_DualSimVector<Real,DeviceType> > dual_vec_;
+    mutable bool isDualInitialized_;
+
+    void lumpRiesz(void) {
+      lumpedRiesz_ = ROL::makePtr<Tpetra::MultiVector<Real,LO,GO,Node>>(ROL::TpetraMultiVector<Real,LO,GO,Node>::getMap(),1);
+      Tpetra::MultiVector<Real,LO,GO,Node> ones(ROL::TpetraMultiVector<Real,LO,GO,Node>::getMap(),1);
+      ones.putScalar(static_cast<Real>(1));
+      RieszMap_->apply(ones, *lumpedRiesz_);
+    }
+
+    void applyRiesz(const ROL::Ptr<Tpetra::MultiVector<> > &out,
+                    const ROL::Ptr<const Tpetra::MultiVector<> > &in) const {
+      #ifdef ROL_TIMERS
+        Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::PDEVectorSimRieszApply);
+      #endif
+      if ( useRiesz_ ) {
+        if (useLumpedRiesz_) {
+          out->elementWiseMultiply(static_cast<Real>(1), *(lumpedRiesz_->getVector(0)), *in, static_cast<Real>(0));
+        }
+        else {
+          RieszMap_->apply(*in,*out);
+        }
+      }
+      else {
+        out->scale(static_cast<Real>(1),*in);
+      }
+    }
+
+  public:
+    virtual ~PDE_PrimalSimVector() {}
+
+    using Assembler_type = Assembler<Real,DeviceType>;
+    using PDE_type = PDE<Real,DeviceType>;
+    using DynamicPDE_type = DynamicPDE<Real,DeviceType>;
+
+    PDE_PrimalSimVector(const ROL::Ptr<Tpetra::MultiVector<Real,LO,GO,Node> > &tpetra_vec,
+                        const ROL::Ptr<PDE_type> &pde,
+                        const ROL::Ptr<Assembler_type> &assembler)
+      : ROL::TpetraMultiVector<Real,LO,GO,Node>(tpetra_vec), solver_(ROL::nullPtr),
+        useRiesz_(false), useLumpedRiesz_(false), isDualInitialized_(false) {}
+
+    PDE_PrimalSimVector(const ROL::Ptr<Tpetra::MultiVector<Real,LO,GO,Node> > &tpetra_vec,
+                        const ROL::Ptr<PDE_type> &pde,
+                        const ROL::Ptr<Assembler_type> &assembler,
+                        Teuchos::ParameterList &parlist)
+      : ROL::TpetraMultiVector<Real,LO,GO,Node>(tpetra_vec),
+        isDualInitialized_(false) {
+      #ifdef ROL_TIMERS
+        Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::PDEVectorSimRieszConstruct);
+      #endif
+      useRiesz_       = parlist.sublist("Vector").sublist("Sim").get("Use Riesz Map", false);
+      useLumpedRiesz_ = parlist.sublist("Vector").sublist("Sim").get("Lump Riesz Map", false);
+      if (useRiesz_) assembler->assemblePDERieszMap1(RieszMap_, pde);
+      useRiesz_ = useRiesz_ && (RieszMap_ != ROL::nullPtr);
+      if (useRiesz_) {
+        if (useLumpedRiesz_) {
+          lumpRiesz();
+        }
+        else {
+          solver_ = ROL::makePtr<Solver<Real>>(parlist.sublist("Solver"));
+          solver_->setA(RieszMap_);
+        }
+      }
+    }
+
+    PDE_PrimalSimVector(const ROL::Ptr<Tpetra::MultiVector<Real,LO,GO,Node> > &tpetra_vec,
+                        const ROL::Ptr<DynamicPDE_type> &pde,
+                        Assembler_type &assembler)
+      : ROL::TpetraMultiVector<Real,LO,GO,Node>(tpetra_vec), solver_(ROL::nullPtr),
+        useRiesz_(false), useLumpedRiesz_(false), isDualInitialized_(false) {}
+
+    PDE_PrimalSimVector(const ROL::Ptr<Tpetra::MultiVector<Real,LO,GO,Node> > &tpetra_vec,
+                        const ROL::Ptr<DynamicPDE_type> &pde,
+                        Assembler_type &assembler,
+                        Teuchos::ParameterList &parlist)
+      : ROL::TpetraMultiVector<Real,LO,GO,Node>(tpetra_vec),
+        isDualInitialized_(false) {
+      #ifdef ROL_TIMERS
+        Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::PDEVectorSimRieszConstruct);
+      #endif
+      useRiesz_       = parlist.sublist("Vector").sublist("Sim").get("Use Riesz Map", false);
+      useLumpedRiesz_ = parlist.sublist("Vector").sublist("Sim").get("Lump Riesz Map", false);
+      if (useRiesz_) assembler.assembleDynPDERieszMap1(RieszMap_, pde);
+      useRiesz_ = useRiesz_ && (RieszMap_ != ROL::nullPtr);
+      if (useRiesz_) {
+        if (useLumpedRiesz_) {
+          lumpRiesz();
+        }
+        else {
+          solver_ = ROL::makePtr<Solver<Real>>(parlist.sublist("Solver"));
+          solver_->setA(RieszMap_);
+        }
+      }
+    }
+
+    PDE_PrimalSimVector(const ROL::Ptr<Tpetra::MultiVector<Real,LO,GO,Node> > &tpetra_vec,
+                        const ROL::Ptr<Tpetra::CrsMatrix<> > &RieszMap,
+                        const ROL::Ptr<Solver<Real> > &solver,
+                        const ROL::Ptr<Tpetra::MultiVector<Real,LO,GO,Node> > &lumpedRiesz)
+      : ROL::TpetraMultiVector<Real,LO,GO,Node>(tpetra_vec), RieszMap_(RieszMap),
+        lumpedRiesz_(lumpedRiesz), solver_(solver), isDualInitialized_(false) {
+      if (RieszMap_ != ROL::nullPtr) {
+        useLumpedRiesz_ = (lumpedRiesz_ != ROL::nullPtr);
+        useRiesz_ = (solver_ != ROL::nullPtr) || useLumpedRiesz_;
+      }
+      else {
+        useLumpedRiesz_ = false;
+        useRiesz_ = false;
+      }
+    }
+
+    Real dot( const ROL::Vector<Real> &x ) const {
+      TEUCHOS_TEST_FOR_EXCEPTION( (ROL::TpetraMultiVector<Real,LO,GO,Node>::dimension() != x.dimension()),
+                                  std::invalid_argument,
+                                  "Error: Vectors must have the same dimension." );
+      const ROL::Ptr<const Tpetra::MultiVector<Real,LO,GO,Node> > ex
+        = dynamic_cast<const ROL::TpetraMultiVector<Real,LO,GO,Node>&>(x).getVector();
+      const Tpetra::MultiVector<Real,LO,GO,Node> &ey
+        = *(ROL::TpetraMultiVector<Real,LO,GO,Node>::getVector());
+      size_t n = ey.getNumVectors();
+      // Scale x with scale_vec_
+      ROL::Ptr<Tpetra::MultiVector<Real,LO,GO,Node> > wex
+        = ROL::makePtr<Tpetra::MultiVector<Real,LO,GO,Node>>(ROL::TpetraMultiVector<Real,LO,GO,Node>::getMap(), n);
+      applyRiesz(wex,ex);
+      // Perform Euclidean dot between *this and scaled x for each vector
+      Teuchos::Array<Real> val(n,0);
+      ey.dot( *wex, val.view(0,n) );
+      // Combine dots for each vector to get a scalar
+      Real xy(0);
+      for (size_t i = 0; i < n; ++i) {
+        xy += val[i];
+      }
+      return xy;
+    }
+
+    ROL::Ptr<ROL::Vector<Real> > clone() const {
+      const Tpetra::MultiVector<Real,LO,GO,Node> &ey
+        = *(ROL::TpetraMultiVector<Real,LO,GO,Node>::getVector());
+      size_t n = ey.getNumVectors();
+      return ROL::makePtr<PDE_PrimalSimVector<Real,DeviceType,LO,GO,Node>>(
+             ROL::makePtr<Tpetra::MultiVector<Real,LO,GO,Node>>(ROL::TpetraMultiVector<Real,LO,GO,Node>::getMap(),n),
+             RieszMap_, solver_, lumpedRiesz_);
+    }
+
+    const ROL::Vector<Real> & dual() const {
+      if ( !isDualInitialized_ ) {
+        // Create new memory for dual vector
+        size_t n = ROL::TpetraMultiVector<Real,LO,GO,Node>::getVector()->getNumVectors();
+        dual_vec_ = ROL::makePtr<PDE_DualSimVector<Real,DeviceType,LO,GO,Node>>(
+                    ROL::makePtr<Tpetra::MultiVector<Real,LO,GO,Node>>(ROL::TpetraMultiVector<Real,LO,GO,Node>::getMap(),n),
+                    RieszMap_, solver_, lumpedRiesz_);
+        isDualInitialized_ = true;
+      }
+      // Scale *this with scale_vec_ and place in dual vector
+      applyRiesz(dual_vec_->getVector(),ROL::TpetraMultiVector<Real,LO,GO,Node>::getVector());
+      return *dual_vec_;
+    }
+
+    Real apply(const ROL::Vector<Real> &x) const {
+      const PDE_DualSimVector<Real,DeviceType,LO,GO,Node> &ex = dynamic_cast<const PDE_DualSimVector<Real,DeviceType,LO,GO,Node>&>(x);
+      return ROL::TpetraMultiVector<Real,LO,GO,Node>::dot(ex);
+    }
+}; // class PDE_PrimalSimVector
+
+template <class Real, class DeviceType, class LO, class GO, class Node>
+class PDE_DualSimVector : public ROL::TpetraMultiVector<Real,LO,GO,Node> {
+  private:
+    ROL::Ptr<Tpetra::CrsMatrix<Real> > RieszMap_;
+    ROL::Ptr<Tpetra::MultiVector<> > lumpedRiesz_;
+    ROL::Ptr<Tpetra::MultiVector<> > recipLumpedRiesz_;
+    ROL::Ptr<Solver<Real> > solver_;
+
+    bool useRiesz_;
+    bool useLumpedRiesz_;
+
+    mutable ROL::Ptr<PDE_PrimalSimVector<Real,DeviceType> > primal_vec_;
+    mutable bool isDualInitialized_;
+
+    void lumpRiesz(void) {
+      lumpedRiesz_ = ROL::makePtr<Tpetra::Vector<Real,LO,GO,Node>>(ROL::TpetraMultiVector<Real,LO,GO,Node>::getMap());
+      Tpetra::MultiVector<Real,LO,GO,Node> ones(ROL::TpetraMultiVector<Real,LO,GO,Node>::getMap(),1);
+      ones.putScalar(static_cast<Real>(1));
+      RieszMap_->apply(ones, *lumpedRiesz_);
+    }
+
+    void invertLumpedRiesz(void) {
+      recipLumpedRiesz_ = ROL::makePtr<Tpetra::MultiVector<Real,LO,GO,Node>>(ROL::TpetraMultiVector<Real,LO,GO,Node>::getMap(),1);
+      recipLumpedRiesz_->reciprocal(*lumpedRiesz_);
+    }
+
+    void applyRiesz(const ROL::Ptr<Tpetra::MultiVector<> > &out,
+                    const ROL::Ptr<const Tpetra::MultiVector<> > &in) const {
+      #ifdef ROL_TIMERS
+        Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::PDEVectorSimRieszSolve);
+      #endif
+      if ( useRiesz_ ) {
+        if (useLumpedRiesz_) {
+          out->elementWiseMultiply(static_cast<Real>(1), *(recipLumpedRiesz_->getVector(0)), *in, static_cast<Real>(0));
+        }
+        else {
+          solver_->solve(out,in,false);
+        }
+      }
+      else {
+        out->scale(static_cast<Real>(1),*in);
+      }
+    }
+
+  public:
+    virtual ~PDE_DualSimVector() {}
+
+    using Assembler_type = Assembler<Real,DeviceType>;
+    using PDE_type = PDE<Real,DeviceType>;
+    using DynamicPDE_type = DynamicPDE<Real,DeviceType>;
+
+    PDE_DualSimVector(const ROL::Ptr<Tpetra::MultiVector<Real,LO,GO,Node> > &tpetra_vec,
+                      const ROL::Ptr<PDE_type> &pde,
+                      const ROL::Ptr<Assembler_type> &assembler)
+      : ROL::TpetraMultiVector<Real,LO,GO,Node>(tpetra_vec), solver_(ROL::nullPtr),
+        useRiesz_(false), useLumpedRiesz_(false), isDualInitialized_(false) {}
+
+    PDE_DualSimVector(const ROL::Ptr<Tpetra::MultiVector<Real,LO,GO,Node> > &tpetra_vec,
+                      const ROL::Ptr<PDE_type> &pde,
+                      const ROL::Ptr<Assembler_type> &assembler,
+                      Teuchos::ParameterList &parlist)
+      : ROL::TpetraMultiVector<Real,LO,GO,Node>(tpetra_vec),
+        isDualInitialized_(false) {
+      #ifdef ROL_TIMERS
+        Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::PDEVectorSimRieszConstruct);
+      #endif
+      useRiesz_       = parlist.sublist("Vector").sublist("Sim").get("Use Riesz Map", false);
+      useLumpedRiesz_ = parlist.sublist("Vector").sublist("Sim").get("Lump Riesz Map", false);
+      if (useRiesz_) assembler->assemblePDERieszMap1(RieszMap_, pde);
+      useRiesz_ = useRiesz_ && (RieszMap_ != ROL::nullPtr);
+      if (useRiesz_) {
+        if (useLumpedRiesz_) {
+          lumpRiesz();
+          invertLumpedRiesz();
+        }
+        else {
+          solver_ = ROL::makePtr<Solver<Real>>(parlist.sublist("Solver"));
+          solver_->setA(RieszMap_);
+        }
+      }
+    }
+
+    PDE_DualSimVector(const ROL::Ptr<Tpetra::MultiVector<Real,LO,GO,Node> > &tpetra_vec,
+                      const ROL::Ptr<DynamicPDE_type> &pde,
+                      Assembler_type &assembler)
+      : ROL::TpetraMultiVector<Real,LO,GO,Node>(tpetra_vec), solver_(ROL::nullPtr),
+        useRiesz_(false), useLumpedRiesz_(false), isDualInitialized_(false) {}
+
+    PDE_DualSimVector(const ROL::Ptr<Tpetra::MultiVector<Real,LO,GO,Node> > &tpetra_vec,
+                      const ROL::Ptr<DynamicPDE_type> &pde,
+                      Assembler_type &assembler,
+                      Teuchos::ParameterList &parlist)
+      : ROL::TpetraMultiVector<Real,LO,GO,Node>(tpetra_vec),
+        isDualInitialized_(false) {
+      #ifdef ROL_TIMERS
+        Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::PDEVectorSimRieszConstruct);
+      #endif
+      useRiesz_       = parlist.sublist("Vector").sublist("Sim").get("Use Riesz Map", false);
+      useLumpedRiesz_ = parlist.sublist("Vector").sublist("Sim").get("Lump Riesz Map", false);
+      assembler.assembleDynPDERieszMap1(RieszMap_, pde);
+      useRiesz_ = useRiesz_ && (RieszMap_ != ROL::nullPtr);
+      if (useRiesz_) {
+        if (useLumpedRiesz_) {
+          lumpRiesz();
+          invertLumpedRiesz();
+        }
+        else {
+          solver_ = ROL::makePtr<Solver<Real>>(parlist.sublist("Solver"));
+          solver_->setA(RieszMap_);
+        }
+      }
+    }
+
+    PDE_DualSimVector(const ROL::Ptr<Tpetra::MultiVector<Real,LO,GO,Node> > &tpetra_vec,
+                      const ROL::Ptr<Tpetra::CrsMatrix<> > &RieszMap,
+                      const ROL::Ptr<Solver<Real> > &solver,
+                      const ROL::Ptr<Tpetra::MultiVector<Real,LO,GO,Node> > &lumpedRiesz)
+      : ROL::TpetraMultiVector<Real,LO,GO,Node>(tpetra_vec), RieszMap_(RieszMap),
+        lumpedRiesz_(lumpedRiesz), solver_(solver), isDualInitialized_(false) {
+      if (RieszMap_ != ROL::nullPtr) {
+        useLumpedRiesz_ = (lumpedRiesz_ != ROL::nullPtr);
+        useRiesz_ = (solver_ != ROL::nullPtr) || useLumpedRiesz_;
+        if (useLumpedRiesz_) {
+          invertLumpedRiesz();
+        }
+      }
+      else {
+        useLumpedRiesz_ = false;
+        useRiesz_ = false;
+      }
+    }
+
+    Real dot( const ROL::Vector<Real> &x ) const {
+      TEUCHOS_TEST_FOR_EXCEPTION( (ROL::TpetraMultiVector<Real,LO,GO,Node>::dimension() != x.dimension()),
+                                  std::invalid_argument,
+                                  "Error: Vectors must have the same dimension." );
+      const ROL::Ptr<const Tpetra::MultiVector<Real,LO,GO,Node> > &ex
+        = dynamic_cast<const ROL::TpetraMultiVector<Real,LO,GO,Node>&>(x).getVector();
+      const Tpetra::MultiVector<Real,LO,GO,Node> &ey
+        = *(ROL::TpetraMultiVector<Real,LO,GO,Node>::getVector());
+      size_t n = ey.getNumVectors();
+      // Scale x with 1/scale_vec_
+      ROL::Ptr<Tpetra::MultiVector<Real,LO,GO,Node> > wex
+        = ROL::makePtr<Tpetra::MultiVector<Real,LO,GO,Node>>(ROL::TpetraMultiVector<Real,LO,GO,Node>::getMap(), n);
+      applyRiesz(wex,ex);
+      // Perform Euclidean dot between *this and scaled x for each vector
+      Teuchos::Array<Real> val(n,0);
+      ey.dot( *wex, val.view(0,n) );
+      // Combine dots for each vector to get a scalar
+      Real xy(0);
+      for (size_t i = 0; i < n; ++i) {
+        xy += val[i];
+      }
+      return xy;
+    }
+
+    ROL::Ptr<ROL::Vector<Real> > clone() const {
+      const Tpetra::MultiVector<Real,LO,GO,Node> &ey
+        = *(ROL::TpetraMultiVector<Real,LO,GO,Node>::getVector());
+      size_t n = ey.getNumVectors();  
+      return ROL::makePtr<PDE_DualSimVector<Real,DeviceType,LO,GO,Node>>(
+             ROL::makePtr<Tpetra::MultiVector<Real,LO,GO,Node>>(ROL::TpetraMultiVector<Real,LO,GO,Node>::getMap(),n),
+             RieszMap_, solver_, lumpedRiesz_);
+    }
+
+    const ROL::Vector<Real> & dual() const {
+      if ( !isDualInitialized_ ) {
+        // Create new memory for dual vector
+        size_t n = ROL::TpetraMultiVector<Real,LO,GO,Node>::getVector()->getNumVectors();
+        primal_vec_ = ROL::makePtr<PDE_PrimalSimVector<Real,DeviceType,LO,GO,Node>>(
+                      ROL::makePtr<Tpetra::MultiVector<Real,LO,GO,Node>>(ROL::TpetraMultiVector<Real,LO,GO,Node>::getMap(),n),
+                      RieszMap_, solver_, lumpedRiesz_);
+        isDualInitialized_ = true;
+      }
+      // Scale *this with scale_vec_ and place in dual vector
+      applyRiesz(primal_vec_->getVector(),ROL::TpetraMultiVector<Real,LO,GO,Node>::getVector());
+      return *primal_vec_;
+    }
+
+    Real apply(const ROL::Vector<Real> &x) const {
+      const PDE_PrimalSimVector<Real,DeviceType,LO,GO,Node> &ex = dynamic_cast<const PDE_PrimalSimVector<Real,DeviceType,LO,GO,Node>&>(x);
+      return ROL::TpetraMultiVector<Real,LO,GO,Node>::dot(ex);
+    }
+}; // class PDE_DualSimVector
+
+template <class Real, class DeviceType,
+          class LO=Tpetra::Map<>::local_ordinal_type, 
+          class GO=Tpetra::Map<>::global_ordinal_type,
+          class Node=Tpetra::Map<>::node_type >
+class PDE_PrimalOptVector;
+
+template <class Real, class DeviceType,
+          class LO=Tpetra::Map<>::local_ordinal_type, 
+          class GO=Tpetra::Map<>::global_ordinal_type,
+          class Node=Tpetra::Map<>::node_type >
+class PDE_DualOptVector;
+
+template <class Real, class DeviceType, class LO, class GO, class Node>
+class PDE_PrimalOptVector : public ROL::TpetraMultiVector<Real,LO,GO,Node> {
+  private:
+    ROL::Ptr<Tpetra::CrsMatrix<> > RieszMap_;
+    ROL::Ptr<Tpetra::MultiVector<> > lumpedRiesz_;
+    ROL::Ptr<Solver<Real> > solver_;
+
+    bool useRiesz_;
+    bool useLumpedRiesz_;
+
+    mutable ROL::Ptr<PDE_DualOptVector<Real, DeviceType> > dual_vec_;
+    mutable bool isDualInitialized_;
+
+    void lumpRiesz(void) {
+      lumpedRiesz_ = ROL::makePtr<Tpetra::MultiVector<Real,LO,GO,Node>>(ROL::TpetraMultiVector<Real,LO,GO,Node>::getMap(),1);
+      Tpetra::MultiVector<Real,LO,GO,Node> ones(ROL::TpetraMultiVector<Real,LO,GO,Node>::getMap(),1);
+      ones.putScalar(static_cast<Real>(1));
+      RieszMap_->apply(ones, *lumpedRiesz_);
+    }
+
+    void applyRiesz(const ROL::Ptr<Tpetra::MultiVector<> > &out,
+                    const ROL::Ptr<const Tpetra::MultiVector<> > &in) const {
+      #ifdef ROL_TIMERS
+        Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::PDEVectorOptRieszApply);
+      #endif
+      if ( useRiesz_ ) {
+        if (useLumpedRiesz_) {
+          out->elementWiseMultiply(static_cast<Real>(1), *(lumpedRiesz_->getVector(0)), *in, static_cast<Real>(0));
+        }
+        else {
+          RieszMap_->apply(*in,*out);
+        }
+      }
+      else {
+        out->scale(static_cast<Real>(1),*in);
+      }
+    }
+
+  public:
+
+    using Assembler_type = Assembler<Real,DeviceType>;
+    using PDE_type = PDE<Real,DeviceType>;
+    using DynamicPDE_type = DynamicPDE<Real,DeviceType>;
+
+    virtual ~PDE_PrimalOptVector() {}
+
+    PDE_PrimalOptVector(const ROL::Ptr<Tpetra::MultiVector<Real,LO,GO,Node> > &tpetra_vec,
+                        const ROL::Ptr<PDE_type> &pde,
+                        const ROL::Ptr<Assembler_type> &assembler)
+      : ROL::TpetraMultiVector<Real,LO,GO,Node>(tpetra_vec), solver_(ROL::nullPtr),
+        useRiesz_(false), useLumpedRiesz_(false), isDualInitialized_(false) {}
+
+    PDE_PrimalOptVector(const ROL::Ptr<Tpetra::MultiVector<Real,LO,GO,Node> > &tpetra_vec,
+                        const ROL::Ptr<PDE_type> &pde,
+                        const ROL::Ptr<Assembler_type> &assembler,
+                        Teuchos::ParameterList &parlist)
+      : ROL::TpetraMultiVector<Real,LO,GO,Node>(tpetra_vec),
+        isDualInitialized_(false) {
+      #ifdef ROL_TIMERS
+        Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::PDEVectorOptRieszConstruct);
+      #endif
+      useRiesz_       = parlist.sublist("Vector").sublist("Opt").get("Use Riesz Map", false);
+      useLumpedRiesz_ = parlist.sublist("Vector").sublist("Opt").get("Lump Riesz Map", false);
+      if (useRiesz_) assembler->assemblePDERieszMap2(RieszMap_, pde);
+      useRiesz_ = useRiesz_ && (RieszMap_ != ROL::nullPtr);
+      if (useRiesz_) {
+        if (useLumpedRiesz_) {
+          lumpRiesz();
+        }
+        else {
+          solver_ = ROL::makePtr<Solver<Real>>(parlist.sublist("Solver"));
+          solver_->setA(RieszMap_);
+        }
+      }
+    }
+
+    PDE_PrimalOptVector(const ROL::Ptr<Tpetra::MultiVector<Real,LO,GO,Node> > &tpetra_vec,
+                        const ROL::Ptr<DynamicPDE_type> &pde,
+                        Assembler_type &assembler)
+      : ROL::TpetraMultiVector<Real,LO,GO,Node>(tpetra_vec), solver_(ROL::nullPtr),
+        useRiesz_(false), useLumpedRiesz_(false), isDualInitialized_(false) {}
+
+    PDE_PrimalOptVector(const ROL::Ptr<Tpetra::MultiVector<Real,LO,GO,Node> > &tpetra_vec,
+                        const ROL::Ptr<DynamicPDE_type> &pde,
+                        Assembler_type &assembler,
+                        Teuchos::ParameterList &parlist)
+      : ROL::TpetraMultiVector<Real,LO,GO,Node>(tpetra_vec),
+        isDualInitialized_(false) {
+      #ifdef ROL_TIMERS
+        Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::PDEVectorOptRieszConstruct);
+      #endif
+      useRiesz_       = parlist.sublist("Vector").sublist("Opt").get("Use Riesz Map", false);
+      useLumpedRiesz_ = parlist.sublist("Vector").sublist("Opt").get("Lump Riesz Map", false);
+      assembler.assembleDynPDERieszMap2(RieszMap_, pde);
+      useRiesz_ = useRiesz_ && (RieszMap_ != ROL::nullPtr);
+      if (useRiesz_) {
+        if (useLumpedRiesz_) {
+          lumpRiesz();
+        }
+        else {
+          solver_ = ROL::makePtr<Solver<Real>>(parlist.sublist("Solver"));
+          solver_->setA(RieszMap_);
+        }
+      }
+    }
+
+    PDE_PrimalOptVector(const ROL::Ptr<Tpetra::MultiVector<Real,LO,GO,Node> > &tpetra_vec,
+                        const ROL::Ptr<Tpetra::CrsMatrix<> > &RieszMap,
+                        const ROL::Ptr<Solver<Real> > &solver,
+                        const ROL::Ptr<Tpetra::MultiVector<Real,LO,GO,Node> > &lumpedRiesz)
+      : ROL::TpetraMultiVector<Real,LO,GO,Node>(tpetra_vec), RieszMap_(RieszMap),
+        lumpedRiesz_(lumpedRiesz), solver_(solver), isDualInitialized_(false) {
+      if (RieszMap_ != ROL::nullPtr) {
+        useLumpedRiesz_ = (lumpedRiesz_ != ROL::nullPtr);
+        useRiesz_ = (solver_ != ROL::nullPtr) || useLumpedRiesz_;
+      }
+      else {
+        useLumpedRiesz_ = false;
+        useRiesz_ = false;
+      }
+    }
+
+    Real dot( const ROL::Vector<Real> &x ) const {
+      TEUCHOS_TEST_FOR_EXCEPTION( (ROL::TpetraMultiVector<Real,LO,GO,Node>::dimension() != x.dimension()),
+                                  std::invalid_argument,
+                                  "Error: Vectors must have the same dimension." );
+      const ROL::Ptr<const Tpetra::MultiVector<Real,LO,GO,Node> > ex
+        = dynamic_cast<const ROL::TpetraMultiVector<Real,LO,GO,Node>&>(x).getVector();
+      const Tpetra::MultiVector<Real,LO,GO,Node> &ey
+        = *(ROL::TpetraMultiVector<Real,LO,GO,Node>::getVector());
+      size_t n = ey.getNumVectors();
+      // Scale x with scale_vec_
+      ROL::Ptr<Tpetra::MultiVector<Real,LO,GO,Node> > wex
+        = ROL::makePtr<Tpetra::MultiVector<Real,LO,GO,Node>>(ROL::TpetraMultiVector<Real,LO,GO,Node>::getMap(), n);
+      applyRiesz(wex,ex);
+      // Perform Euclidean dot between *this and scaled x for each vector
+      Teuchos::Array<Real> val(n,0);
+      ey.dot( *wex, val.view(0,n) );
+      // Combine dots for each vector to get a scalar
+      Real xy(0);
+      for (size_t i = 0; i < n; ++i) {
+        xy += val[i];
+      }
+      return xy;
+    }
+
+    ROL::Ptr<ROL::Vector<Real> > clone() const {
+      const Tpetra::MultiVector<Real,LO,GO,Node> &ey
+        = *(ROL::TpetraMultiVector<Real,LO,GO,Node>::getVector());
+      size_t n = ey.getNumVectors();
+      return ROL::makePtr<PDE_PrimalOptVector<Real,DeviceType,LO,GO,Node>>(
+             ROL::makePtr<Tpetra::MultiVector<Real,LO,GO,Node>>(ROL::TpetraMultiVector<Real,LO,GO,Node>::getMap(),n),
+             RieszMap_, solver_, lumpedRiesz_);
+    }
+
+    const ROL::Vector<Real> & dual() const {
+      if ( !isDualInitialized_ ) {
+        // Create new memory for dual vector
+        size_t n = ROL::TpetraMultiVector<Real,LO,GO,Node>::getVector()->getNumVectors();
+        dual_vec_ = ROL::makePtr<PDE_DualOptVector<Real,DeviceType,LO,GO,Node>>(
+                    ROL::makePtr<Tpetra::MultiVector<Real,LO,GO,Node>>(ROL::TpetraMultiVector<Real,LO,GO,Node>::getMap(),n),
+                    RieszMap_, solver_, lumpedRiesz_);
+        isDualInitialized_ = true;
+      }
+      // Scale *this with scale_vec_ and place in dual vector
+      applyRiesz(dual_vec_->getVector(),ROL::TpetraMultiVector<Real,LO,GO,Node>::getVector());
+      return *dual_vec_;
+    }
+
+    Real apply(const ROL::Vector<Real> &x) const {
+      const PDE_DualOptVector<Real,DeviceType,LO,GO,Node> &ex = dynamic_cast<const PDE_DualOptVector<Real,DeviceType,LO,GO,Node>&>(x);
+      return ROL::TpetraMultiVector<Real,LO,GO,Node>::dot(ex);
+    }
+}; // class PDE_PrimalOptVector
+
+template <class Real, class DeviceType, class LO, class GO, class Node>
+class PDE_DualOptVector : public ROL::TpetraMultiVector<Real,LO,GO,Node> {
+  private:
+    ROL::Ptr<Tpetra::CrsMatrix<Real> > RieszMap_;
+    ROL::Ptr<Tpetra::MultiVector<> > lumpedRiesz_;
+    ROL::Ptr<Tpetra::MultiVector<> > recipLumpedRiesz_;
+    ROL::Ptr<Solver<Real> > solver_;
+
+    bool useRiesz_;
+    bool useLumpedRiesz_;
+
+    mutable ROL::Ptr<PDE_PrimalOptVector<Real,DeviceType> > primal_vec_;
+    mutable bool isDualInitialized_;
+
+    void lumpRiesz(void) {
+      lumpedRiesz_ = ROL::makePtr<Tpetra::Vector<Real,LO,GO,Node>>(ROL::TpetraMultiVector<Real,LO,GO,Node>::getMap());
+      Tpetra::MultiVector<Real,LO,GO,Node> ones(ROL::TpetraMultiVector<Real,LO,GO,Node>::getMap(),1);
+      ones.putScalar(static_cast<Real>(1));
+      RieszMap_->apply(ones, *lumpedRiesz_);
+    }
+
+    void invertLumpedRiesz(void) {
+      recipLumpedRiesz_ = ROL::makePtr<Tpetra::MultiVector<Real,LO,GO,Node>>(ROL::TpetraMultiVector<Real,LO,GO,Node>::getMap(),1);
+      recipLumpedRiesz_->reciprocal(*lumpedRiesz_);
+    }
+
+    void applyRiesz(const ROL::Ptr<Tpetra::MultiVector<> > &out,
+                    const ROL::Ptr<const Tpetra::MultiVector<> > &in) const {
+      #ifdef ROL_TIMERS
+        Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::PDEVectorOptRieszSolve);
+      #endif
+      if ( useRiesz_ ) {
+        if (useLumpedRiesz_) {
+          out->elementWiseMultiply(static_cast<Real>(1), *(recipLumpedRiesz_->getVector(0)), *in, static_cast<Real>(0));
+        }
+        else {
+          solver_->solve(out,in,false);
+        }
+      }
+      else {
+        out->scale(static_cast<Real>(1),*in);
+      }
+    }
+
+  public:
+    virtual ~PDE_DualOptVector() {}
+
+    using Assembler_type = Assembler<Real,DeviceType>;
+    using PDE_type = PDE<Real,DeviceType>;
+    using DynamicPDE_type = DynamicPDE<Real,DeviceType>;
+
+    PDE_DualOptVector(const ROL::Ptr<Tpetra::MultiVector<Real,LO,GO,Node> > &tpetra_vec,
+                      const ROL::Ptr<PDE_type> &pde,
+                      const ROL::Ptr<Assembler_type> &assembler)
+      : ROL::TpetraMultiVector<Real,LO,GO,Node>(tpetra_vec), solver_(ROL::nullPtr),
+        useRiesz_(false), useLumpedRiesz_(false), isDualInitialized_(false) {}
+
+    PDE_DualOptVector(const ROL::Ptr<Tpetra::MultiVector<Real,LO,GO,Node> > &tpetra_vec,
+                      const ROL::Ptr<PDE_type> &pde,
+                      const ROL::Ptr<Assembler_type> &assembler,
+                      Teuchos::ParameterList &parlist)
+      : ROL::TpetraMultiVector<Real,LO,GO,Node>(tpetra_vec),
+        isDualInitialized_(false) {
+      #ifdef ROL_TIMERS
+        Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::PDEVectorOptRieszConstruct);
+      #endif
+      useRiesz_       = parlist.sublist("Vector").sublist("Opt").get("Use Riesz Map", false);
+      useLumpedRiesz_ = parlist.sublist("Vector").sublist("Opt").get("Lump Riesz Map", false);
+      if (useRiesz_) assembler->assemblePDERieszMap2(RieszMap_, pde);
+      useRiesz_ = useRiesz_ && (RieszMap_ != ROL::nullPtr);
+      if (useRiesz_) {
+        if (useLumpedRiesz_) {
+          lumpRiesz();
+          invertLumpedRiesz();
+        }
+        else {
+          solver_ = ROL::makePtr<Solver<Real>>(parlist.sublist("Solver"));
+          solver_->setA(RieszMap_);
+        }
+      }
+    }
+
+    PDE_DualOptVector(const ROL::Ptr<Tpetra::MultiVector<Real,LO,GO,Node> > &tpetra_vec,
+                      const ROL::Ptr<DynamicPDE_type> &pde,
+                      Assembler_type &assembler)
+      : ROL::TpetraMultiVector<Real,LO,GO,Node>(tpetra_vec), solver_(ROL::nullPtr),
+        useRiesz_(false), useLumpedRiesz_(false), isDualInitialized_(false) {}
+
+    PDE_DualOptVector(const ROL::Ptr<Tpetra::MultiVector<Real,LO,GO,Node> > &tpetra_vec,
+                      const ROL::Ptr<DynamicPDE_type> &pde,
+                      Assembler_type &assembler,
+                      Teuchos::ParameterList &parlist)
+      : ROL::TpetraMultiVector<Real,LO,GO,Node>(tpetra_vec),
+        isDualInitialized_(false) {
+      #ifdef ROL_TIMERS
+        Teuchos::TimeMonitor LocalTimer(*ROL::PDEOPT::PDEVectorOptRieszConstruct);
+      #endif
+      useRiesz_       = parlist.sublist("Vector").sublist("Opt").get("Use Riesz Map", false);
+      useLumpedRiesz_ = parlist.sublist("Vector").sublist("Opt").get("Lump Riesz Map", false);
+      assembler.assembleDynPDERieszMap2(RieszMap_, pde);
+      useRiesz_ = useRiesz_ && (RieszMap_ != ROL::nullPtr);
+      if (useRiesz_) {
+        if (useLumpedRiesz_) {
+          lumpRiesz();
+          invertLumpedRiesz();
+        }
+        else {
+          solver_ = ROL::makePtr<Solver<Real>>(parlist.sublist("Solver"));
+          solver_->setA(RieszMap_);
+        }
+      }
+    }
+
+    PDE_DualOptVector(const ROL::Ptr<Tpetra::MultiVector<Real,LO,GO,Node> > &tpetra_vec,
+                      const ROL::Ptr<Tpetra::CrsMatrix<> > &RieszMap,
+                      const ROL::Ptr<Solver<Real> > &solver,
+                      const ROL::Ptr<Tpetra::MultiVector<Real,LO,GO,Node> > &lumpedRiesz)
+      : ROL::TpetraMultiVector<Real,LO,GO,Node>(tpetra_vec), RieszMap_(RieszMap),
+        lumpedRiesz_(lumpedRiesz), solver_(solver), isDualInitialized_(false) {
+      if (RieszMap_ != ROL::nullPtr) {
+        useLumpedRiesz_ = (lumpedRiesz_ != ROL::nullPtr);
+        useRiesz_ = (solver_ != ROL::nullPtr) || useLumpedRiesz_;
+        if (useLumpedRiesz_) {
+          invertLumpedRiesz();
+        }
+      }
+      else {
+        useLumpedRiesz_ = false;
+        useRiesz_ = false;
+      }
+    }
+
+    Real dot( const ROL::Vector<Real> &x ) const {
+      TEUCHOS_TEST_FOR_EXCEPTION( (ROL::TpetraMultiVector<Real,LO,GO,Node>::dimension() != x.dimension()),
+                                  std::invalid_argument,
+                                  "Error: Vectors must have the same dimension." );
+      const ROL::Ptr<const Tpetra::MultiVector<Real,LO,GO,Node> > &ex
+        = dynamic_cast<const ROL::TpetraMultiVector<Real,LO,GO,Node>&>(x).getVector();
+      const Tpetra::MultiVector<Real,LO,GO,Node> &ey
+        = *(ROL::TpetraMultiVector<Real,LO,GO,Node>::getVector());
+      size_t n = ey.getNumVectors();
+      // Scale x with 1/scale_vec_
+      ROL::Ptr<Tpetra::MultiVector<Real,LO,GO,Node> > wex
+        = ROL::makePtr<Tpetra::MultiVector<Real,LO,GO,Node>>(ROL::TpetraMultiVector<Real,LO,GO,Node>::getMap(), n);
+      applyRiesz(wex,ex);
+      // Perform Euclidean dot between *this and scaled x for each vector
+      Teuchos::Array<Real> val(n,0);
+      ey.dot( *wex, val.view(0,n) );
+      // Combine dots for each vector to get a scalar
+      Real xy(0);
+      for (size_t i = 0; i < n; ++i) {
+        xy += val[i];
+      }
+      return xy;
+    }
+
+    ROL::Ptr<ROL::Vector<Real> > clone() const {
+      const Tpetra::MultiVector<Real,LO,GO,Node> &ey
+        = *(ROL::TpetraMultiVector<Real,LO,GO,Node>::getVector());
+      size_t n = ey.getNumVectors();  
+      return ROL::makePtr<PDE_DualOptVector<Real,DeviceType,LO,GO,Node>>(
+             ROL::makePtr<Tpetra::MultiVector<Real,LO,GO,Node>>(ROL::TpetraMultiVector<Real,LO,GO,Node>::getMap(),n),
+             RieszMap_, solver_, lumpedRiesz_);
+    }
+
+    const ROL::Vector<Real> & dual() const {
+      if ( !isDualInitialized_ ) {
+        // Create new memory for dual vector
+        size_t n = ROL::TpetraMultiVector<Real,LO,GO,Node>::getVector()->getNumVectors();
+        primal_vec_ = ROL::makePtr<PDE_PrimalOptVector<Real,DeviceType,LO,GO,Node>>(
+                      ROL::makePtr<Tpetra::MultiVector<Real,LO,GO,Node>>(ROL::TpetraMultiVector<Real,LO,GO,Node>::getMap(),n),
+                      RieszMap_, solver_, lumpedRiesz_);
+        isDualInitialized_ = true;
+      }
+      // Scale *this with scale_vec_ and place in dual vector
+      applyRiesz(primal_vec_->getVector(),ROL::TpetraMultiVector<Real,LO,GO,Node>::getVector());
+      return *primal_vec_;
+    }
+
+    Real apply(const ROL::Vector<Real> &x) const {
+      const PDE_PrimalOptVector<Real,DeviceType,LO,GO,Node> &ex = dynamic_cast<const PDE_PrimalOptVector<Real,DeviceType,LO,GO,Node>&>(x);
+      return ROL::TpetraMultiVector<Real,LO,GO,Node>::dot(ex);
+    }
+}; // class PDE_DualOptVector
+
+template <class Real,
+          class LO=Tpetra::Map<>::local_ordinal_type, 
+          class GO=Tpetra::Map<>::global_ordinal_type,
+          class Node=Tpetra::Map<>::node_type >
+class PDE_OptVector : public ROL::Vector<Real> {
+private:
+  ROL::Ptr<ROL::TpetraMultiVector<Real,LO,GO,Node> > vec1_;
+  ROL::Ptr<ROL::StdVector<Real> >                    vec2_;
+  const int rank_;
+  mutable ROL::Ptr<ROL::TpetraMultiVector<Real,LO,GO,Node> > dual_vec1_;
+  mutable ROL::Ptr<ROL::StdVector<Real> >                    dual_vec2_;
+  mutable ROL::Ptr<PDE_OptVector<Real,LO,GO,Node> >          dual_vec_;
+  mutable bool isDualInitialized_;
+
+public:
+  PDE_OptVector(const ROL::Ptr<ROL::TpetraMultiVector<Real,LO,GO,Node> > &vec1,
+                const ROL::Ptr<ROL::StdVector<Real> >                    &vec2,
+                const int rank = 0 ) 
+    : vec1_(vec1), vec2_(vec2), rank_(rank), isDualInitialized_(false) {
+
+    dual_vec1_ = ROL::dynamicPtrCast<ROL::TpetraMultiVector<Real,LO,GO,Node> >(vec1_->dual().clone());
+    dual_vec2_ = ROL::dynamicPtrCast<ROL::StdVector<Real> >(vec2_->dual().clone());
+  }
+
+  PDE_OptVector(const ROL::Ptr<ROL::TpetraMultiVector<Real,LO,GO,Node> > &vec)
+    : vec1_(vec), vec2_(ROL::nullPtr), rank_(0), dual_vec2_(ROL::nullPtr), isDualInitialized_(false) {
+    dual_vec1_ = ROL::dynamicPtrCast<ROL::TpetraMultiVector<Real,LO,GO,Node> >(vec1_->dual().clone());
+  }
+
+  PDE_OptVector(const ROL::Ptr<ROL::StdVector<Real> > &vec,
+                const int rank = 0)
+    : vec1_(ROL::nullPtr), vec2_(vec), rank_(rank), dual_vec1_(ROL::nullPtr), isDualInitialized_(false) {
+    dual_vec2_ = ROL::dynamicPtrCast<ROL::StdVector<Real> >(vec2_->dual().clone());
+  }
+
+  void set( const ROL::Vector<Real> &x ) {
+    const PDE_OptVector<Real> &xs = dynamic_cast<const PDE_OptVector<Real>&>(x);
+    if ( vec1_ != ROL::nullPtr ) {
+      vec1_->set(*(xs.getField()));
+    }
+    if ( vec2_ != ROL::nullPtr ) {
+      vec2_->set(*(xs.getParameter()));
+    }
+  }
+
+  void plus( const ROL::Vector<Real> &x ) {
+    const PDE_OptVector<Real> &xs = dynamic_cast<const PDE_OptVector<Real>&>(x);
+    if ( vec1_ != ROL::nullPtr ) {
+      vec1_->plus(*(xs.getField()));
+    }
+    if ( vec2_ != ROL::nullPtr ) {
+      vec2_->plus(*(xs.getParameter()));
+    }
+  }
+
+  void scale( const Real alpha ) {
+    if ( vec1_ != ROL::nullPtr ) {
+      vec1_->scale(alpha);
+    }
+    if ( vec2_ != ROL::nullPtr ) {
+      vec2_->scale(alpha);
+    }
+  }
+
+  void axpy( const Real alpha, const ROL::Vector<Real> &x ) {
+    const PDE_OptVector<Real> &xs = dynamic_cast<const PDE_OptVector<Real>&>(x);
+    if ( vec1_ != ROL::nullPtr ) {
+      vec1_->axpy(alpha,*(xs.getField()));
+    }
+    if ( vec2_ != ROL::nullPtr ) {
+      vec2_->axpy(alpha,*(xs.getParameter()));
+    }
+  }
+
+  Real dot( const ROL::Vector<Real> &x ) const {
+    const PDE_OptVector<Real> &xs = dynamic_cast<const PDE_OptVector<Real>&>(x);
+    Real val(0);
+    if ( vec1_ != ROL::nullPtr ) {
+      val += vec1_->dot(*(xs.getField()));
+    }
+    if ( vec2_ != ROL::nullPtr ) {
+      val += vec2_->dot(*(xs.getParameter()));
+    }
+    return val;
+  }
+
+  Real norm() const {
+    Real val(0);
+    if ( vec1_ != ROL::nullPtr ) {
+      Real norm1 = vec1_->norm();
+      val += norm1*norm1;
+    }
+    if ( vec2_ != ROL::nullPtr ) {
+      Real norm2 = vec2_->norm();
+      val += norm2*norm2;
+    }
+    return std::sqrt(val);
+  } 
+
+  ROL::Ptr<ROL::Vector<Real> > clone(void) const {
+    if ( vec2_ == ROL::nullPtr ) {
+      return ROL::makePtr<PDE_OptVector<Real,LO,GO,Node>>(
+             ROL::dynamicPtrCast<ROL::TpetraMultiVector<Real,LO,GO,Node> >(vec1_->clone()));
+    }
+    if ( vec1_ == ROL::nullPtr ) {
+      return ROL::makePtr<PDE_OptVector<Real,LO,GO,Node>>(
+             ROL::dynamicPtrCast<ROL::StdVector<Real> >(vec2_->clone()));
+    }
+    return ROL::makePtr<PDE_OptVector<Real,LO,GO,Node>>(
+           ROL::dynamicPtrCast<ROL::TpetraMultiVector<Real,LO,GO,Node> >(vec1_->clone()),
+           ROL::dynamicPtrCast<ROL::StdVector<Real> >(vec2_->clone()));
+  }
+
+  const ROL::Vector<Real> & dual(void) const {
+    if ( !isDualInitialized_ ) {
+      if ( vec1_ == ROL::nullPtr ) {
+        dual_vec_ = ROL::makePtr<PDE_OptVector<Real>>(dual_vec2_);
+      }
+      else if ( vec2_ == ROL::nullPtr ) {
+        dual_vec_ = ROL::makePtr<PDE_OptVector<Real>>(dual_vec1_);
+      }
+      else {
+        dual_vec_ = ROL::makePtr<PDE_OptVector<Real>>(dual_vec1_,dual_vec2_);
+      }
+      isDualInitialized_ = true;
+    }
+    if ( vec1_ != ROL::nullPtr ) {
+      dual_vec1_->set(vec1_->dual());
+    }
+    if ( vec2_ != ROL::nullPtr ) {
+      dual_vec2_->set(vec2_->dual());
+    }
+    return *dual_vec_;
+  }
+
+  Real apply(const ROL::Vector<Real> &x) const {
+    const PDE_OptVector<Real> &xs = dynamic_cast<const PDE_OptVector<Real>&>(x);
+    Real val(0);
+    if ( vec1_ != ROL::nullPtr ) val += vec1_->apply(*(xs.getField()));
+    if ( vec2_ != ROL::nullPtr ) val += vec2_->apply(*(xs.getParameter()));
+    return val;
+  }
+
+  ROL::Ptr<ROL::Vector<Real> > basis( const int i )  const {
+    ROL::Ptr<ROL::Vector<Real> > e;
+    if ( vec1_ != ROL::nullPtr && vec2_ != ROL::nullPtr ) {
+      int n1 = vec1_->dimension();
+      ROL::Ptr<ROL::Vector<Real> > e1, e2;
+      if ( i < n1 ) {
+        e1 = vec1_->basis(i);
+        e2 = vec2_->clone(); e2->zero();
+      }
+      else {
+        e1 = vec1_->clone(); e1->zero();
+        e2 = vec2_->basis(i-n1);
+      }
+      e = ROL::makePtr<PDE_OptVector>(
+        ROL::dynamicPtrCast<ROL::TpetraMultiVector<Real,LO,GO,Node> >(e1),
+        ROL::dynamicPtrCast<ROL::StdVector<Real> >(e2));
+    }
+    if ( vec1_ != ROL::nullPtr && vec2_ == ROL::nullPtr ) {
+      int n1 = vec1_->dimension();
+      ROL::Ptr<ROL::Vector<Real>> e1;
+      if ( i < n1 ) {
+        e1 = vec1_->basis(i);
+      }
+      else {
+        e1 = vec1_->clone();
+        e1->zero();
+      }
+      e = ROL::makePtr<PDE_OptVector>(
+        ROL::dynamicPtrCast<ROL::TpetraMultiVector<Real,LO,GO,Node> >(e1));
+    }
+    if ( vec1_ == ROL::nullPtr && vec2_ != ROL::nullPtr ) {
+      int n2 = vec2_->dimension();
+      ROL::Ptr<ROL::Vector<Real> > e2;
+      if ( i < n2 ) {
+        e2 = vec2_->basis(i);
+      }
+      else {
+        e2 = vec2_->clone();
+        e2->zero();
+      }
+      e = ROL::makePtr<PDE_OptVector>(
+        ROL::dynamicPtrCast<ROL::StdVector<Real> >(e2));
+    }
+    return e;
+  }
+
+  void applyUnary( const ROL::Elementwise::UnaryFunction<Real> &f ) {
+    if ( vec1_ != ROL::nullPtr ) {
+      vec1_->applyUnary(f);
+    }
+    if ( vec2_ != ROL::nullPtr ) {
+      vec2_->applyUnary(f);
+    }
+  }
+
+  void applyBinary( const ROL::Elementwise::BinaryFunction<Real> &f, const ROL::Vector<Real> &x ) {
+    const PDE_OptVector<Real> &xs = dynamic_cast<const PDE_OptVector<Real>&>(x);
+    if ( vec1_ != ROL::nullPtr ) {
+      vec1_->applyBinary(f,*xs.getField());
+    }
+    if ( vec2_ != ROL::nullPtr ) {
+      vec2_->applyBinary(f,*xs.getParameter());
+    }
+  }
+
+  Real reduce( const ROL::Elementwise::ReductionOp<Real> &r ) const {
+    Real result = r.initialValue();
+    if ( vec1_ != ROL::nullPtr ) {
+      r.reduce(vec1_->reduce(r),result);
+    }
+    if ( vec2_ != ROL::nullPtr ) {
+      r.reduce(vec2_->reduce(r),result);
+    }
+    return result;
+  }
+
+  int dimension() const {
+    int dim(0);
+    if ( vec1_ != ROL::nullPtr ) {
+      dim += vec1_->dimension();
+    }
+    if ( vec2_ != ROL::nullPtr ) {
+      dim += vec2_->dimension();
+    }
+    return dim;
+  }
+
+  void randomize(const Real l = 0.0, const Real u = 1.0) {
+    if (vec1_ != ROL::nullPtr) {
+      vec1_->randomize(l,u);
+    }
+    if (vec2_ != ROL::nullPtr) {
+      vec2_->randomize(l,u);
+    }
+  }
+
+  void print(std::ostream &outStream) const {
+    if (vec1_ != ROL::nullPtr) {
+      vec1_->print(outStream);
+    }
+    if (vec2_ != ROL::nullPtr) {
+      if (rank_ == 0) {
+        vec2_->print(outStream);
+      }
+    }
+  }
+
+  ROL::Ptr<const ROL::TpetraMultiVector<Real,LO,GO,Node>> getField(void) const { 
+    return vec1_;
+  }
+
+  ROL::Ptr<const ROL::StdVector<Real>> getParameter(void) const { 
+    return vec2_; 
+  }
+
+  ROL::Ptr<ROL::TpetraMultiVector<Real,LO,GO,Node>> getField(void) { 
+    return vec1_;
+  }
+
+  ROL::Ptr<ROL::StdVector<Real>> getParameter(void) { 
+    return vec2_; 
+  }
+
+  void setField(const ROL::Vector<Real>& vec) { 
+    vec1_->set(vec);
+  }
+  
+  void setParameter(const ROL::Vector<Real>& vec) { 
+    vec2_->set(vec); 
+  }
+}; // class PDE_OptVector
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/TOOLS/qoiK.hpp b/packages/rol/example/PDE-OPT/TOOLS/qoiK.hpp
new file mode 100644
index 000000000000..cca5b55b3620
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/TOOLS/qoiK.hpp
@@ -0,0 +1,203 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  qoi.hpp
+    \brief Provides the interface for local (cell-based) quantities of interest.
+*/
+
+#ifndef ROL_PDEOPT_QOIK_HPP
+#define ROL_PDEOPT_QOIK_HPP
+
+#include "pdeK.hpp"
+
+template <class Real, class DeviceType>
+class QoI {
+public:
+  
+  using scalar_view = Kokkos::DynRankView<Real,DeviceType>;
+
+  virtual ~QoI() {}
+
+  virtual Real value(scalar_view & val,
+                     const scalar_view u_coeff,
+                     const scalar_view z_coeff = scalar_view(),
+                     const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) = 0;
+
+  virtual void gradient_1(scalar_view & grad,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> gradient_1 not implemented.");
+  }
+
+  virtual void gradient_2(scalar_view & grad,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> gradient_2 not implemented.");
+  }
+
+  virtual std::vector<Real> gradient_3(std::vector<scalar_view > & grad,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> gradient_3 not implemented.");
+  }
+
+  virtual void HessVec_11(scalar_view & hess,
+                          const scalar_view v_coeff,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> HessVec_11 not implemented.");
+  }
+
+  virtual void HessVec_12(scalar_view & hess,
+                          const scalar_view v_coeff,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> HessVec_12 not implemented.");
+  }
+
+  virtual void HessVec_13(scalar_view & hess,
+                          const ROL::Ptr<const std::vector<Real> > & v_param,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> HessVec_13 not implemented.");
+  }
+
+  virtual void HessVec_21(scalar_view & hess,
+                          const scalar_view v_coeff,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> HessVec_21 not implemented.");
+  }
+
+  virtual void HessVec_22(scalar_view & hess,
+                          const scalar_view v_coeff,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> HessVec_22 not implemented.");
+  }
+
+  virtual void HessVec_23(scalar_view & hess,
+                          const ROL::Ptr<const std::vector<Real> > & v_param,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> HessVec_23 not implemented.");
+  }
+
+  virtual std::vector<Real> HessVec_31(std::vector<scalar_view > & hess,
+                          const scalar_view v_coeff,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> HessVec_31 not implemented.");
+  }
+
+  virtual std::vector<Real> HessVec_32(std::vector<scalar_view > & hess,
+                          const scalar_view v_coeff,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> HessVec_32 not implemented.");
+  }
+
+  virtual std::vector<Real> HessVec_33(std::vector<scalar_view > & hess,
+                          const ROL::Ptr<const std::vector<Real> > & v_param,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> HessVec_33 not implemented.");
+  }
+
+  virtual void Hessian_11(scalar_view & hess,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> Hessian_11 not implemented.");
+  }
+
+  virtual void Hessian_12(scalar_view & hess,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> Hessian_12 not implemented.");
+  }
+
+  virtual void Hessian_13(std::vector<scalar_view > & hess,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> Hessian_13 not implemented.");
+  }
+
+  virtual void Hessian_21(scalar_view & hess,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> Hessian_21 not implemented.");
+  }
+
+  virtual void Hessian_22(scalar_view & hess,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> Hessian_22 not implemented.");
+  }
+
+  virtual void Hessian_23(std::vector<scalar_view > & hess,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> Hessian_23 not implemented.");
+  }
+
+  virtual void Hessian_31(std::vector<scalar_view > & hess,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> Hessian_31 not implemented.");
+  }
+
+  virtual void Hessian_32(std::vector<scalar_view > & hess,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> Hessian_32 not implemented.");
+  }
+
+  virtual void Hessian_33(std::vector<std::vector<Real> > & hess,
+                          const scalar_view u_coeff,
+                          const scalar_view z_coeff = scalar_view(),
+                          const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    throw Exception::NotImplemented(">>> Hessian_33 not implemented.");
+  }
+
+private:
+  std::vector<Real> param_;
+
+protected:
+  std::vector<Real> getParameter(void) const {
+    return param_;
+  }
+
+public:
+  void setParameter(const std::vector<Real> &param) {
+    param_.assign(param.begin(),param.end());
+  }
+
+}; // QOI
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/TOOLS/solver.hpp b/packages/rol/example/PDE-OPT/TOOLS/solver.hpp
index 9a208993fc64..3aeeb0a4e021 100644
--- a/packages/rol/example/PDE-OPT/TOOLS/solver.hpp
+++ b/packages/rol/example/PDE-OPT/TOOLS/solver.hpp
@@ -15,7 +15,7 @@
 #ifndef ROL_PDEOPT_SOLVER_H
 #define ROL_PDEOPT_SOLVER_H
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Teuchos_TimeMonitor.hpp"
 
 #include "Tpetra_MultiVector.hpp"
diff --git a/packages/rol/example/PDE-OPT/TOOLS/sysbuilder.hpp b/packages/rol/example/PDE-OPT/TOOLS/sysbuilder.hpp
index d1183459fad1..c590d03d7391 100644
--- a/packages/rol/example/PDE-OPT/TOOLS/sysbuilder.hpp
+++ b/packages/rol/example/PDE-OPT/TOOLS/sysbuilder.hpp
@@ -14,7 +14,7 @@
 #ifndef ROL_PDEOPT_SYSBUILDER_H
 #define ROL_PDEOPT_SYSBUILDER_H
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Teuchos_TimeMonitor.hpp"
 
 #include "Tpetra_MultiVector.hpp"
diff --git a/packages/rol/example/PDE-OPT/adv-diff-react/CMakeLists.txt b/packages/rol/example/PDE-OPT/adv-diff-react/CMakeLists.txt
index 40c8628d66a7..a8eaa4fdec71 100644
--- a/packages/rol/example/PDE-OPT/adv-diff-react/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/adv-diff-react/CMakeLists.txt
@@ -14,7 +14,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   # Need ROL_TpetraTeuchosBatchManager.hpp
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
 
-  #TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  #ROL_ADD_EXECUTABLE_AND_TEST(
   #  example_01
   #  SOURCES example_01.cpp
   #  ARGS PrintItAll
@@ -23,7 +23,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   #  ADD_DIR_TO_NAME
   #)
 
-  #TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  #ROL_ADD_EXECUTABLE_AND_TEST(
   #  example_02
   #  SOURCES example_02.cpp
   #  ARGS PrintItAll
@@ -33,25 +33,25 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   #  ADD_DIR_TO_NAME
   #)
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_02
     SOURCES example_02.cpp
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_04
     SOURCES example_04.cpp
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_05
     SOURCES example_05.cpp
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  ROL_ADD_EXECUTABLE_AND_TEST(
     example_06
     SOURCES example_06.cpp
     ARGS PrintItAll
@@ -61,7 +61,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     StochAdvDiffDataCopy
     SOURCE_FILES
       input.xml input_ex04.xml input_ex05.xml input_ex06.xml plotresults.m plotresults_ex02.m
diff --git a/packages/rol/example/PDE-OPT/adv-diff-react/example_01.cpp b/packages/rol/example/PDE-OPT/adv-diff-react/example_01.cpp
index 84f4d63cf41a..0d3bca3a848b 100644
--- a/packages/rol/example/PDE-OPT/adv-diff-react/example_01.cpp
+++ b/packages/rol/example/PDE-OPT/adv-diff-react/example_01.cpp
@@ -14,8 +14,8 @@
 
 #include "Teuchos_Comm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
+#include "ROL_XMLReader.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -67,7 +67,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int>> comm
     = Tpetra::getDefaultComm();
   ROL::Ptr<const Teuchos::Comm<int>> serial_comm
diff --git a/packages/rol/example/PDE-OPT/adv-diff-react/example_02.cpp b/packages/rol/example/PDE-OPT/adv-diff-react/example_02.cpp
index d5fc85b076f4..bdd6bd29be70 100644
--- a/packages/rol/example/PDE-OPT/adv-diff-react/example_02.cpp
+++ b/packages/rol/example/PDE-OPT/adv-diff-react/example_02.cpp
@@ -13,8 +13,7 @@
 
 #include "Teuchos_Comm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -179,7 +178,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int>> comm
     = Tpetra::getDefaultComm();
   ROL::Ptr<const Teuchos::Comm<int>> serial_comm
diff --git a/packages/rol/example/PDE-OPT/adv-diff-react/example_03.cpp b/packages/rol/example/PDE-OPT/adv-diff-react/example_03.cpp
index 92ee143f7f7c..744958b918bc 100644
--- a/packages/rol/example/PDE-OPT/adv-diff-react/example_03.cpp
+++ b/packages/rol/example/PDE-OPT/adv-diff-react/example_03.cpp
@@ -13,8 +13,8 @@
 
 #include "Teuchos_Comm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
+#include "ROL_XMLReader.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -62,7 +62,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int>> comm
     = Tpetra::getDefaultComm();
   ROL::Ptr<const Teuchos::Comm<int>> serial_comm
diff --git a/packages/rol/example/PDE-OPT/adv-diff-react/example_04.cpp b/packages/rol/example/PDE-OPT/adv-diff-react/example_04.cpp
index ebe40c688279..77b69999a641 100644
--- a/packages/rol/example/PDE-OPT/adv-diff-react/example_04.cpp
+++ b/packages/rol/example/PDE-OPT/adv-diff-react/example_04.cpp
@@ -13,8 +13,7 @@
 
 #include "Teuchos_Comm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -124,7 +123,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int>> comm
     = Tpetra::getDefaultComm();
   ROL::Ptr<const Teuchos::Comm<int>> serial_comm
diff --git a/packages/rol/example/PDE-OPT/adv-diff-react/example_05.cpp b/packages/rol/example/PDE-OPT/adv-diff-react/example_05.cpp
index 0f2c3a8c26fe..d0a121b61189 100644
--- a/packages/rol/example/PDE-OPT/adv-diff-react/example_05.cpp
+++ b/packages/rol/example/PDE-OPT/adv-diff-react/example_05.cpp
@@ -13,8 +13,7 @@
 
 #include "Teuchos_Comm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -46,7 +45,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int>> comm
     = Tpetra::getDefaultComm();
   const int myRank = comm->getRank();
diff --git a/packages/rol/example/PDE-OPT/adv-diff-react/example_06.cpp b/packages/rol/example/PDE-OPT/adv-diff-react/example_06.cpp
index e0afb5206d89..a68060cae452 100644
--- a/packages/rol/example/PDE-OPT/adv-diff-react/example_06.cpp
+++ b/packages/rol/example/PDE-OPT/adv-diff-react/example_06.cpp
@@ -12,7 +12,7 @@
 */
 
 #include "Teuchos_Comm.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
 
@@ -47,7 +47,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int>> comm
     = Tpetra::getDefaultComm();
   ROL::Ptr<const Teuchos::Comm<int>> serial_comm
diff --git a/packages/rol/example/PDE-OPT/allen-cahn/CMakeLists.txt b/packages/rol/example/PDE-OPT/allen-cahn/CMakeLists.txt
index ec63e966c3a0..3617cab6a401 100644
--- a/packages/rol/example/PDE-OPT/allen-cahn/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/allen-cahn/CMakeLists.txt
@@ -13,19 +13,19 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   # Need ROL_TpetraTeuchosBatchManager.hpp
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_01
     SOURCES example_01.cpp
     ADD_DIR_TO_NAME
   )
 
-#  TRIBITS_ADD_EXECUTABLE(
+#  ROL_ADD_EXECUTABLE(
 #    example_02
 #    SOURCES example_02.cpp
 #    ADD_DIR_TO_NAME
 #  )
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     AllenCahnDataCopy
     SOURCE_FILES
       input.xml plotresults.m
@@ -33,5 +33,40 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
   )
 
+ENDIF()
+
+IF(${PROJECT_NAME}_ENABLE_Intrepid2 AND
+   ${PROJECT_NAME}_ENABLE_Ifpack2   AND
+   ${PROJECT_NAME}_ENABLE_MueLu     AND
+   ${PROJECT_NAME}_ENABLE_Amesos2   AND
+   ${PROJECT_NAME}_ENABLE_Tpetra )
+
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/teuchos/src/sol)
+
+  # Need ROL_TpetraMultiVector.hpp
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
+
+  # Need ROL_TpetraTeuchosBatchManager.hpp
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
+
+  ROL_ADD_EXECUTABLE(
+    example_01_Kokkos
+    SOURCES example_01K.cpp
+    ADD_DIR_TO_NAME
+  )
+
+#  ROL_ADD_EXECUTABLE(
+#    example_02
+#    SOURCES example_02.cpp
+#    ADD_DIR_TO_NAME
+#  )
+
+  ROL_COPY_FILES_TO_BINARY_DIR(
+    AllenCahnDataCopyK
+    SOURCE_FILES
+      input.xml plotresults.m
+    SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}"
+    DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
+  )
 
 ENDIF()
diff --git a/packages/rol/example/PDE-OPT/allen-cahn/example_01.cpp b/packages/rol/example/PDE-OPT/allen-cahn/example_01.cpp
index b1075e2a2d04..56e3123301db 100644
--- a/packages/rol/example/PDE-OPT/allen-cahn/example_01.cpp
+++ b/packages/rol/example/PDE-OPT/allen-cahn/example_01.cpp
@@ -12,8 +12,7 @@
 */
 
 #include "Teuchos_Comm.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -43,7 +42,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int> > comm
     = Tpetra::getDefaultComm();
   const int myRank = comm->getRank();
diff --git a/packages/rol/example/PDE-OPT/allen-cahn/example_01K.cpp b/packages/rol/example/PDE-OPT/allen-cahn/example_01K.cpp
new file mode 100644
index 000000000000..8a8e34a07f8a
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/allen-cahn/example_01K.cpp
@@ -0,0 +1,214 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  example_01.cpp
+    \brief Shows how to solve the Poisson-Boltzmann problem.
+*/
+
+#include "Teuchos_Comm.hpp"
+#include "ROL_GlobalMPISession.hpp"
+
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Version.hpp"
+
+#include <iostream>
+#include <algorithm>
+
+#include "ROL_Stream.hpp"
+#include "ROL_Solver.hpp"
+#include "ROL_Reduced_Objective_SimOpt.hpp"
+#include "ROL_BoundConstraint_SimOpt.hpp"
+#include "ROL_Bounds.hpp"
+
+#include "../TOOLS/meshmanagerK.hpp"
+#include "../TOOLS/pdeconstraintK.hpp"
+#include "../TOOLS/pdeobjectiveK.hpp"
+#include "../TOOLS/pdevectorK.hpp"
+#include "pde_allen_cahnK.hpp"
+#include "obj_allen_cahnK.hpp"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+
+int main(int argc, char *argv[]) {
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+
+  /*** Initialize communicator. ***/
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  Kokkos::ScopeGuard kokkosScope (argc, argv);
+  ROL::Ptr<const Teuchos::Comm<int> > comm
+    = Tpetra::getDefaultComm();
+  const int myRank = comm->getRank();
+  if ((iprint > 0) && (myRank == 0))
+    outStream = ROL::makePtrFromRef(std::cout);
+  else
+    outStream = ROL::makePtrFromRef(bhs);
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+
+    /*** Read in XML input ***/
+    std::string filename = "input.xml";
+    auto parlist = ROL::getParametersFromXmlFile(filename);
+
+    /*** Initialize main data structure. ***/
+    auto meshMgr = ROL::makePtr<MeshManager_Rectangle<RealT,DeviceT>>(*parlist);
+    // Initialize PDE describe Poisson's equation
+    auto pde = ROL::makePtr<PDE_Allen_Cahn<RealT,DeviceT>>(*parlist);
+    auto con = ROL::makePtr<PDE_Constraint<RealT,DeviceT>>(pde,meshMgr,comm,*parlist,*outStream);
+    auto assembler = con->getAssembler();
+    assembler->printMeshData(*outStream);
+    con->setSolveParameters(*parlist);
+
+    /*************************************************************************/
+    /***************** BUILD VECTORS *****************************************/
+    /*************************************************************************/
+    auto u_ptr = assembler->createStateVector();
+    auto p_ptr = assembler->createStateVector();
+    auto r_ptr = assembler->createResidualVector();
+    auto z_ptr = assembler->createControlVector();
+    u_ptr->randomize();  //u_ptr->putScalar(static_cast<RealT>(1));
+    p_ptr->randomize();  //p_ptr->putScalar(static_cast<RealT>(1));
+    r_ptr->randomize();  //r_ptr->putScalar(static_cast<RealT>(1));
+    z_ptr->randomize();  //z_ptr->putScalar(static_cast<RealT>(1));
+    auto up = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(u_ptr,pde,assembler,*parlist);
+    auto pp = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(p_ptr,pde,assembler,*parlist);
+    auto rp = ROL::makePtr<PDE_DualSimVector<RealT,DeviceT>>(r_ptr,pde,assembler,*parlist);
+    auto zp = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(z_ptr,pde,assembler,*parlist);
+
+    /*************************************************************************/
+    /***************** BUILD COST FUNCTIONAL *********************************/
+    /*************************************************************************/
+    // Initialize quadratic objective function
+    std::vector<ROL::Ptr<QoI<RealT,DeviceT>>> qoi_vec(2,ROL::nullPtr);
+    qoi_vec[0] = ROL::makePtr<QoI_State_Cost_Allen_Cahn<RealT,DeviceT>>(pde->getFE());
+    qoi_vec[1] = ROL::makePtr<QoI_Control_Cost_Allen_Cahn<RealT,DeviceT>>(pde->getFE(),
+                                                                  pde->getBdryFE(),
+                                                                  pde->getBdryCellLocIds());
+    std::vector<RealT> weights(2);
+    weights[0] = static_cast<RealT>(1);
+    weights[1] = parlist->sublist("Problem").get("Control Penalty Parameter", 1e-4);
+    // Build full-space objective
+    auto obj = ROL::makePtr<PDE_Objective<RealT,DeviceT>>(qoi_vec,weights,assembler);
+    // Build reduced-space objective
+    bool storage = parlist->sublist("Problem").get("Use state storage",true);
+    auto stateStore = ROL::makePtr<ROL::VectorController<RealT>>();
+    auto robj = ROL::makePtr<ROL::Reduced_Objective_SimOpt<RealT>>(obj,con,
+                                                                stateStore,
+                                                                up,zp,pp,
+                                                                storage);
+
+    /*************************************************************************/
+    /***************** BUILD BOUND CONSTRAINT ********************************/
+    /*************************************************************************/
+    // Control bounds
+    auto zlo_ptr = assembler->createControlVector();
+    auto zhi_ptr = assembler->createControlVector();
+    RealT lo = parlist->sublist("Problem").get("Lower Bound",0.0);
+    RealT hi = parlist->sublist("Problem").get("Upper Bound",1.0);
+    zlo_ptr->putScalar(lo); zhi_ptr->putScalar(hi);
+    auto zlop = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(zlo_ptr,pde,assembler);
+    auto zhip = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(zhi_ptr,pde,assembler);
+    auto zbnd = ROL::makePtr<ROL::Bounds<RealT>>(zlop,zhip);
+    bool deactivate = parlist->sublist("Problem").get("Deactivate Bound Constraints",false);
+    if (deactivate) zbnd->deactivate();
+    // State bounds
+    auto ulo_ptr = assembler->createStateVector();
+    auto uhi_ptr = assembler->createStateVector();
+    ulo_ptr->putScalar(ROL::ROL_NINF<RealT>()); uhi_ptr->putScalar(ROL::ROL_INF<RealT>());
+    auto ulop = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(ulo_ptr,pde,assembler);
+    auto uhip = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(uhi_ptr,pde,assembler);
+    auto ubnd = ROL::makePtr<ROL::Bounds<RealT>>(ulop,uhip);
+    ubnd->deactivate();
+
+    // SimOpt bounds
+    auto bnd = ROL::makePtr<ROL::BoundConstraint_SimOpt<RealT>>(ubnd,zbnd);
+
+    // Create ROL SimOpt vectors
+    auto du_ptr = assembler->createStateVector();
+    auto dz_ptr = assembler->createControlVector();
+    du_ptr->randomize(); //du_ptr->putScalar(static_cast<RealT>(0));
+    dz_ptr->randomize(); //dz_ptr->putScalar(static_cast<RealT>(1));
+    auto dup = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(du_ptr,pde,assembler,*parlist);
+    auto dzp = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(dz_ptr,pde,assembler,*parlist);
+
+    ROL::Vector_SimOpt<RealT> x(up,zp);
+    ROL::Vector_SimOpt<RealT> d(dup,dzp);
+
+    /*************************************************************************/
+    /***************** RUN VECTOR AND DERIVATIVE CHECKS **********************/
+    /*************************************************************************/
+    bool checkDeriv = parlist->sublist("Problem").get("Check Derivatives",false);
+    if ( checkDeriv ) {
+      *outStream << "\n\nCheck Gradient of Full Objective Function\n";
+      obj->checkGradient(x,d,true,*outStream);
+      *outStream << "\n\nCheck Hessian of Full Objective Function\n";
+      obj->checkHessVec(x,d,true,*outStream);
+      *outStream << "\n\nCheck Full Jacobian of PDE Constraint\n";
+      con->checkApplyJacobian(x,d,*rp,true,*outStream);
+      *outStream << "\n\nCheck Jacobian_1 of PDE Constraint\n";
+      con->checkApplyJacobian_1(*up,*zp,*dup,*rp,true,*outStream);
+      *outStream << "\n\nCheck Jacobian_2 of PDE Constraint\n";
+      con->checkApplyJacobian_2(*up,*zp,*dzp,*rp,true,*outStream);
+      *outStream << "\n\nCheck Full Hessian of PDE Constraint\n";
+      con->checkApplyAdjointHessian(x,*pp,d,x,true,*outStream);
+      *outStream << "\n";
+      con->checkAdjointConsistencyJacobian(*dup,d,x,true,*outStream);
+      *outStream << "\n";
+      con->checkInverseJacobian_1(*rp,*dup,*up,*zp,true,*outStream);
+      *outStream << "\n";
+      *outStream << "\n\nCheck Gradient of Reduced Objective Function\n";
+      robj->checkGradient(*zp,*dzp,true,*outStream);
+      *outStream << "\n\nCheck Hessian of Reduced Objective Function\n";
+      robj->checkHessVec(*zp,*dzp,true,*outStream);
+    }
+
+    bool useFullSpace = parlist->sublist("Problem").get("Full space",false);
+    ROL::Ptr<ROL::Problem<RealT>> optProb;
+    if ( useFullSpace ) {
+      optProb = ROL::makePtr<ROL::Problem<RealT>>(obj, makePtrFromRef(x));
+      optProb->addBoundConstraint(bnd);
+      optProb->addConstraint("PDE", con, pp);
+    }
+    else {
+      optProb = ROL::makePtr<ROL::Problem<RealT>>(robj, zp);
+      optProb->addBoundConstraint(zbnd);
+    }
+    optProb->finalize(false,true,*outStream);
+    ROL::Solver<RealT> optSolver(optProb, *parlist);
+    optSolver.solve(*outStream);
+
+    // Output.
+    RealT tol(1.e-8);
+    con->solve(*rp,*up,*zp,tol);
+    con->outputTpetraVector(u_ptr,"state.txt");
+    con->outputTpetraVector(z_ptr,"control.txt");
+
+    Teuchos::Array<RealT> res(1,0);
+    con->value(*rp,*up,*zp,tol);
+    r_ptr->norm2(res.view(0,1));
+    *outStream << "Residual Norm: " << res[0] << std::endl;
+    errorFlag += (res[0] > 1.e-6 ? 1 : 0);
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/allen-cahn/example_02.cpp b/packages/rol/example/PDE-OPT/allen-cahn/example_02.cpp
index 6af965515025..62c90dabbc84 100644
--- a/packages/rol/example/PDE-OPT/allen-cahn/example_02.cpp
+++ b/packages/rol/example/PDE-OPT/allen-cahn/example_02.cpp
@@ -12,8 +12,8 @@
 */
 
 #include "Teuchos_Comm.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
+#include "ROL_XMLReader.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -61,7 +61,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int> > comm
     = Tpetra::getDefaultComm();
   ROL::Ptr<const Teuchos::Comm<int> > serial_comm
diff --git a/packages/rol/example/PDE-OPT/allen-cahn/obj_allen_cahnK.hpp b/packages/rol/example/PDE-OPT/allen-cahn/obj_allen_cahnK.hpp
new file mode 100644
index 000000000000..fc829d84c4ec
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/allen-cahn/obj_allen_cahnK.hpp
@@ -0,0 +1,312 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  obj.hpp
+    \brief Provides the interface for local (cell-based) objective function computations.
+*/
+
+#ifndef PDEOPT_QOI_L2TRACKING_ALLEN_CAHNK_HPP
+#define PDEOPT_QOI_L2TRACKING_ALLEN_CAHNK_HPP
+
+#include "../TOOLS/qoiK.hpp"
+#include "pde_allen_cahnK.hpp"
+
+template <class Real, class DeviceType>
+class QoI_State_Cost_Allen_Cahn : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  ROL::Ptr<fe_type> fe_;
+  scalar_view target_;
+
+  Real targetFunc(const std::vector<Real> & x) const {
+    Real val(0);
+    for (auto xi : x) val += xi*xi;
+    return val;
+  }
+
+public:
+  QoI_State_Cost_Allen_Cahn(const ROL::Ptr<fe_type> &fe) : fe_(fe) {
+    int c = fe_->cubPts().extent_int(0);
+    int p = fe_->cubPts().extent_int(1);
+    int d = fe_->cubPts().extent_int(2);
+    std::vector<Real> pt(d);
+    target_ = scalar_view("target_",c,p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k)
+          pt[k] = (fe_->cubPts())(i,j,k);
+        target_(i,j) = targetFunc(pt);
+      }
+    }
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = u_coeff.extent_int(0);
+    int p = fe_->cubPts().extent_int(1);
+    // Initialize output val
+    val = scalar_view("val", c);
+    // Evaluate state on FE basis
+    scalar_view valU_eval("valU_eval", c, p);
+    fe_->evaluateValue(valU_eval, u_coeff);
+    // Compute difference between state and target
+    rst::subtract(valU_eval,target_);
+    // Compute squared L2-norm of diff
+    fe_->computeIntegral(val,valU_eval,valU_eval);
+    // Scale by one half
+    rst::scale(val,static_cast<Real>(0.5));
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    // Get relevant dimensions
+    int c = u_coeff.extent_int(0);
+    int p = fe_->cubPts().extent_int(1);
+    int f = fe_->N().extent_int(1);
+    // Initialize output grad
+    grad = scalar_view("grad", c, f);
+    // Evaluate state on FE basis
+    scalar_view valU_eval("valU_eval", c, p);
+    fe_->evaluateValue(valU_eval, u_coeff);
+    // Compute difference between state and target
+    rst::subtract(valU_eval,target_);
+    // Compute gradient of squared L2-norm of diff
+    fst::integrate(grad,valU_eval,fe_->NdetJ(),false);
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_State_Cost_Allen_Cahn::gradient_2 is zero.");
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    int c = v_coeff.extent_int(0);
+    int p = fe_->cubPts().extent_int(1);
+    int f = fe_->N().extent_int(1);
+    hess = scalar_view("hess", c, f);
+    scalar_view valV_eval("valV_eval", c, p);
+    fe_->evaluateValue(valV_eval, v_coeff);
+    fst::integrate(hess,valV_eval,fe_->NdetJ(),false);
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_State_Cost_Allen_Cahn::HessVec_12 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_State_Cost_Allen_Cahn::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_State_Cost_Allen_Cahn::HessVec_22 is zero.");
+  }
+
+}; // QoI_L2Tracking
+
+template <class Real, class DeviceType>
+class QoI_Control_Cost_Allen_Cahn : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type> fe_vol_;
+  const std::vector<std::vector<ROL::Ptr<fe_type>>> fe_bdry_;
+  const std::vector<std::vector<std::vector<int>>> bdryCellLocIds_;
+
+  scalar_view getBoundaryCoeff(const scalar_view cell_coeff, const int sideset, const int locSideId) const {
+    std::vector<int> bdryCellLocId = bdryCellLocIds_[sideset][locSideId];
+    const int numCellsSide = bdryCellLocId.size();
+    const int f = fe_vol_->N().extent_int(1);
+    
+    scalar_view bdry_coeff("bdry_coeff", numCellsSide, f);
+    for (int i = 0; i < numCellsSide; ++i) {
+      for (int j = 0; j < f; ++j)
+        bdry_coeff(i, j) = cell_coeff(bdryCellLocId[i], j);
+    }
+    return bdry_coeff;
+  }
+
+public:
+  QoI_Control_Cost_Allen_Cahn(const ROL::Ptr<fe_type> &fe_vol,
+                  const std::vector<std::vector<ROL::Ptr<fe_type>>> &fe_bdry,
+                  const std::vector<std::vector<std::vector<int>>> &bdryCellLocIds)
+    : fe_vol_(fe_vol), fe_bdry_(fe_bdry), bdryCellLocIds_(bdryCellLocIds) {}
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    const int c = fe_vol_->gradN().extent_int(0);
+    // Initialize output val
+    val = scalar_view("val", c);
+    // Compute cost integral
+    const int numSideSets = bdryCellLocIds_.size();
+    for (int i = 0; i < numSideSets; ++i) {
+      const int numLocSides = bdryCellLocIds_[i].size();
+      for (int j = 0; j < numLocSides; ++j) {
+        const int numCellsSide  = bdryCellLocIds_[i][j].size();
+        if ( numCellsSide ) {
+          const int numCubPerSide = fe_bdry_[i][j]->cubPts().extent_int(1);
+          // Evaluate control on FE basis
+          scalar_view z_coeff_bdry = getBoundaryCoeff(z_coeff, i, j);
+          scalar_view valZ_eval("valZ_eval", numCellsSide, numCubPerSide);
+          fe_bdry_[i][j]->evaluateValue(valZ_eval, z_coeff_bdry);
+          // Integrate cell cost
+          scalar_view intVal("intVal", numCellsSide);
+          fe_bdry_[i][j]->computeIntegral(intVal,valZ_eval,valZ_eval);
+          // Add to integral value
+          for (int k = 0; k < numCellsSide; ++k) {
+            int cidx = bdryCellLocIds_[i][j][k];
+            val(cidx) += static_cast<Real>(0.5)*intVal(k);
+          }
+        }
+      }
+    }
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Control_Cost_Allen_Cahn::gradient_1 is zero.");
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    const int c = fe_vol_->gradN().extent_int(0);
+    const int f = fe_vol_->gradN().extent_int(1);
+    // Initialize output val
+    grad = scalar_view("grad", c, f);
+    // Compute cost integral
+    const int numSideSets = bdryCellLocIds_.size();
+    for (int i = 0; i < numSideSets; ++i) {
+      const int numLocSides = bdryCellLocIds_[i].size();
+      for (int j = 0; j < numLocSides; ++j) {
+        const int numCellsSide  = bdryCellLocIds_[i][j].size();
+        if ( numCellsSide ) {
+          const int numCubPerSide = fe_bdry_[i][j]->cubPts().extent_int(1);
+          // Evaluate control on FE basis
+          scalar_view z_coeff_bdry = getBoundaryCoeff(z_coeff, i, j);
+          scalar_view valZ_eval("valZ_eval", numCellsSide, numCubPerSide);
+          fe_bdry_[i][j]->evaluateValue(valZ_eval, z_coeff_bdry);
+          // Compute gradient of squared L2-norm of diff
+          scalar_view intGrad("intGrad", numCellsSide, f);
+          fst::integrate(intGrad,valZ_eval,fe_bdry_[i][j]->NdetJ(),false);
+          // Add to integral value
+          for (int k = 0; k < numCellsSide; ++k) {
+            int cidx = bdryCellLocIds_[i][j][k];
+            for (int l = 0; l < f; ++l)
+              grad(cidx,l) += intGrad(k,l);
+          }
+        }
+      }
+    }
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Control_Cost_Allen_Cahn::HessVec_11 is zero.");
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Control_Cost_Allen_Cahn::HessVec_12 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Control_Cost_Allen_Cahn::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    const int c = fe_vol_->gradN().extent_int(0);
+    const int f = fe_vol_->gradN().extent_int(1);
+    // Initialize output val
+    hess = scalar_view("hess", c, f);
+    // Compute cost integral
+    const int numSideSets = bdryCellLocIds_.size();
+    for (int i = 0; i < numSideSets; ++i) {
+      const int numLocSides = bdryCellLocIds_[i].size();
+      for (int j = 0; j < numLocSides; ++j) {
+        const int numCellsSide  = bdryCellLocIds_[i][j].size();
+        if ( numCellsSide ) {
+          const int numCubPerSide = fe_bdry_[i][j]->cubPts().extent_int(1);
+          // Evaluate direction on FE basis
+          scalar_view v_coeff_bdry = getBoundaryCoeff(v_coeff, i, j);
+          scalar_view valV_eval("valV_eval", numCellsSide, numCubPerSide);
+          fe_bdry_[i][j]->evaluateValue(valV_eval, v_coeff_bdry);
+          // Compute hessian times a vector of cost
+          scalar_view intHess("intHess", numCellsSide, f);
+          fst::integrate(intHess,valV_eval,fe_bdry_[i][j]->NdetJ(),false);
+          // Add to integral value
+          for (int k = 0; k < numCellsSide; ++k) {
+            int cidx = bdryCellLocIds_[i][j][k];
+            for (int l = 0; l < f; ++l)
+              hess(cidx,l) += intHess(k,l);
+          }
+        }
+      }
+    }
+  }
+
+}; // QoI_L2Penalty
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/allen-cahn/pde_allen_cahnK.hpp b/packages/rol/example/PDE-OPT/allen-cahn/pde_allen_cahnK.hpp
new file mode 100644
index 000000000000..747ba192799b
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/allen-cahn/pde_allen_cahnK.hpp
@@ -0,0 +1,477 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  pde.hpp
+    \brief Implements the local PDE interface for the Poisson-Boltzmann control problem.
+*/
+
+#ifndef PDE_ALLEN_CAHNK_HPP
+#define PDE_ALLEN_CAHNK_HPP
+
+#include "../TOOLS/pdeK.hpp"
+#include "../TOOLS/feK.hpp"
+
+#include "Intrepid2_HGRAD_QUAD_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_QUAD_C2_FEM.hpp"
+#include "Intrepid2_DefaultCubatureFactory.hpp"
+#include "Intrepid2_FunctionSpaceTools.hpp"
+#include "Intrepid2_CellTools.hpp"
+
+#include "ROL_Ptr.hpp"
+
+template <class Real, class DeviceType>
+class PDE_Allen_Cahn : public PDE<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  // Finite element basis information
+  basis_ptr basisPtr_;
+  std::vector<basis_ptr> basisPtrs_;
+  // Cell cubature information
+  ROL::Ptr<Intrepid2::Cubature<DeviceType,Real,Real>> cellCub_, bdryCub_;
+  // Cell node information
+  scalar_view volCellNodes_;
+  std::vector<std::vector<scalar_view>> bdryCellNodes_;
+  std::vector<std::vector<std::vector<int>>> bdryCellLocIds_;
+  // Finite element definition
+  ROL::Ptr<fe_type> fe_vol_;
+  std::vector<std::vector<ROL::Ptr<fe_type>>> fe_bdry_;
+  // Local degrees of freedom on boundary, for each side of the reference cell (first index).
+  std::vector<std::vector<int>> fidx_;
+
+  Real uScale_, vScale_;
+  Real robinCoeff_;
+
+public:
+  PDE_Allen_Cahn(ROL::ParameterList &parlist) {
+    // Finite element fields.
+    int basisOrder = parlist.sublist("Problem").get("Order of FE discretization",1);
+    if (basisOrder == 1)
+      basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C1_FEM<DeviceType,Real,Real>>();
+    else if (basisOrder == 2)
+      basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C2_FEM<DeviceType,Real,Real>>();
+    basisPtrs_.clear(); basisPtrs_.push_back(basisPtr_);
+    // Quadrature rules.
+    shards::CellTopology cellType = basisPtr_->getBaseCellTopology();                  // get the cell type from any basis
+    Intrepid2::DefaultCubatureFactory cubFactory;                                      // create cubature factory
+    int cubDegree = parlist.sublist("PDE Poisson Boltzmann").get("Cubature Degree",2); // set cubature degree, e.g., 2
+    cellCub_ = cubFactory.create<DeviceType,Real,Real>(cellType, cubDegree);           // create default cubature
+
+    int d = cellType.getDimension();
+    shards::CellTopology bdryCellType = cellType.getCellTopologyData(d-1, 0);
+    int bdryCubDegree = parlist.sublist("Problem").get("Boundary Cubature Degree",2); // set cubature degree, e.g., 2
+    bdryCub_ = cubFactory.create<DeviceType,Real,Real>(bdryCellType, bdryCubDegree);
+
+    uScale_     = parlist.sublist("Problem").get("Scale Third Order Term",1.0);
+    vScale_     = parlist.sublist("Problem").get("Scale First Order Term",1.0);
+    robinCoeff_ = parlist.sublist("Problem").get("Robin Coefficient",1e0);
+  }
+
+  void residual(scalar_view & res,
+                const scalar_view u_coeff,
+                const scalar_view z_coeff = scalar_view(),
+                const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // GET DIMENSIONS
+    const int c = fe_vol_->gradN().extent_int(0);
+    const int f = fe_vol_->gradN().extent_int(1);
+    const int p = fe_vol_->gradN().extent_int(2);
+    const int d = fe_vol_->gradN().extent_int(3);
+    // INITIALIZE RESIDUAL
+    res = scalar_view("res", c, f);
+    // STORAGE
+    scalar_view valU_eval("valU_eval", c, p);
+    scalar_view gradU_eval("gradU_eval", c, p, d);
+    scalar_view lambda("lambda", c, p);
+    scalar_view lambda_gradU_eval("lambda_gradU_eval", c, p, d);
+    scalar_view phi_valU_eval("phi_valU_eval", c, p);
+    // COMPUTE PDE COEFFICIENTS
+    computeCoefficients(lambda);
+    // EVALUATE STATE AT QUADRATURE POINTS
+    fe_vol_->evaluateValue(valU_eval, u_coeff);
+    fe_vol_->evaluateGradient(gradU_eval, u_coeff);
+    // ADD STIFFNESS TERM TO RESIDUAL
+    fst::tensorMultiplyDataData(lambda_gradU_eval,lambda,gradU_eval);
+    fst::integrate(res,lambda_gradU_eval,fe_vol_->gradNdetJ(),false);
+    // ADD NONLINEAR TERM TO RESIDUAL
+    computeNonlinearity(phi_valU_eval, valU_eval,0);
+    fst::integrate(res,phi_valU_eval,fe_vol_->NdetJ(),true);
+    // APPLY ROBIN CONDITIONS
+    std::vector<std::vector<scalar_view>> bdryCellDofValues;
+    const int numSideSets = bdryCellLocIds_.size();
+    for (int i = 0; i < numSideSets; ++i) {
+      int numLocalSideIds = bdryCellLocIds_[i].size();
+      for (int j = 0; j < numLocalSideIds; ++j) {
+        int numCellsSide = bdryCellLocIds_[i][j].size();
+        const int numCubPerSide = bdryCub_->getNumPoints();
+        if (numCellsSide) {
+          // Get U and Z coefficients on Robin boundary
+          scalar_view u_coeff_bdry = getBoundaryCoeff(u_coeff, i, j);
+          scalar_view z_coeff_bdry = getBoundaryCoeff(z_coeff, i, j);
+          // Evaluate U and Z on FE basis
+          scalar_view valU_eval_bdry("valU_eval_bdry", numCellsSide, numCubPerSide);
+          scalar_view valZ_eval_bdry("valZ_eval_bdry", numCellsSide, numCubPerSide);
+          fe_bdry_[i][j]->evaluateValue(valU_eval_bdry, u_coeff_bdry);
+          fe_bdry_[i][j]->evaluateValue(valZ_eval_bdry, z_coeff_bdry);
+          // Compute Robin residual
+          scalar_view robinRes("robinRes", numCellsSide, f);
+          scalar_view robinVal("robinVal", numCellsSide, numCubPerSide);
+          computeRobin(robinVal,valU_eval_bdry,valZ_eval_bdry,i,j,0);
+          fst::integrate(robinRes,robinVal,fe_bdry_[i][j]->NdetJ(),false);
+          // Add Robin residual to volume residual
+          for (int k = 0; k < numCellsSide; ++k) {
+            int cidx = bdryCellLocIds_[i][j][k];
+            for (int l = 0; l < f; ++l)
+              res(cidx,l) += robinRes(k,l);
+          }
+        }
+      }
+    }
+  }
+
+  void Jacobian_1(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // GET DIMENSIONS
+    const int c = fe_vol_->gradN().extent_int(0);
+    const int f = fe_vol_->gradN().extent_int(1);
+    const int p = fe_vol_->gradN().extent_int(2);
+    const int d = fe_vol_->gradN().extent_int(3);
+    // INITIALIZE JACOBIAN
+    jac = scalar_view("jac", c, f, f);
+    // STORAGE
+    scalar_view valU_eval("valU_eval", c, p);
+    scalar_view lambda("lambda", c, p);
+    scalar_view lambda_gradN_eval("lambda_gradN_eval", c, f, p, d);
+    scalar_view dphi_valU_eval("dphi_valU_eval", c, p);
+    scalar_view NdphiU("NdphiU", c, f, p);
+    // COMPUTE PDE COEFFICIENTS
+    computeCoefficients(lambda);
+    // EVALUATE STATE AT QUADRATURE POINTS
+    fe_vol_->evaluateValue(valU_eval, u_coeff);
+    // ADD STIFFNESS TERM TO JACOBIAN
+    fst::tensorMultiplyDataField(lambda_gradN_eval,lambda,fe_vol_->gradN());
+    fst::integrate(jac,lambda_gradN_eval,fe_vol_->gradNdetJ(),false);
+    // ADD NONLINEAR TERM
+    computeNonlinearity(dphi_valU_eval, valU_eval, 1);
+    fst::scalarMultiplyDataField(NdphiU,dphi_valU_eval,fe_vol_->N());
+    fst::integrate(jac,NdphiU,fe_vol_->NdetJ(),true);
+    // APPLY ROBIN CONDITIONS
+    const int numSideSets = bdryCellLocIds_.size();
+    for (int i = 0; i < numSideSets; ++i) {
+      int numLocalSideIds = bdryCellLocIds_[i].size();
+      for (int j = 0; j < numLocalSideIds; ++j) {
+        int numCellsSide = bdryCellLocIds_[i][j].size();
+        const int numCubPerSide = bdryCub_->getNumPoints();
+        if (numCellsSide) {
+          // Get U and Z coefficients on Robin boundary
+          scalar_view u_coeff_bdry = getBoundaryCoeff(u_coeff, i, j);
+          scalar_view z_coeff_bdry = getBoundaryCoeff(z_coeff, i, j);
+          // Evaluate U and Z on FE basis
+          scalar_view valU_eval_bdry("valU_eval_bdry", numCellsSide, numCubPerSide);
+          scalar_view valZ_eval_bdry("valZ_eval_bdry", numCellsSide, numCubPerSide);
+          fe_bdry_[i][j]->evaluateValue(valU_eval_bdry, u_coeff_bdry);
+          fe_bdry_[i][j]->evaluateValue(valZ_eval_bdry, z_coeff_bdry);
+          // Compute Robin residual
+          scalar_view robinVal_N("robinVal_N", numCellsSide, f, numCubPerSide);
+          scalar_view robinJac("robinJac", numCellsSide, f, f);
+          scalar_view robinVal("robinVal", numCellsSide, numCubPerSide);
+          computeRobin(robinVal,valU_eval_bdry,valZ_eval_bdry,i,j,1,1);
+          fst::scalarMultiplyDataField(robinVal_N,robinVal,fe_bdry_[i][j]->N());
+          fst::integrate(robinJac,robinVal_N,fe_bdry_[i][j]->NdetJ(),false);
+          // Add Robin residual to volume residual
+          for (int k = 0; k < numCellsSide; ++k) {
+            int cidx = bdryCellLocIds_[i][j][k];
+            for (int l = 0; l < f; ++l) { 
+              for (int m = 0; m < f; ++m)
+                jac(cidx,l,m) += robinJac(k,l,m);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  void Jacobian_2(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // GET DIMENSIONS
+    const int c = fe_vol_->gradN().extent_int(0);
+    const int f = fe_vol_->gradN().extent_int(1);
+    // INITIALIZE JACOBIAN
+    jac = scalar_view("jac", c, f, f);
+    // APPLY ROBIN CONDITIONS
+    const int numSideSets = bdryCellLocIds_.size();
+    for (int i = 0; i < numSideSets; ++i) {
+      int numLocalSideIds = bdryCellLocIds_[i].size();
+      for (int j = 0; j < numLocalSideIds; ++j) {
+        int numCellsSide = bdryCellLocIds_[i][j].size();
+        const int numCubPerSide = bdryCub_->getNumPoints();
+        if (numCellsSide) {
+          // Get U and Z coefficients on Robin boundary
+          scalar_view u_coeff_bdry = getBoundaryCoeff(u_coeff, i, j);
+          scalar_view z_coeff_bdry = getBoundaryCoeff(z_coeff, i, j);
+          // Evaluate U and Z on FE basis
+          scalar_view valU_eval_bdry("valU_eval_bdry", numCellsSide, numCubPerSide);
+          scalar_view valZ_eval_bdry("valZ_eval_bdry", numCellsSide, numCubPerSide);
+          fe_bdry_[i][j]->evaluateValue(valU_eval_bdry, u_coeff_bdry);
+          fe_bdry_[i][j]->evaluateValue(valZ_eval_bdry, z_coeff_bdry);
+          // Compute Robin residual
+          scalar_view robinVal_N("robinVal_N", numCellsSide, f, numCubPerSide);
+          scalar_view robinJac("robinJac", numCellsSide, f, f);
+          scalar_view robinVal("robinVal", numCellsSide, numCubPerSide);
+          computeRobin(robinVal,valU_eval_bdry,valZ_eval_bdry,i,j,1,2);
+          fst::scalarMultiplyDataField(robinVal_N,robinVal,fe_bdry_[i][j]->N());
+          fst::integrate(robinJac,robinVal_N,fe_bdry_[i][j]->NdetJ(),false);
+          // Add Robin residual to volume residual
+          for (int k = 0; k < numCellsSide; ++k) {
+            int cidx = bdryCellLocIds_[i][j][k];
+            for (int l = 0; l < f; ++l) { 
+              for (int m = 0; m < f; ++m)
+                jac(cidx,l,m) += robinJac(k,l,m);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  void Hessian_11(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // GET DIMENSIONS
+    const int c = fe_vol_->gradN().extent_int(0);
+    const int f = fe_vol_->gradN().extent_int(1);
+    const int p = fe_vol_->gradN().extent_int(2);
+    // INITIALIZE HESSIAN
+    hess = scalar_view("hess", c, f, f);
+    // STORAGE
+    scalar_view valU_eval("valU_eval", c, p);
+    scalar_view valL_eval("valL_eval", c, p);
+    scalar_view d2phi_valU_eval("d2phi_valU_eval", c, p);
+    scalar_view Ld2phiU("Ld2phiU", c, p);
+    scalar_view NLd2phiU("NLd2phiU", c, f, p);
+    // EVALUATE STATE AT QUADRATURE POINTS
+    fe_vol_->evaluateValue(valU_eval, u_coeff);
+    fe_vol_->evaluateValue(valL_eval, l_coeff);
+    // COMPUTE NONLINEAR TERM
+    computeNonlinearity(d2phi_valU_eval, valU_eval, 2);
+    fst::scalarMultiplyDataData(Ld2phiU,d2phi_valU_eval,valL_eval);
+    fst::scalarMultiplyDataField(NLd2phiU,Ld2phiU,fe_vol_->N());
+    fst::integrate(hess,NLd2phiU,fe_vol_->NdetJ(),false);
+  }
+
+  void Hessian_12(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Allen_Cahn:Hessian_12: Hessian is zero.");
+  }
+
+  void Hessian_21(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Allen_Cahn:Hessian_21: Hessian is zero.");
+  }
+
+  void Hessian_22(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Allen_Cahn:Hessian_22: Hessian is zero.");
+  }
+
+  void RieszMap_1(scalar_view & riesz) override {
+    // GET DIMENSIONS
+    const int c = fe_vol_->N().extent_int(0);
+    const int f = fe_vol_->N().extent_int(1);
+    // INITIALIZE RIESZ
+    riesz = scalar_view("riesz1", c, f, f);
+    Kokkos::deep_copy(riesz,fe_vol_->stiffMat());
+    rst::add(riesz,fe_vol_->massMat());
+  }
+
+  void RieszMap_2(scalar_view & riesz) override {
+    // GET DIMENSIONS
+    const int c = fe_vol_->N().extent_int(0);
+    const int f = fe_vol_->N().extent_int(1);
+    // INITIALIZE RIESZ
+    riesz = scalar_view("riesz2", c, f, f);
+    Kokkos::deep_copy(riesz,fe_vol_->massMat());
+  }
+
+  std::vector<basis_ptr> getFields() override {
+    return basisPtrs_;
+  }
+
+  void setCellNodes(const scalar_view &volCellNodes,
+                    const std::vector<std::vector<scalar_view>> &bdryCellNodes,
+                    const std::vector<std::vector<std::vector<int>>> &bdryCellLocIds) override {
+    volCellNodes_ = volCellNodes;
+    bdryCellNodes_ = bdryCellNodes;
+    bdryCellLocIds_ = bdryCellLocIds;
+    // Finite element definition.
+    fe_vol_ = ROL::makePtr<fe_type>(volCellNodes_,basisPtr_,cellCub_);
+    // Set local boundary DOFs.
+    fidx_ = fe_vol_->getBoundaryDofs();
+    // Construct boundary FEs
+    const int numSidesets = bdryCellNodes.size();
+    fe_bdry_.resize(numSidesets);
+    for(int i = 0; i < numSidesets; ++i) {
+      int numLocSides = bdryCellNodes[i].size();
+      fe_bdry_[i].resize(numLocSides);
+      for (int j = 0; j < numLocSides; ++j) {
+        if (bdryCellNodes_[i][j] != scalar_view()) {
+          fe_bdry_[i][j] = ROL::makePtr<fe_type>(bdryCellNodes_[i][j],basisPtr_,bdryCub_,j);
+        }
+      }
+    }
+  }
+
+  const ROL::Ptr<fe_type> getFE(void) const {
+    return fe_vol_;
+  }
+
+  const std::vector<std::vector<ROL::Ptr<fe_type>>> getBdryFE(void) const {
+    return fe_bdry_;
+  }
+
+  const scalar_view getCellNodes(void) const {
+    return volCellNodes_;
+  }
+
+  const std::vector<std::vector<std::vector<int>>> getBdryCellLocIds(void) const {
+    return bdryCellLocIds_;
+  }
+
+private:
+
+  Real evaluateLambda(const std::vector<Real> &x) const {
+    const std::vector<Real> param = PDE<Real,DeviceType>::getParameter();
+    Real val(1);
+    if (param.size()) {
+      val += param[0];
+    }
+    return val;
+  }
+
+  void computeCoefficients(scalar_view &lambda) const {
+    // GET DIMENSIONS
+    const int c = fe_vol_->gradN().extent_int(0);
+    const int p = fe_vol_->gradN().extent_int(2);
+    const int d = fe_vol_->gradN().extent_int(3);
+    std::vector<Real> pt(d);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for ( int k = 0; k < d; ++k)
+          pt[k] = (fe_vol_->cubPts())(i,j,k);
+        lambda(i,j) = evaluateLambda(pt);
+      }
+    }
+  }
+
+  Real evaluateDelta(const std::vector<Real> &x) const {
+    const std::vector<Real> param = PDE<Real,DeviceType>::getParameter();
+    Real val(1);
+    if (param.size())
+      val += param[1];
+    return val;
+  }
+
+  Real evaluateNonlinearity(const std::vector<Real> &x, const Real u, const int deriv) const {
+    Real val(0);
+    if (deriv == 0)
+      val = uScale_*std::pow(u,3) - vScale_*u;
+    if (deriv == 1)
+      val = uScale_*static_cast<Real>(3)*std::pow(u,2) - vScale_;
+    if (deriv == 2)
+      val = uScale_*static_cast<Real>(6)*u;
+    return evaluateDelta(x) * val;
+  }
+
+  void computeNonlinearity(scalar_view &val,
+                           const scalar_view &u,
+                           const int deriv = 0) const {
+    // GET DIMENSIONS
+    const int c = fe_vol_->gradN().extent_int(0);
+    const int p = fe_vol_->gradN().extent_int(2);
+    const int d = fe_vol_->gradN().extent_int(3);
+    std::vector<Real> pt(d);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for ( int k = 0; k < d; ++k)
+          pt[k] = (fe_vol_->cubPts())(i,j,k);
+        val(i,j) = evaluateNonlinearity(pt, u(i,j), deriv);
+      }
+    }
+  }
+
+  Real evaluateRobin(const Real u, const Real z, const std::vector<Real> &x,
+                     const int sideset, const int locSideId,
+                     const int deriv = 0, const int component = 1) const {
+    const std::vector<Real> param = PDE<Real,DeviceType>::getParameter();
+    Real h(robinCoeff_);
+    if (param.size())
+      h += param[2];
+    if ( deriv == 1 )
+      return (component==1) ? h : -h;
+    if ( deriv > 1 )
+      return static_cast<Real>(0);
+    return h * (u - z);
+  }
+
+  void computeRobin(scalar_view &robin,
+                    const scalar_view &u,
+                    const scalar_view &z,
+                    const int sideset,
+                    const int locSideId,
+                    const int deriv = 0,
+                    const int component = 1) const {
+    const int c = u.extent_int(0);
+    const int p = u.extent_int(1);
+    const int d = fe_vol_->gradN().extent_int(3);
+    std::vector<Real> pt(d);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k)
+          pt[k] = (fe_bdry_[sideset][locSideId]->cubPts())(i,j,k);
+        robin(i,j) = evaluateRobin(u(i,j),z(i,j),pt,sideset,locSideId,deriv,component);
+      }
+    }
+  }
+
+  scalar_view getBoundaryCoeff(const scalar_view cell_coeff, const int sideset, const int locSideId) const {
+    std::vector<int> bdryCellLocId = bdryCellLocIds_[sideset][locSideId];
+    const int numCellsSide = bdryCellLocId.size();
+    const int f = fe_vol_->N().extent_int(1);
+    
+    scalar_view bdry_coeff("bdry_coeff", numCellsSide, f);
+    for (int i = 0; i < numCellsSide; ++i) {
+      for (int j = 0; j < f; ++j)
+        bdry_coeff(i, j) = cell_coeff(bdryCellLocId[i], j);
+    }
+    return bdry_coeff;
+  }
+
+}; // PDE_Allen_Cahn
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/binary/adv-diff-SUR/CMakeLists.txt b/packages/rol/example/PDE-OPT/binary/adv-diff-SUR/CMakeLists.txt
index f660a02cf2c0..13fbcc1e30e2 100644
--- a/packages/rol/example/PDE-OPT/binary/adv-diff-SUR/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/binary/adv-diff-SUR/CMakeLists.txt
@@ -27,7 +27,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/pebbl/src/function)
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/pebbl/src/vector)
 
-  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  ROL_ADD_EXECUTABLE_AND_TEST(
     example_01
     SOURCES example_01.cpp
     ARGS PrintItAll
@@ -36,7 +36,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     BinaryAdvSURDiffDataCopy
     SOURCE_FILES
       input_ex01.xml
diff --git a/packages/rol/example/PDE-OPT/binary/adv-diff-SUR/example_01.cpp b/packages/rol/example/PDE-OPT/binary/adv-diff-SUR/example_01.cpp
index 10575725c204..446c18b23fde 100644
--- a/packages/rol/example/PDE-OPT/binary/adv-diff-SUR/example_01.cpp
+++ b/packages/rol/example/PDE-OPT/binary/adv-diff-SUR/example_01.cpp
@@ -11,7 +11,7 @@
     \brief Shows how to solve the binary advection-diffusion control problem.
 */
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
 
@@ -29,7 +29,7 @@ int main(int argc, char *argv[]) {
   using RealT = double;
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
   ROL::Ptr<const Teuchos::Comm<int> > comm
     = Tpetra::getDefaultComm();
 
diff --git a/packages/rol/example/PDE-OPT/binary/adv-diff-TEST/CMakeLists.txt b/packages/rol/example/PDE-OPT/binary/adv-diff-TEST/CMakeLists.txt
index 7367eac8fdab..791ac6c91cc3 100644
--- a/packages/rol/example/PDE-OPT/binary/adv-diff-TEST/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/binary/adv-diff-TEST/CMakeLists.txt
@@ -30,7 +30,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/pebbl/src/function)
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/pebbl/src/vector)
 
-  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  ROL_ADD_EXECUTABLE_AND_TEST(
     Binary
     SOURCES example_01.cpp
     ARGS PrintItAll
@@ -39,7 +39,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     BinaryAdvTESTDiffDataCopy
     SOURCE_FILES
       input_ex01.xml
diff --git a/packages/rol/example/PDE-OPT/binary/adv-diff-TEST/example_01.cpp b/packages/rol/example/PDE-OPT/binary/adv-diff-TEST/example_01.cpp
index ce99ba8017fb..c95d5a93301f 100644
--- a/packages/rol/example/PDE-OPT/binary/adv-diff-TEST/example_01.cpp
+++ b/packages/rol/example/PDE-OPT/binary/adv-diff-TEST/example_01.cpp
@@ -11,7 +11,7 @@
     \brief Shows how to solve the binary advection-diffusion control problem.
 */
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
 
@@ -33,7 +33,7 @@ int main(int argc, char *argv[]) {
   using RealT = double;
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
   ROL::Ptr<const Teuchos::Comm<int> > comm
     = Tpetra::getDefaultComm();
 
diff --git a/packages/rol/example/PDE-OPT/binary/adv-diff/CMakeLists.txt b/packages/rol/example/PDE-OPT/binary/adv-diff/CMakeLists.txt
index 794a3e5ecbd3..13c100f6e3f7 100644
--- a/packages/rol/example/PDE-OPT/binary/adv-diff/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/binary/adv-diff/CMakeLists.txt
@@ -27,7 +27,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/pebbl/src/function)
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/pebbl/src/vector)
 
-  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  ROL_ADD_EXECUTABLE_AND_TEST(
     example_01
     SOURCES example_01.cpp
     ARGS PrintItAll
@@ -36,7 +36,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     BinaryAdvDiffDataCopy
     SOURCE_FILES
       input_ex01.xml
diff --git a/packages/rol/example/PDE-OPT/binary/adv-diff/example_01.cpp b/packages/rol/example/PDE-OPT/binary/adv-diff/example_01.cpp
index 3ed8041c96c2..6ee19645f386 100644
--- a/packages/rol/example/PDE-OPT/binary/adv-diff/example_01.cpp
+++ b/packages/rol/example/PDE-OPT/binary/adv-diff/example_01.cpp
@@ -11,7 +11,7 @@
     \brief Shows how to solve the binary advection-diffusion control problem.
 */
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
 
@@ -31,7 +31,7 @@ int main(int argc, char *argv[]) {
   using RealT = double;
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
   ROL::Ptr<const Teuchos::Comm<int> > comm
     = Tpetra::getDefaultComm();
 
diff --git a/packages/rol/example/PDE-OPT/binary/elasticity/CMakeLists.txt b/packages/rol/example/PDE-OPT/binary/elasticity/CMakeLists.txt
index 1e5659087bfc..620bfa25b0a2 100644
--- a/packages/rol/example/PDE-OPT/binary/elasticity/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/binary/elasticity/CMakeLists.txt
@@ -18,13 +18,13 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/pebbl/src/function)
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/pebbl/src/vector)
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_01
     SOURCES example_01.cpp
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     BinTopoOptDataCopy
     SOURCE_FILES
       input_ex01.xml
diff --git a/packages/rol/example/PDE-OPT/binary/elasticity/example_01.cpp b/packages/rol/example/PDE-OPT/binary/elasticity/example_01.cpp
index c1721986ce65..a877996eeb0a 100644
--- a/packages/rol/example/PDE-OPT/binary/elasticity/example_01.cpp
+++ b/packages/rol/example/PDE-OPT/binary/elasticity/example_01.cpp
@@ -14,8 +14,8 @@
 
 #include "Teuchos_Comm.hpp"
 #include "Teuchos_Time.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
+#include "ROL_XMLReader.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -41,7 +41,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int>> comm
     = Tpetra::getDefaultComm();
   const int myRank = comm->getRank();
diff --git a/packages/rol/example/PDE-OPT/binary/stefan-boltzmann/CMakeLists.txt b/packages/rol/example/PDE-OPT/binary/stefan-boltzmann/CMakeLists.txt
index c6bf82ea9d98..3231af6c33ff 100644
--- a/packages/rol/example/PDE-OPT/binary/stefan-boltzmann/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/binary/stefan-boltzmann/CMakeLists.txt
@@ -30,7 +30,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/pebbl/src/function)
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/pebbl/src/vector)
 
-  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  ROL_ADD_EXECUTABLE_AND_TEST(
     BinaryStefanBoltzmann
     SOURCES example_01.cpp
     ARGS PrintItAll
@@ -39,7 +39,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     BinaryStefanBoltzmannDataCopy
     SOURCE_FILES
       input_ex01.xml
diff --git a/packages/rol/example/PDE-OPT/binary/stefan-boltzmann/example_01.cpp b/packages/rol/example/PDE-OPT/binary/stefan-boltzmann/example_01.cpp
index 2220c7d6253c..a139865ff225 100644
--- a/packages/rol/example/PDE-OPT/binary/stefan-boltzmann/example_01.cpp
+++ b/packages/rol/example/PDE-OPT/binary/stefan-boltzmann/example_01.cpp
@@ -11,7 +11,7 @@
     \brief Shows how to solve the binary advection-diffusion control problem.
 */
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
 
@@ -32,7 +32,7 @@ int main(int argc, char *argv[]) {
   using RealT = double;
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
   ROL::Ptr<const Teuchos::Comm<int>> comm
     = Tpetra::getDefaultComm();
 
diff --git a/packages/rol/example/PDE-OPT/dynamic/adv_diff/CMakeLists.txt b/packages/rol/example/PDE-OPT/dynamic/adv_diff/CMakeLists.txt
index f34b376b327d..8d331409e3b1 100644
--- a/packages/rol/example/PDE-OPT/dynamic/adv_diff/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/dynamic/adv_diff/CMakeLists.txt
@@ -9,22 +9,22 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   # Need ROL_TpetraMultiVector.hpp
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_01
     SOURCES example_01.cpp
     ADD_DIR_TO_NAME
   )
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_02
     SOURCES example_02.cpp
     ADD_DIR_TO_NAME
   )
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_03
     SOURCES example_03.cpp
     ADD_DIR_TO_NAME
   )
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     ParabolicDataCopy
     SOURCE_FILES
       input.xml plotresults.m plotadvection.m
diff --git a/packages/rol/example/PDE-OPT/dynamic/adv_diff/example_01.cpp b/packages/rol/example/PDE-OPT/dynamic/adv_diff/example_01.cpp
index 1b9337f915f5..bf721d935569 100644
--- a/packages/rol/example/PDE-OPT/dynamic/adv_diff/example_01.cpp
+++ b/packages/rol/example/PDE-OPT/dynamic/adv_diff/example_01.cpp
@@ -13,7 +13,7 @@
 */
 
 #include "Teuchos_Comm.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
 
@@ -43,7 +43,7 @@ int main(int argc, char *argv[]) {
   using RealT = double;
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
   ROL::Ptr<const Teuchos::Comm<int>> comm
     = Tpetra::getDefaultComm();
 
diff --git a/packages/rol/example/PDE-OPT/dynamic/adv_diff/example_02.cpp b/packages/rol/example/PDE-OPT/dynamic/adv_diff/example_02.cpp
index cae6c8e71afe..566bae79c434 100644
--- a/packages/rol/example/PDE-OPT/dynamic/adv_diff/example_02.cpp
+++ b/packages/rol/example/PDE-OPT/dynamic/adv_diff/example_02.cpp
@@ -13,7 +13,7 @@
 */
 
 #include "Teuchos_Comm.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
 
@@ -47,7 +47,7 @@ int main(int argc, char *argv[]) {
   using RealT = double;
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
   ROL::Ptr<const Teuchos::Comm<int>> comm
     = Tpetra::getDefaultComm();
 
diff --git a/packages/rol/example/PDE-OPT/dynamic/adv_diff/example_03.cpp b/packages/rol/example/PDE-OPT/dynamic/adv_diff/example_03.cpp
index 61756f7db0cf..575c0c46d206 100644
--- a/packages/rol/example/PDE-OPT/dynamic/adv_diff/example_03.cpp
+++ b/packages/rol/example/PDE-OPT/dynamic/adv_diff/example_03.cpp
@@ -13,7 +13,7 @@
 */
 
 #include "Teuchos_Comm.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
 
@@ -47,7 +47,7 @@ int main(int argc, char *argv[]) {
   using RealT = double;
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
   ROL::Ptr<const Teuchos::Comm<int>> comm
     = Tpetra::getDefaultComm();
 
diff --git a/packages/rol/example/PDE-OPT/dynamic/navier-stokes/AugmentedSystem.cpp b/packages/rol/example/PDE-OPT/dynamic/navier-stokes/AugmentedSystem.cpp
index 969406ff4fea..5abc774d828d 100644
--- a/packages/rol/example/PDE-OPT/dynamic/navier-stokes/AugmentedSystem.cpp
+++ b/packages/rol/example/PDE-OPT/dynamic/navier-stokes/AugmentedSystem.cpp
@@ -12,7 +12,7 @@
 */
 
 #include "Teuchos_Comm.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
 
@@ -124,7 +124,7 @@ int main(int argc, char *argv[]) {
   using Teuchos::RCP;
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
   ROL::Ptr<const Teuchos::Comm<int>> comm
     = Tpetra::getDefaultComm();
 
diff --git a/packages/rol/example/PDE-OPT/dynamic/navier-stokes/CMakeLists.txt b/packages/rol/example/PDE-OPT/dynamic/navier-stokes/CMakeLists.txt
index 009a4f3eebe7..2e4aeb71e325 100644
--- a/packages/rol/example/PDE-OPT/dynamic/navier-stokes/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/dynamic/navier-stokes/CMakeLists.txt
@@ -15,25 +15,25 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/mpi/src/vector)
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/mpi)
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_01
     SOURCES example_01.cpp
     ADD_DIR_TO_NAME
   )
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_02
     SOURCES example_02.cpp
     ADD_DIR_TO_NAME
   )
   IF(TPL_ENABLE_MPI)
-    TRIBITS_ADD_EXECUTABLE(
+    ROL_ADD_EXECUTABLE(
       AugmentedSystem
       SOURCES AugmentedSystem.cpp ROL_PinTVectorCommunication_StdTpetraComposite.hpp
       ADD_DIR_TO_NAME
     )
   ENDIF()
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     DynamicNavierStokesDataCopy
     SOURCE_FILES
       input.xml
diff --git a/packages/rol/example/PDE-OPT/dynamic/navier-stokes/example_01.cpp b/packages/rol/example/PDE-OPT/dynamic/navier-stokes/example_01.cpp
index 15f96d73ed8c..3c8147d191fd 100644
--- a/packages/rol/example/PDE-OPT/dynamic/navier-stokes/example_01.cpp
+++ b/packages/rol/example/PDE-OPT/dynamic/navier-stokes/example_01.cpp
@@ -12,7 +12,7 @@
 */
 
 #include "Teuchos_Comm.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
 
@@ -53,7 +53,7 @@ int main(int argc, char *argv[]) {
   using RealT = double;
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
   ROL::Ptr<const Teuchos::Comm<int>> comm
     = Tpetra::getDefaultComm();
 
diff --git a/packages/rol/example/PDE-OPT/dynamic/navier-stokes/example_02.cpp b/packages/rol/example/PDE-OPT/dynamic/navier-stokes/example_02.cpp
index 22f9c53699bd..2e4e1f6b8d64 100644
--- a/packages/rol/example/PDE-OPT/dynamic/navier-stokes/example_02.cpp
+++ b/packages/rol/example/PDE-OPT/dynamic/navier-stokes/example_02.cpp
@@ -12,7 +12,7 @@
 */
 
 #include "Teuchos_Comm.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
 
@@ -57,7 +57,7 @@ int main(int argc, char *argv[]) {
   using RealT = double;
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
   ROL::Ptr<const Teuchos::Comm<int>> comm
     = Tpetra::getDefaultComm();
 
diff --git a/packages/rol/example/PDE-OPT/dynamic/semilinear/AugmentedSystem.cpp b/packages/rol/example/PDE-OPT/dynamic/semilinear/AugmentedSystem.cpp
index 839f493c037a..10bd392320e9 100644
--- a/packages/rol/example/PDE-OPT/dynamic/semilinear/AugmentedSystem.cpp
+++ b/packages/rol/example/PDE-OPT/dynamic/semilinear/AugmentedSystem.cpp
@@ -12,7 +12,7 @@
 */
 
 #include "Teuchos_Comm.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Teuchos_FancyOStream.hpp"
 
 #include "Tpetra_Core.hpp"
@@ -134,7 +134,7 @@ int main(int argc, char *argv[])
   using Teuchos::RCP;
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // pauseToAttach(MPI_COMM_WORLD);
 
diff --git a/packages/rol/example/PDE-OPT/dynamic/semilinear/CMakeLists.txt b/packages/rol/example/PDE-OPT/dynamic/semilinear/CMakeLists.txt
index 45b6737156da..cd2da21a4762 100644
--- a/packages/rol/example/PDE-OPT/dynamic/semilinear/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/dynamic/semilinear/CMakeLists.txt
@@ -13,13 +13,13 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   # Need ROL_TpetraMultiVector.hpp
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_01
     SOURCES example_01.cpp
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     SemilinearParabolicDataCopy
     SOURCE_FILES
       input_ex01.xml plotresults.m
@@ -28,7 +28,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   )
 
   IF(TPL_ENABLE_MPI)
-    TRIBITS_ADD_EXECUTABLE(
+    ROL_ADD_EXECUTABLE(
       AugmentedSystem
       SOURCES AugmentedSystem.cpp
       ADD_DIR_TO_NAME
diff --git a/packages/rol/example/PDE-OPT/dynamic/semilinear/example_01.cpp b/packages/rol/example/PDE-OPT/dynamic/semilinear/example_01.cpp
index 7741ff79235f..1a9e91d92e9a 100644
--- a/packages/rol/example/PDE-OPT/dynamic/semilinear/example_01.cpp
+++ b/packages/rol/example/PDE-OPT/dynamic/semilinear/example_01.cpp
@@ -12,7 +12,7 @@
 */
 
 #include "Teuchos_Comm.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
 
@@ -41,7 +41,7 @@ int main(int argc, char *argv[]) {
   using RealT = double;
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
   ROL::Ptr<const Teuchos::Comm<int>> comm
     = Tpetra::getDefaultComm();
 
diff --git a/packages/rol/example/PDE-OPT/dynamic/thermal-fluids/CMakeLists.txt b/packages/rol/example/PDE-OPT/dynamic/thermal-fluids/CMakeLists.txt
index d1e9881afc80..86227a9f35fe 100644
--- a/packages/rol/example/PDE-OPT/dynamic/thermal-fluids/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/dynamic/thermal-fluids/CMakeLists.txt
@@ -11,13 +11,13 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_01
     SOURCES example_01.cpp
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     DynamicThermalFluidsDataCopy
     SOURCE_FILES
       input.xml
diff --git a/packages/rol/example/PDE-OPT/dynamic/thermal-fluids/example_01.cpp b/packages/rol/example/PDE-OPT/dynamic/thermal-fluids/example_01.cpp
index 5f8b57abd890..3d90bdb7288c 100644
--- a/packages/rol/example/PDE-OPT/dynamic/thermal-fluids/example_01.cpp
+++ b/packages/rol/example/PDE-OPT/dynamic/thermal-fluids/example_01.cpp
@@ -12,7 +12,7 @@
 */
 
 #include "Teuchos_Comm.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
 
@@ -51,7 +51,7 @@ int main(int argc, char *argv[]) {
   using RealT = double;
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
   ROL::Ptr<const Teuchos::Comm<int>> comm
     = Tpetra::getDefaultComm();
 
diff --git a/packages/rol/example/PDE-OPT/flow-opt/3dim/CMakeLists.txt b/packages/rol/example/PDE-OPT/flow-opt/3dim/CMakeLists.txt
index 81b83fa35f00..18e601d5ff68 100644
--- a/packages/rol/example/PDE-OPT/flow-opt/3dim/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/flow-opt/3dim/CMakeLists.txt
@@ -1,6 +1,6 @@
 ADD_SUBDIRECTORY(models)
 
-TRIBITS_COPY_FILES_TO_BINARY_DIR(
+ROL_COPY_FILES_TO_BINARY_DIR(
   Project3dimDataCopy
   SOURCE_FILES
     mesh/diffuser-tet-clip.jou
diff --git a/packages/rol/example/PDE-OPT/flow-opt/3dim/models/darcy/CMakeLists.txt b/packages/rol/example/PDE-OPT/flow-opt/3dim/models/darcy/CMakeLists.txt
index 9f66aa87008c..a07266ed984a 100644
--- a/packages/rol/example/PDE-OPT/flow-opt/3dim/models/darcy/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/flow-opt/3dim/models/darcy/CMakeLists.txt
@@ -11,13 +11,13 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_01
     SOURCES example_01.cpp
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     Project3dimDarcyDataCopy
     SOURCE_FILES
       input.xml
@@ -29,5 +29,36 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
   )
 
+ENDIF()
+
+IF(${PROJECT_NAME}_ENABLE_Intrepid2 AND
+   ${PROJECT_NAME}_ENABLE_Ifpack2   AND
+   ${PROJECT_NAME}_ENABLE_MueLu     AND
+   ${PROJECT_NAME}_ENABLE_Amesos2   AND
+   ${PROJECT_NAME}_ENABLE_Tpetra )
+
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/teuchos/src/sol)
+
+  # Need ROL_TpetraMultiVector.hpp
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
+
+  ROL_ADD_EXECUTABLE(
+    example_01_Kokkos
+    SOURCES example_01K.cpp
+    ADD_DIR_TO_NAME
+  )
+
+  ROL_COPY_FILES_TO_BINARY_DIR(
+    Project3dimDarcyDataCopyK
+    SOURCE_FILES
+      input.xml
+      plotDarcy.m
+      plotmesh.m
+      vtkwrite.m
+      vtkwritecell.m
+    SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}"
+    DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
+  )
 
 ENDIF()
diff --git a/packages/rol/example/PDE-OPT/flow-opt/3dim/models/darcy/example_01.cpp b/packages/rol/example/PDE-OPT/flow-opt/3dim/models/darcy/example_01.cpp
index bf3bc79d827f..b2565738c863 100644
--- a/packages/rol/example/PDE-OPT/flow-opt/3dim/models/darcy/example_01.cpp
+++ b/packages/rol/example/PDE-OPT/flow-opt/3dim/models/darcy/example_01.cpp
@@ -13,8 +13,7 @@
 
 #include "Teuchos_Comm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -47,7 +46,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int>> comm
     = Tpetra::getDefaultComm();
   const int myRank = comm->getRank();
diff --git a/packages/rol/example/PDE-OPT/flow-opt/3dim/models/darcy/example_01K.cpp b/packages/rol/example/PDE-OPT/flow-opt/3dim/models/darcy/example_01K.cpp
new file mode 100644
index 000000000000..9d6aceaad99d
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/flow-opt/3dim/models/darcy/example_01K.cpp
@@ -0,0 +1,224 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  example_01.cpp
+    \brief Shows how to solve the Darcy porosity optimization problem.
+*/
+
+#include "Teuchos_Comm.hpp"
+#include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
+
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Version.hpp"
+
+#include <iostream>
+#include <algorithm>
+
+#include "ROL_TpetraMultiVector.hpp"
+#include "ROL_Reduced_Objective_SimOpt.hpp"
+#include "ROL_Bounds.hpp"
+#include "ROL_Solver.hpp"
+#include "ROL_SingletonVector.hpp"
+#include "ROL_ConstraintFromObjective.hpp"
+
+#include "../../../../TOOLS/meshreaderK.hpp"
+#include "../../../../TOOLS/pdeconstraintK.hpp"
+#include "../../../../TOOLS/pdeobjectiveK.hpp"
+#include "../../../../TOOLS/pdevectorK.hpp"
+#include "../../../../TOOLS/integralconstraintK.hpp"
+
+#include "pde_darcyK.hpp"
+#include "obj_darcyK.hpp"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+
+int main(int argc, char *argv[]) {
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+
+  /*** Initialize communicator. ***/
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  Kokkos::ScopeGuard kokkosScope (argc, argv);
+  ROL::Ptr<const Teuchos::Comm<int>> comm
+    = Tpetra::getDefaultComm();
+  const int myRank = comm->getRank();
+  if ((iprint > 0) && (myRank == 0))
+    outStream = ROL::makePtrFromRef(std::cout);
+  else
+    outStream = ROL::makePtrFromRef(bhs);
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+
+    /*** Read in XML input ***/
+    std::string filename = "input.xml";
+    auto parlist = ROL::getParametersFromXmlFile(filename);
+
+    bool output = parlist->sublist("SimOpt").sublist("Solve").get("Output Iteration History",false);
+    output = (iprint > 0) && (myRank==0) && output;
+    parlist->sublist("SimOpt").sublist("Solve").set("Output Iteration History",output);
+
+    /*** Initialize main data structure. ***/
+    auto meshMgr = ROL::makePtr<MeshReader<RealT,DeviceT>>(*parlist);
+    // Initialize PDE describing Darcy equations.
+    auto pde = ROL::makePtr<PDE_Darcy<RealT,DeviceT>>(*parlist);
+    auto con = ROL::makePtr<PDE_Constraint<RealT,DeviceT>>(pde,meshMgr,comm,*parlist,*outStream);
+    // Cast the constraint and get the assembler.
+    auto assembler = con->getAssembler();
+    con->setSolveParameters(*parlist);
+
+    // Create state vectors
+    auto u_ptr = assembler->createStateVector(); u_ptr->randomize();
+    auto up = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(u_ptr,pde,assembler,*parlist);
+    auto p_ptr = assembler->createStateVector(); p_ptr->randomize();
+    auto pp = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(p_ptr,pde,assembler,*parlist);
+    auto r_ptr = assembler->createResidualVector(); r_ptr->putScalar(0.0);
+    auto rp = ROL::makePtr<PDE_DualSimVector<RealT,DeviceT>>(r_ptr,pde,assembler,*parlist);
+
+    // Create control vectors
+    bool useParamVar = parlist->sublist("Problem").get("Use Optimal Constant Velocity",false);
+    int dim = 2;
+    auto z_ptr = assembler->createControlVector(); z_ptr->randomize();
+    ROL::Ptr<std::vector<RealT>>            z0_ptr;
+    ROL::Ptr<ROL::StdVector<RealT>>         z0p;
+    ROL::Ptr<ROL::TpetraMultiVector<RealT>> z1p;
+    ROL::Ptr<ROL::Vector<RealT>>            zp;
+    if (useParamVar) {
+      z0_ptr = ROL::makePtr<std::vector<RealT>>(dim);
+      z0p = ROL::makePtr<ROL::StdVector<RealT>>(z0_ptr);
+      z1p = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(z_ptr,pde,assembler,*parlist);
+      zp = ROL::makePtr<PDE_OptVector<RealT>>(z1p,z0p,myRank);
+    }
+    else {
+      zp = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(z_ptr,pde,assembler,*parlist);
+    }
+
+    // Create combined vector
+    auto xp = ROL::makePtr<ROL::Vector_SimOpt<RealT>>(up,zp);
+
+    // Initialize quadratic objective function.
+    auto qoi  = ROL::makePtr<QoI_VelocityTracking_Darcy<RealT,DeviceT>>(*parlist,
+                                                                        pde->getPressureFE(),
+                                                                        pde->getControlFE(),
+                                                                        pde->getPermeability());
+    auto obj  = ROL::makePtr<PDE_Objective<RealT,DeviceT>>(qoi,assembler);
+    auto robj = ROL::makePtr<ROL::Reduced_Objective_SimOpt<RealT>>(obj,con,up,zp,pp,true,false);
+
+    // Build bound constraint
+    ROL::Ptr<ROL::Vector<RealT>> lp, hp;
+    if (useParamVar) {
+      auto l0p = ROL::makePtr<ROL::StdVector<RealT>>(dim,ROL::ROL_NINF<RealT>());
+      auto h0p = ROL::makePtr<ROL::StdVector<RealT>>(dim,ROL::ROL_INF<RealT>());
+      auto l1_ptr = assembler->createControlVector();
+      auto h1_ptr = assembler->createControlVector();
+      auto l1p = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(l1_ptr,pde,assembler,*parlist);
+      auto h1p = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(h1_ptr,pde,assembler,*parlist);
+      l1p->setScalar(0.0);
+      h1p->setScalar(1.0);
+      lp = ROL::makePtr<PDE_OptVector<RealT>>(l1p,l0p,myRank);
+      hp = ROL::makePtr<PDE_OptVector<RealT>>(h1p,h0p,myRank);
+    }
+    else {
+      lp = zp->clone(); lp->setScalar(0.0);
+      hp = zp->clone(); hp->setScalar(1.0);
+    }
+    auto bnd = ROL::makePtr<ROL::Bounds<RealT>>(lp, hp);
+    // Build optimization problem
+    auto optProb = ROL::makePtr<ROL::Problem<RealT>>(robj, zp);
+    optProb->addBoundConstraint(bnd);
+    optProb->finalize(false,true,*outStream);
+
+    // Run derivative checks
+    bool checkDeriv = parlist->sublist("Problem").get("Check derivatives",false);
+    if ( checkDeriv ) {
+      auto rup = up->clone(); rup->randomize(-1.0,1.0);
+      auto rzp = zp->clone(); rzp->randomize( 0.0,1.0);
+      auto rpp = pp->clone(); rpp->randomize(-1.0,1.0);
+      auto dup = up->clone(); dup->randomize(-1.0,1.0);
+      auto dzp = zp->clone(); dzp->randomize( 0.0,1.0);
+      con->checkApplyJacobian_1(*rup,*rzp,*dup,*rup,true,*outStream);
+      con->checkApplyJacobian_2(*rup,*rzp,*dzp,*rup,true,*outStream);
+      con->checkInverseJacobian_1(*rup,*rup,*rup,*rzp,true,*outStream);
+      con->checkInverseAdjointJacobian_1(*rup,*rup,*rup,*rzp,true,*outStream);
+      con->checkApplyAdjointHessian_11(*rup,*rzp,*rpp,*dup,*rup,true,*outStream);
+      con->checkApplyAdjointHessian_21(*rup,*rzp,*rpp,*dzp,*rup,true,*outStream);
+      con->checkApplyAdjointHessian_12(*rup,*rzp,*rpp,*dup,*rzp,true,*outStream);
+      con->checkApplyAdjointHessian_22(*rup,*rzp,*rpp,*dzp,*rzp,true,*outStream);
+      obj->checkGradient_1(*rup,*rzp,*dup,true,*outStream);
+      obj->checkGradient_2(*rup,*rzp,*dzp,true,*outStream);
+      obj->checkHessVec_11(*rup,*rzp,*dup,true,*outStream);
+      obj->checkHessVec_12(*rup,*rzp,*dzp,true,*outStream);
+      obj->checkHessVec_21(*rup,*rzp,*dup,true,*outStream);
+      obj->checkHessVec_22(*rup,*rzp,*dzp,true,*outStream);
+      robj->checkGradient(*rzp,*dzp,true,*outStream);
+      robj->checkHessVec(*rzp,*dzp,true,*outStream);
+      //optProb->check(*outStream);
+    }
+
+    // Solve optimization problem
+    zp->setScalar(0.5);
+    up->zero(); pp->zero();
+    bool opt = parlist->sublist("Problem").get("Solve Optimization Problem",true);
+    if (opt) {
+      std::ifstream infile("control.txt");
+      if (infile.good())
+        assembler->inputTpetraVector(z_ptr,"control.txt");
+      if (useParamVar) {
+        std::ifstream infile0; infile0.open("target.txt");
+        if (infile0.good()) {
+          for (int i = 0; i < dim; ++i) infile0 >> (*z0_ptr)[i];
+          infile0.close();
+        }
+      }
+      ROL::Solver<RealT> optSolver(optProb, *parlist);
+      optSolver.solve(*outStream);
+      con->outputTpetraVector(z_ptr,"control.txt");
+      if (useParamVar) {
+        std::ofstream outfile; outfile.open("target.txt");
+        for (const auto x : *z0_ptr) {
+          outfile << std::scientific << std::setprecision(16);
+          outfile << x << std::endl;
+        }
+        outfile.close();
+      }
+    }
+
+    // Output
+    assembler->printMeshData(*outStream);
+    RealT tol(1.e-8);
+    Teuchos::Array<RealT> res(1,0);
+    con->solve(*rp,*up,*zp,tol);
+    con->outputTpetraVector(u_ptr,"state.txt");
+    con->value(*rp,*up,*zp,tol);
+    r_ptr->norm2(res.view(0,1));
+    *outStream << "Residual Norm: " << res[0] << std::endl;
+    assembler->printCellAveragesPDE(pde,u_ptr,z_ptr);
+    errorFlag += (res[0] > 1.e-6 ? 1 : 0);
+    //con->outputTpetraData();
+
+    // Get a summary from the time monitor.
+    Teuchos::TimeMonitor::summarize();
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/flow-opt/3dim/models/darcy/obj_darcyK.hpp b/packages/rol/example/PDE-OPT/flow-opt/3dim/models/darcy/obj_darcyK.hpp
new file mode 100644
index 000000000000..83cece94c50f
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/flow-opt/3dim/models/darcy/obj_darcyK.hpp
@@ -0,0 +1,606 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  obj.hpp
+    \brief Provides the interface for local (cell-based) objective function computations.
+*/
+
+#ifndef OBJ_DARCYK_HPP
+#define OBJ_DARCYK_HPP
+
+#include "../../../../TOOLS/qoiK.hpp"
+#include "pde_darcyK.hpp"
+#include "permeabilityK.hpp"
+
+
+template <class Real, class DeviceType>
+class QoI_VelocityTracking_Darcy : public QoI<Real, DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type>            fePrs_, feCtrl_;
+  const ROL::Ptr<Permeability<Real,DeviceType>> perm_;
+  scalar_view                        target_, weight_;
+  Real                               rad_, yvel_, frac_, twpow_;
+  bool                               onlyAxial_;
+
+  Real xTarget(const std::vector<Real> &x) const {
+    const Real X = x[0], Y = x[1];
+    //return xWeight(x) ? -X*Y/(rad_*rad_-Y*Y) : zero;
+    //return xWeight(x) ? -X*Y/std::sqrt(rad_*rad_-Y*Y) : zero;
+    //return polyWeight(x) * (-X*Y/std::sqrt(rad_*rad_-Y*Y));
+    //return -X*Y/std::sqrt(rad_*rad_-Y*Y);
+    return -X*Y/((rad_*rad_-Y*Y)*(rad_*rad_-Y*Y));
+  }
+
+  Real yTarget(const std::vector<Real> &x) const {
+    const Real one(1), Y = x[1];
+    //return yWeight(x) ? one : zero;
+    //return yWeight(x) ? std::sqrt(rad_*rad_-Y*Y) : zero;
+    //return polyWeight(x) * std::sqrt(rad_*rad_-Y*Y);
+    //return std::sqrt(rad_*rad_-Y*Y);
+    return one/(rad_*rad_-Y*Y);
+  }
+
+  Real xWeight(const std::vector<Real> &x) const {
+    return yWeight(x);
+  }
+
+  Real yWeight(const std::vector<Real> &x) const {
+    //const Real zero(0), one(1), Y = x[1];
+    //return (std::abs(Y) <= frac_*rad_ ? one : zero);
+    return polyWeight(x);
+  }
+
+  Real polyWeight(const std::vector<Real> &x) const {
+    const Real zero(0), one(1), Y = x[1], p = twpow_;
+    const Real yTOP = 9.976339196;
+    const Real yBOT = -yTOP;
+    Real val = 0, at = 0, bt = 0;
+    at = one / std::pow(-yTOP,p);
+    bt = one / std::pow(-yBOT,p);
+    if (Y > zero) val = at*std::pow(Y-yTOP,p);
+    else          val = bt*std::pow(Y-yBOT,p);
+    //std::cout << Y << "  " << val << std::endl;
+    return val;
+  }
+
+public:
+  QoI_VelocityTracking_Darcy(ROL::ParameterList             &list,
+                              const ROL::Ptr<fe_type>           &fePrs,
+                              const ROL::Ptr<fe_type>           &feCtrl,
+                              const ROL::Ptr<Permeability<Real,DeviceType>> &perm)
+    : fePrs_(fePrs), feCtrl_(feCtrl), perm_(perm) {
+    rad_         = list.sublist("Problem").get("Diffuser Radius",5.0);
+    yvel_        = list.sublist("Problem").get("Target Axial Velocity",15.0);
+    frac_        = list.sublist("Problem").get("Integration Domain Fraction",0.95);
+    onlyAxial_   = list.sublist("Problem").get("Only Use Axial Velocity",false);
+    twpow_       = list.sublist("Problem").get("Target Weighting Power",0.0);
+    Real xWScal  = list.sublist("Problem").get("Radial Tracking Scale",1.0);
+    Real yWScal  = list.sublist("Problem").get("Axial Tracking Scale",1.0);
+    bool useNorm = list.sublist("Problem").get("Use Normalized Misfit",false);
+    useNorm = onlyAxial_ ? false : useNorm;
+    xWScal  = onlyAxial_ ? static_cast<Real>(0) : xWScal;
+    const int c = fePrs_->gradN().extent_int(0);
+    const int p = fePrs_->gradN().extent_int(2);
+    const int d = fePrs_->gradN().extent_int(3);
+    target_ = scalar_view("target",c,p,d);
+    weight_ = scalar_view("weight",c,p,d);
+    std::vector<Real> x(2);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        x[0] = (fePrs_->cubPts())(i,j,0);
+        x[1] = (fePrs_->cubPts())(i,j,1);
+        target_(i,j,0) = xTarget(x);
+        target_(i,j,1) = yTarget(x);
+        if (useNorm && yWeight(x)) {
+          xWScal = static_cast<Real>(1)
+                  /(std::pow(target_(i,j,0),2) + std::pow(target_(i,j,1),2));
+          yWScal = xWScal;
+        }
+        weight_(i,j,0) = x[0] * xWScal * xWeight(x);
+        weight_(i,j,1) = x[0] * yWScal * yWeight(x);
+      }
+    }
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c = fePrs_->gradN().extent_int(0);
+    const int p = fePrs_->gradN().extent_int(2);
+    const int d = fePrs_->gradN().extent_int(3);
+    Real yvel = (z_param == ROL::nullPtr) ? yvel_ : (*z_param)[0];
+    // Initialize output val
+    val = scalar_view("val",c);
+    // Compute cost integral
+    scalar_view gradU("gradU",c,p,d);
+    scalar_view vel("vel",c,p,d);
+    scalar_view wvel("wvel",c,p,d);
+    scalar_view valZ("valZ",c,p);
+    scalar_view alpha("alpha",c,p);
+    fePrs_->evaluateGradient(gradU, u_coeff);
+    feCtrl_->evaluateValue(valZ, z_coeff);
+    perm_->compute(alpha, valZ, fePrs_->cubPts(), 0);
+
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k) {
+          vel(i,j,k)  = alpha(i,j)*gradU(i,j,k)+yvel*target_(i,j,k);
+          wvel(i,j,k) = weight_(i,j,k)*vel(i,j,k);
+        }
+      }
+    }
+
+    fePrs_->computeIntegral(val,vel,wvel);
+    // Scale by one half
+    rst::scale(val,static_cast<Real>(0.5));
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c = fePrs_->gradN().extent_int(0);
+    const int f = fePrs_->gradN().extent_int(1);
+    const int p = fePrs_->gradN().extent_int(2);
+    const int d = fePrs_->gradN().extent_int(3);
+    Real yvel = (z_param == ROL::nullPtr) ? yvel_ : (*z_param)[0];
+    // Initialize output grad
+    grad = scalar_view("grad", c, f);
+    // Compute cost integral
+    scalar_view gradU("gradU",c,p,d);
+    scalar_view awvel("awvel",c,p,d);
+    scalar_view valZ("valZ",c,p);
+    scalar_view alpha("alpha",c,p);
+    fePrs_->evaluateGradient(gradU, u_coeff);
+    feCtrl_->evaluateValue(valZ, z_coeff);
+    perm_->compute(alpha, valZ, fePrs_->cubPts(), 0);
+
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k) {
+          awvel(i,j,k) = weight_(i,j,k)*alpha(i,j)*(alpha(i,j)*gradU(i,j,k)+yvel*target_(i,j,k));
+        }
+      }
+    }
+
+    fst::integrate(grad,awvel,fePrs_->gradNdetJ(),false);
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c  = fePrs_->gradN().extent_int(0);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int p  = fePrs_->gradN().extent_int(2);
+    const int d  = fePrs_->gradN().extent_int(3);
+    Real yvel = (z_param == ROL::nullPtr) ? yvel_ : (*z_param)[0];
+    // Initialize output grad
+    grad = scalar_view("grad",c,fc);
+    // Compute cost integral
+    scalar_view gradU("gradU",c,p,d);
+    scalar_view valZ("valZ",c,p);
+    scalar_view alpha("alpha",c,p);
+    scalar_view dalpha("dalpha",c,p);
+    scalar_view deriv("deriv",c,p);
+    fePrs_->evaluateGradient(gradU, u_coeff);
+    feCtrl_->evaluateValue(valZ, z_coeff);
+    perm_->compute(alpha, valZ, fePrs_->cubPts(), 0);
+    perm_->compute(dalpha, valZ, fePrs_->cubPts(), 1);
+
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k) {
+          deriv(i,j) += weight_(i,j,k)*(alpha(i,j)*gradU(i,j,k)+yvel*target_(i,j,k))
+                        *dalpha(i,j)*gradU(i,j,k);
+        }
+      }
+    }
+
+    fst::integrate(grad,deriv,feCtrl_->NdetJ(),false);
+  }
+
+  std::vector<Real> gradient_3(std::vector<scalar_view> & grad,
+                               const scalar_view u_coeff,
+                               const scalar_view z_coeff = scalar_view(),
+                               const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    if (z_param != ROL::nullPtr) {
+      // Get relevant dimensions
+      const int c = fePrs_->gradN().extent_int(0);
+      const int p = fePrs_->gradN().extent_int(2);
+      const int d = fePrs_->gradN().extent_int(3);
+      Real yvel = (z_param == ROL::nullPtr) ? yvel_ : (*z_param)[0];
+      // Initialize output val
+      std::vector<Real> g_param(d,static_cast<Real>(0));
+      grad.clear(); grad.resize(d);
+      for (int i = 0; i < d; ++i)
+        grad[i] = scalar_view("grad",c);
+      // Compute cost integral
+      scalar_view gradU("gradU",c,p,d);
+      scalar_view wvel("wvel",c,p,d);
+      scalar_view valZ("valZ",c,p);
+      scalar_view alpha("alpha",c,p);
+      fePrs_->evaluateGradient(gradU, u_coeff);
+      feCtrl_->evaluateValue(valZ, z_coeff);
+      perm_->compute(alpha, valZ, fePrs_->cubPts(), 0);
+
+      for (int i = 0; i < c; ++i) {
+        for (int j = 0; j < p; ++j) {
+          for (int k = 0; k < d; ++k) {
+            wvel(i,j,k) = weight_(i,j,k)*(alpha(i,j)*gradU(i,j,k)+yvel*target_(i,j,k));
+          }
+        }
+      }
+
+      fePrs_->computeIntegral(grad[0],wvel,target_);
+
+      return g_param;
+    }
+    else {
+      throw Exception::Zero(">>> QoI_VelocityTracking_Darcy::gradient_3 is zero.");
+    }
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c = fePrs_->gradN().extent_int(0);
+    const int f = fePrs_->gradN().extent_int(1);
+    const int p = fePrs_->gradN().extent_int(2);
+    const int d = fePrs_->gradN().extent_int(3);
+    // Initialize output hess
+    hess = scalar_view("hess",c,f);
+    // Compute cost integral
+    scalar_view gradV("gradV",c,p,d);
+    scalar_view awvel("awvel",c,p,d);
+    scalar_view valZ("valZ",c,p);
+    scalar_view alpha("alpha",c,p);
+    fePrs_->evaluateGradient(gradV, v_coeff);
+    feCtrl_->evaluateValue(valZ, z_coeff);
+    perm_->compute(alpha, valZ, fePrs_->cubPts(), 0);
+
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k) {
+          awvel(i,j,k) = alpha(i,j)*alpha(i,j)*weight_(i,j,k)*gradV(i,j,k);
+        }
+      }
+    }
+
+    fst::integrate(hess,awvel,fePrs_->gradNdetJ(),false);
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c = fePrs_->gradN().extent_int(0);
+    const int f = fePrs_->gradN().extent_int(1);
+    const int p = fePrs_->gradN().extent_int(2);
+    const int d = fePrs_->gradN().extent_int(3);
+    Real yvel = (z_param == ROL::nullPtr) ? yvel_ : (*z_param)[0];
+    // Initialize output grad
+    hess = scalar_view("hess",c,f);
+    // Compute cost integral
+    scalar_view gradU("gradU",c,p,d);
+    scalar_view awvel("awvel",c,p,d);
+    scalar_view valZ("valZ",c,p);
+    scalar_view valV("valV",c,p);
+    scalar_view alpha("alpha",c,p);
+    scalar_view dalpha("dalpha",c,p);
+    fePrs_->evaluateGradient(gradU, u_coeff);
+    feCtrl_->evaluateValue(valZ, z_coeff);
+    feCtrl_->evaluateValue(valV, v_coeff);
+    perm_->compute( alpha, valZ, fePrs_->cubPts(), 0);
+    perm_->compute(dalpha, valZ, fePrs_->cubPts(), 1);
+
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k) {
+          awvel(i,j,k)  = static_cast<Real>(2)*alpha(i,j)*gradU(i,j,k)+yvel*target_(i,j,k);
+          awvel(i,j,k) *= weight_(i,j,k)*dalpha(i,j)*valV(i,j);
+        }
+      }
+    }
+
+    fst::integrate(hess,awvel,fePrs_->gradNdetJ(),false);
+  }
+
+  void HessVec_13(scalar_view & hess,
+                  const ROL::Ptr<const std::vector<Real>> & v_param,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    if (z_param != ROL::nullPtr) {
+      // Get relevant dimensions
+      const int c = fePrs_->gradN().extent_int(0);
+      const int f = fePrs_->gradN().extent_int(1);
+      const int p = fePrs_->gradN().extent_int(2);
+      const int d = fePrs_->gradN().extent_int(3);
+      // Initialize output val
+      hess = scalar_view("hess",c,f);
+      // Compute cost integral
+      scalar_view wvel("wvel",c,p,d);
+      scalar_view valZ("valZ",c,p);
+      scalar_view alpha("alpha",c,p);
+      feCtrl_->evaluateValue(valZ, z_coeff);
+      perm_->compute(alpha, valZ, fePrs_->cubPts(), 0);
+
+      for (int i = 0; i < c; ++i) {
+        for (int j = 0; j < p; ++j) {
+          for (int k = 0; k < d; ++k) {
+            wvel(i,j,k) = weight_(i,j,k)*alpha(i,j)*target_(i,j,k)*(*v_param)[0];
+          }
+        }
+      }
+
+      fst::integrate(hess,wvel,fePrs_->gradNdetJ(),false);
+    }
+    else {
+      throw Exception::NotImplemented(">>> HessVec_13 not implemented.");
+    }
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c  = fePrs_->gradN().extent_int(0);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int p  = fePrs_->gradN().extent_int(2);
+    const int d  = fePrs_->gradN().extent_int(3);
+    Real yvel = (z_param == ROL::nullPtr) ? yvel_ : (*z_param)[0];
+    // Initialize output grad
+    hess = scalar_view("hess",c,fc);
+    // Compute cost integral
+    scalar_view gradU("gradU",c,p,d);
+    scalar_view gradV("gradV",c,p,d);
+    scalar_view valZ("valZ",c,p);
+    scalar_view alpha("alpha",c,p);
+    scalar_view dalpha("dalpha",c,p);
+    scalar_view deriv("deriv",c,p);
+    fePrs_->evaluateGradient(gradU, u_coeff);
+    fePrs_->evaluateGradient(gradV, v_coeff);
+    feCtrl_->evaluateValue(valZ, z_coeff);
+    perm_->compute(alpha, valZ, fePrs_->cubPts(), 0);
+    perm_->compute(dalpha, valZ, fePrs_->cubPts(), 1);
+
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k) {
+          deriv(i,j) += weight_(i,j,k)*(alpha(i,j)*gradU(i,j,k)+yvel*target_(i,j,k))
+                        *dalpha(i,j)*gradV(i,j,k);
+          deriv(i,j) += weight_(i,j,k)*dalpha(i,j)*gradU(i,j,k)*alpha(i,j)*gradV(i,j,k);
+        }
+      }
+    }
+
+    fst::integrate(hess,deriv,feCtrl_->NdetJ(),false);
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c  = fePrs_->gradN().extent_int(0);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int p  = fePrs_->gradN().extent_int(2);
+    const int d  = fePrs_->gradN().extent_int(3);
+    Real yvel = (z_param == ROL::nullPtr) ? yvel_ : (*z_param)[0];
+    // Initialize output hess
+    hess = scalar_view("hess",c,fc);
+    // Compute cost integral
+    scalar_view gradU("gradU",c,p,d);
+    scalar_view valZ("valZ",c,p);
+    scalar_view valV("valV",c,p);
+    scalar_view alpha("alpha",c,p);
+    scalar_view dalpha("dalpha",c,p);
+    scalar_view ddalpha("ddalpha",c,p);
+    scalar_view deriv("deriv",c,p);
+    fePrs_->evaluateGradient(gradU, u_coeff);
+    feCtrl_->evaluateValue(valZ, z_coeff);
+    feCtrl_->evaluateValue(valV, v_coeff);
+    perm_->compute(  alpha, valZ, fePrs_->cubPts(), 0);
+    perm_->compute( dalpha, valZ, fePrs_->cubPts(), 1);
+    perm_->compute(ddalpha, valZ, fePrs_->cubPts(), 2);
+
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k) {
+          deriv(i,j) += weight_(i,j,k)*(alpha(i,j)*gradU(i,j,k)+yvel*target_(i,j,k))
+                        *ddalpha(i,j)*valV(i,j)*gradU(i,j,k);
+          deriv(i,j) += weight_(i,j,k)*dalpha(i,j)*gradU(i,j,k)
+                        *dalpha(i,j)*valV(i,j)*gradU(i,j,k);
+        }
+      }
+    }
+
+    fst::integrate(hess,deriv,feCtrl_->NdetJ(),false);
+  }
+
+  void HessVec_23(scalar_view & hess,
+                  const ROL::Ptr<const std::vector<Real>> & v_param,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    if (z_param != ROL::nullPtr) {
+      // Get relevant dimensions
+      const int c  = fePrs_->gradN().extent_int(0);
+      const int fc = feCtrl_->N().extent_int(1);
+      const int p  = fePrs_->gradN().extent_int(2);
+      const int d  = fePrs_->gradN().extent_int(3);
+      // Initialize output val
+      hess = scalar_view("hess",c,fc);
+      // Compute cost integral
+      scalar_view gradU("gradU",c,p,d);
+      scalar_view wvel("wvel",c,p);
+      scalar_view valZ("valZ",c,p);
+      scalar_view dalpha("dalpha",c,p);
+      fePrs_->evaluateGradient(gradU, u_coeff);
+      feCtrl_->evaluateValue(valZ, z_coeff);
+      perm_->compute(dalpha, valZ, fePrs_->cubPts(), 1);
+
+      for (int i = 0; i < c; ++i) {
+        for (int j = 0; j < p; ++j) {
+          for (int k = 0; k < d; ++k) {
+            wvel(i,j) += weight_(i,j,k)*dalpha(i,j)*gradU(i,j,k)*target_(i,j,k)*(*v_param)[0];
+          }
+        }
+      }
+
+      fst::integrate(hess,wvel,feCtrl_->NdetJ(),false);
+    }
+    else {
+      throw Exception::Zero(">>> QoI_VelocityTracking_Darcy::HessVec_23 is zero.");
+    }
+  }
+
+  std::vector<Real> HessVec_31(std::vector<scalar_view> & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    if (z_param != ROL::nullPtr) {
+      // Get relevant dimensions
+      const int c = fePrs_->gradN().extent_int(0);
+      const int p = fePrs_->gradN().extent_int(2);
+      const int d = fePrs_->gradN().extent_int(3);
+      // Initialize output val
+      std::vector<Real> h_param(d,static_cast<Real>(0));
+      hess.clear(); hess.resize(d);
+      for (int k = 0; k < d; ++k)
+        hess[k] = scalar_view("hess",c);
+      // Compute cost integral
+      scalar_view gradV("gradV",c,p,d);
+      scalar_view wvel("wvel",c,p,d);
+      scalar_view valZ("valZ",c,p);
+      scalar_view alpha("alpha",c,p);
+      fePrs_->evaluateGradient(gradV, v_coeff);
+      feCtrl_->evaluateValue(valZ, z_coeff);
+      perm_->compute(alpha, valZ, fePrs_->cubPts(), 0);
+
+      for (int i = 0; i < c; ++i) {
+        for (int j = 0; j < p; ++j) {
+          for (int k = 0; k < d; ++k) {
+            wvel(i,j,k) = weight_(i,j,k)*alpha(i,j)*gradV(i,j,k);
+          }
+        }
+      }
+
+      fePrs_->computeIntegral(hess[0],wvel,target_);
+
+      return h_param;
+    }
+    else {
+      throw Exception::Zero(">>> QoI_VelocityTracking_Darcy::HessVec_31 is zero.");
+    }
+  }
+
+  std::vector<Real> HessVec_32(std::vector<scalar_view> & hess,
+                               const scalar_view v_coeff,
+                               const scalar_view u_coeff,
+                               const scalar_view z_coeff = scalar_view(),
+                               const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    if (z_param != ROL::nullPtr) {
+      // Get relevant dimensions
+      const int c = fePrs_->gradN().extent_int(0);
+      const int p = fePrs_->gradN().extent_int(2);
+      const int d = fePrs_->gradN().extent_int(3);
+      // Initialize output val
+      std::vector<Real> h_param(d,static_cast<Real>(0));
+      hess.clear(); hess.resize(d);
+      for (int k = 0; k < d; ++k)
+        hess[k] = scalar_view("hess",c);
+      // Compute cost integral
+      scalar_view gradU("gradU",c,p,d);
+      scalar_view wvel("wvel",c,p,d);
+      scalar_view valZ("valZ",c,p);
+      scalar_view valV("valV",c,p);
+      scalar_view dalpha("dalpha",c,p);
+      fePrs_->evaluateGradient(gradU, u_coeff);
+      feCtrl_->evaluateValue(valZ, z_coeff);
+      feCtrl_->evaluateValue(valV, v_coeff);
+      perm_->compute(dalpha, valZ, fePrs_->cubPts(), 1);
+
+      for (int i = 0; i < c; ++i) {
+        for (int j = 0; j < p; ++j) {
+          for (int k = 0; k < d; ++k) {
+            wvel(i,j,k) = weight_(i,j,k)*dalpha(i,j)*gradU(i,j,k)*valV(i,j);
+          }
+        }
+      }
+
+      fePrs_->computeIntegral(hess[0],wvel,target_);
+
+      return h_param;
+    }
+    else {
+      throw Exception::Zero(">>> QoI_VelocityTracking_Darcy::HessVec_32 is zero.");
+    }
+  }
+
+  std::vector<Real> HessVec_33(std::vector<scalar_view> & hess,
+                               const ROL::Ptr<const std::vector<Real>> & v_param,
+                               const scalar_view u_coeff,
+                               const scalar_view z_coeff = scalar_view(),
+                               const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    if (z_param != ROL::nullPtr) {
+      // Get relevant dimensions
+      const int c = fePrs_->gradN().extent_int(0);
+      const int p = fePrs_->gradN().extent_int(2);
+      const int d = fePrs_->gradN().extent_int(3);
+      // Initialize output val
+      std::vector<Real> h_param(d,static_cast<Real>(0));
+      hess.clear(); hess.resize(d);
+      for (int k = 0; k < d; ++k)
+        hess[k] = scalar_view("hess",c);
+      // Compute cost integral
+      scalar_view wtarget("wtarget",c,p,d);
+      for (int i = 0; i < c; ++i) {
+        for (int j = 0; j < p; ++j) {
+          for (int k = 0; k < d; ++k) {
+            wtarget(i,j,k) = weight_(i,j,k)*target_(i,j,k);
+          }
+        }
+      }
+      fePrs_->computeIntegral(hess[0],wtarget,target_);
+      rst::scale(hess[0],(*v_param)[0]);
+
+      return h_param;
+    }
+    else {
+      throw Exception::Zero(">>> QoI_VelocityTracking_Darcy::HessVec_33 is zero.");
+    }
+  }
+
+}; // QoI_VelocityTracking_Darcy
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/flow-opt/3dim/models/darcy/pde_darcyK.hpp b/packages/rol/example/PDE-OPT/flow-opt/3dim/models/darcy/pde_darcyK.hpp
new file mode 100644
index 000000000000..65abb5021c4d
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/flow-opt/3dim/models/darcy/pde_darcyK.hpp
@@ -0,0 +1,826 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  pde_darcy.hpp
+    \brief Implements the local PDE interface for the Darcy porosity optimization problem.
+*/
+
+#ifndef PDE_DARCYK_HPP
+#define PDE_DARCYK_HPP
+
+#include "../../../../TOOLS/pdeK.hpp"
+#include "../../../../TOOLS/feK.hpp"
+#include "../../../../TOOLS/fieldhelperK.hpp"
+#include "permeabilityK.hpp"
+
+#include "Intrepid2_HVOL_C0_FEM.hpp"
+#include "Intrepid2_HGRAD_QUAD_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_QUAD_C2_FEM.hpp"
+#include "Intrepid2_HGRAD_TRI_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_TRI_C2_FEM.hpp"
+#include "Intrepid2_DefaultCubatureFactory.hpp"
+#include "Intrepid2_FunctionSpaceTools.hpp"
+
+#include "ROL_Ptr.hpp"
+
+template <class Real, class DeviceType>
+class PDE_Darcy : public PDE<Real, DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  // Finite element basis information
+  basis_ptr basisPtrPrs_, basisPtrCtrl_;
+  std::vector<basis_ptr> basisPtrs_, basisPtrsCtrl_;
+  // Cell cubature information
+  ROL::Ptr<Intrepid2::Cubature<DeviceType,Real,Real>> cellCub_, bdryCub_;
+  // Cell node information
+  scalar_view volCellNodes_;
+  std::vector<std::vector<scalar_view>> bdryCellNodes_;
+  std::vector<std::vector<std::vector<int>>> bdryCellLocIds_;
+  // Finite element definition
+  ROL::Ptr<fe_type> fePrs_, feCtrl_;
+  std::vector<std::vector<ROL::Ptr<fe_type>>> fePrsBdry_, feCtrlBdry_;
+  // Local degrees of freedom on boundary, for each side of the reference cell (first index).
+  std::vector<std::vector<int>> fpidx_;
+  // Field pattern, offsets, etc.
+  std::vector<std::vector<int>> fieldPattern_;  // local Field/DOF pattern; set from DOF manager 
+  int numFields_;                               // number of fields (equations in the PDE)
+  int numDofs_;                                 // total number of degrees of freedom for all (local) fields
+  std::vector<int> offset_;                     // for each field, a counting offset
+  std::vector<int> numFieldDofs_;               // for each field, number of degrees of freedom
+  std::vector<std::vector<int>> fieldPatternCtrl_;  // local Field/DOF pattern; set from DOF manager 
+  int numFieldsCtrl_;                               // number of fields (equations in the PDE)
+  int numDofsCtrl_;                                 // total number of degrees of freedom for all (local) fields
+  std::vector<int> offsetCtrl_;                     // for each field, a counting offset
+  std::vector<int> numFieldDofsCtrl_;               // for each field, number of degrees of freedom
+  
+  // Problem parameters.
+  Real Patm_, inVelocity_, dynvisco_;
+  bool useNoSlip_;
+
+  ROL::Ptr<Permeability<Real,DeviceType>> perm_;
+  ROL::Ptr<FieldUtils::FieldInfo> fieldInfo_, fieldInfoCtrl_;
+
+  void multiplyByRadius(scalar_view &input,
+                        const scalar_view cubPts,
+                        bool isField, bool useReciprocal = false) const {
+    int c = cubPts.extent_int(0);
+    int p = cubPts.extent_int(1);
+    scalar_view r("r",c,p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        if (!useReciprocal) r(i,j) = cubPts(i,j,0);
+        else                r(i,j) = static_cast<Real>(1)/cubPts(i,j,0);
+      }
+    }
+    scalar_view in(input);
+    if (!isField)
+      fst::scalarMultiplyDataData(input,r,in);
+    else
+      fst::scalarMultiplyDataField(input,r,in);
+  }
+
+public:
+  PDE_Darcy(ROL::ParameterList &parlist) : Patm_(101.325 /* kg/mm-s^2 */) {
+    // Finite element fields.
+    int cubDegree     = parlist.sublist("Problem").get("Cubature Degree",4);
+    int bdryCubDegree = parlist.sublist("Problem").get("Boundary Cubature Degree",2);
+    int basisDegPres  = parlist.sublist("Problem").get("Pressure Basis Degree",1);
+    int basisDegCtrl  = parlist.sublist("Problem").get("Density Basis Degree",1);
+    std::string elemType = parlist.sublist("Problem").get("Element Type","TET");
+    if (elemType == "TET") {
+      if (basisDegPres == 2) {
+        basisPtrPrs_ = ROL::makePtr<Intrepid2::Basis_HGRAD_TET_C2_FEM<DeviceType,Real,Real>>();
+      }
+      else {
+        std::cout << "\nFirst-order TET basis selected.\n";
+        basisPtrPrs_ = ROL::makePtr<Intrepid2::Basis_HGRAD_TET_C1_FEM<DeviceType,Real,Real>>();
+      }
+    }
+    else {
+      if (basisDegPres == 2) {
+        basisPtrPrs_ = ROL::makePtr<Intrepid2::Basis_HGRAD_HEX_C2_FEM<DeviceType,Real,Real>>();
+      }
+      else {
+        basisPtrPrs_ = ROL::makePtr<Intrepid2::Basis_HGRAD_HEX_C1_FEM<DeviceType,Real,Real>>();
+      }
+    }
+    basisPtrs_.push_back(basisPtrPrs_); // Pressure component
+
+    shards::CellTopology cellType = basisPtrs_[0]->getBaseCellTopology(); // get the cell type from any basis
+    if (basisDegCtrl == 1) {
+      if (elemType == "TET")
+        basisPtrCtrl_ = ROL::makePtr<Intrepid2::Basis_HGRAD_HEX_C1_FEM<DeviceType,Real,Real>>();
+      else
+        basisPtrCtrl_ = ROL::makePtr<Intrepid2::Basis_HGRAD_TET_C1_FEM<DeviceType,Real,Real>>();
+    }
+    else {
+      basisPtrCtrl_ = ROL::makePtr<Intrepid2::Basis_HVOL_C0_FEM<DeviceType,Real,Real>>(cellType);
+    }
+    basisPtrsCtrl_.push_back(basisPtrCtrl_); // Control component
+    // Quadrature rules.
+    Intrepid2::DefaultCubatureFactory cubFactory;                            // create cubature factory
+    cellCub_ = cubFactory.create<DeviceType,Real,Real>(cellType, cubDegree); // create default cubature
+    shards::CellTopology bdryCellType = cellType.getCellTopologyData(1, 0);
+    bdryCub_ = cubFactory.create<DeviceType,Real,Real>(bdryCellType, bdryCubDegree);
+
+    // Other problem parameters.
+    dynvisco_     = parlist.sublist("Problem").get("Dynamic Viscosity", 0.84e-8); // kg/mm-s
+    Real density  = parlist.sublist("Problem").get("Fluid Density", 8.988e-11);   // kg/mm^3
+    Real inRadius = parlist.sublist("Problem").get("Inlet Radius", 0.6875);       // mm
+    Real Reynolds = parlist.sublist("Problem").get("Reynolds Number", 50.0);      // dimensionless
+    inVelocity_ = Reynolds * dynvisco_ / (density*static_cast<Real>(2)*inRadius); // mm/s
+    perm_       = ROL::makePtr<Permeability<Real,DeviceType>>(parlist);
+    useNoSlip_  = parlist.sublist("Problem").get("Use No Slip", false);
+
+    numDofs_ = 0;
+    numFields_ = basisPtrs_.size();
+    offset_.resize(numFields_);
+    numFieldDofs_.resize(numFields_);
+    for (int i=0; i<numFields_; ++i) {
+      offset_[i] = i==0 ? 0 : offset_[i-1] + basisPtrs_[i-1]->getCardinality();
+      numFieldDofs_[i] = basisPtrs_[i]->getCardinality();
+      numDofs_ += numFieldDofs_[i];
+    }
+    numDofsCtrl_ = 0;
+    numFieldsCtrl_ = basisPtrsCtrl_.size();
+    offsetCtrl_.resize(numFieldsCtrl_);
+    numFieldDofsCtrl_.resize(numFieldsCtrl_);
+    for (int i=0; i<numFieldsCtrl_; ++i) {
+      offsetCtrl_[i] = i==0 ? 0 : offsetCtrl_[i-1] + basisPtrsCtrl_[i-1]->getCardinality();
+      numFieldDofsCtrl_[i] = basisPtrsCtrl_[i]->getCardinality();
+      numDofsCtrl_ += numFieldDofsCtrl_[i];
+    }
+  }
+
+  void residual(scalar_view & res,
+                const scalar_view u_coeff,
+                const scalar_view z_coeff = scalar_view(),
+                const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c = fePrs_->gradN().extent_int(0);
+    const int f = fePrs_->gradN().extent_int(1);
+    const int p = fePrs_->gradN().extent_int(2);
+    const int d = fePrs_->gradN().extent_int(3);
+    // Initialize residuals.
+    res = scalar_view("res", c, f);
+    // Evaluate/interpolate finite element fields on cells.
+    scalar_view gradP("gradP", c, p, d);
+    scalar_view AgradP("AgradP", c, p, d);
+    scalar_view valCtrl("valCtrl", c, p);
+    scalar_view alpha("alpha", c, p);
+    fePrs_->evaluateGradient(gradP, u_coeff);
+    feCtrl_->evaluateValue(valCtrl, z_coeff);
+    perm_->compute(alpha, valCtrl, fePrs_->cubPts(), 0);
+    fst::scalarMultiplyDataData(AgradP,alpha,gradP);
+    /*** Evaluate weak form of the residual. ***/
+    //multiplyByRadius(AgradP,fePrs_->cubPts(),false);
+    fst::integrate(res,AgradP,fePrs_->gradNdetJ(),false);
+    // Boundary conditions.
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      // APPLY DIRICHLET CONDITIONS
+      for (int i = 0; i < numSideSets; ++i) {
+        if (i==0) /* inflow condition */ {
+          const int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            const int numCellsSide = bdryCellLocIds_[i][j].size();
+            if (numCellsSide > 0) {
+              const int numCubPerSide = fePrsBdry_[i][j]->cubPts().extent_int(1);
+              scalar_view nRes("nRes", numCellsSide, f);
+              scalar_view nVal("nVal", numCellsSide, numCubPerSide);
+	      Kokkos::deep_copy(nVal, -inVelocity_);
+              //multiplyByRadius(nVal,fePrsBdry_[i][j]->cubPts(),false);
+              fst::integrate(nRes,nVal,fePrsBdry_[i][j]->NdetJ(),false);
+              for (int k = 0; k < numCellsSide; ++k) {
+                const int cidx = bdryCellLocIds_[i][j][k];
+                for (int l = 0; l < f; ++l) {
+                  res(cidx,l) += nRes(k,l);
+                }
+              }
+            }
+          }
+        }
+        else if (i==1) /* outflow condition */ {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numBdryDofs = fpidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                res(cidx,fpidx_[j][l]) = u_coeff(cidx,fpidx_[j][l]) - Patm_;
+              }
+            }
+          }
+        }
+        else if (useNoSlip_ && i==2) /* do-nothing condition */ {}
+      }
+    }
+  }
+
+  void Jacobian_1(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c = fePrs_->gradN().extent_int(0);
+    const int f = fePrs_->gradN().extent_int(1);
+    const int p = fePrs_->gradN().extent_int(2);
+    const int d = fePrs_->gradN().extent_int(3);
+    // Initialize Jacobians.
+    jac = scalar_view("jac", c, f, f);
+    // Evaluate/interpolate finite element fields on cells.
+    scalar_view valCtrl("valCtrl", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view alphaPhi("alphaPhi", c, f, p, d);
+    feCtrl_->evaluateValue(valCtrl, z_coeff);
+    perm_->compute(alpha, valCtrl, fePrs_->cubPts(), 0);
+    // Multiply velocity with alpha
+    fst::scalarMultiplyDataField(alphaPhi, alpha, fePrs_->gradN());
+    /*** Evaluate weak form of the Jacobian. ***/
+    //multiplyByRadius(alphaPhi,fePrs_->cubPts(),true);
+    fst::integrate(jac,alphaPhi,fePrs_->gradNdetJ(),false);
+    // APPLY DIRICHLET CONDITIONS
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        if (i==0) /* inflow condition */ {}
+        else if (i==1) /* outflow condition */ {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numBdryDofs = fpidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                for (int m = 0; m < f; ++m) {
+                  jac(cidx,fpidx_[j][l],m) = static_cast<Real>(0);
+                }
+                jac(cidx,fpidx_[j][l],fpidx_[j][l]) = static_cast<Real>(1);
+              }
+            }
+          }
+        }
+        else if (useNoSlip_ && i==2) /* do-nothing condition */ {}
+      }
+    }
+  }
+
+  void Jacobian_2(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c  = fePrs_->gradN().extent_int(0);
+    const int f  = fePrs_->gradN().extent_int(1);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int p  = fePrs_->gradN().extent_int(2);
+    const int d  = fePrs_->gradN().extent_int(3);
+    // Initialize Jacobians.
+    jac = scalar_view("jac", c, f, fc);
+    // Evaluate on FE basis.
+    scalar_view valZ("valZ", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view alphaN("alphaN", c, fc, p);
+    scalar_view gradP("gradP", c, p, d);
+    scalar_view gradPgradN("gradPgradN", c, f, p);
+    fePrs_->evaluateGradient(gradP, u_coeff);
+    feCtrl_->evaluateValue(valZ, z_coeff);
+    perm_->compute(alpha, valZ, fePrs_->cubPts(), 1);
+    fst::scalarMultiplyDataField(alphaN,alpha,feCtrl_->N());
+    fst::dotMultiplyDataField(gradPgradN,gradP,fePrs_->gradNdetJ());
+    /*** Evaluate weak form of the residual. ***/
+    //multiplyByRadius(gradPgradN,fePrs_->cubPts(),true);
+    fst::integrate(jac,gradPgradN,alphaN,false);
+    // APPLY DIRICHLET CONDITIONS
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        if (i==0) /* inflow condition */ {}
+        else if (i==1) /* outflow condition*/ {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numBdryDofs = fpidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                for (int m=0; m < fc; ++m) {
+                  jac(cidx,fpidx_[j][l],m) = static_cast<Real>(0);
+                }
+              }
+            }
+          }
+        }
+        else if (useNoSlip_ && i==2) /* do-nothing condition */ {}
+      }
+    }
+  }
+
+  void Jacobian_3(std::vector<scalar_view> & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> Jacobian_3 is zero!");
+  }
+
+  void Hessian_11(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> Hessian_11 is zero!");
+  }
+
+  void Hessian_12(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c  = fePrs_->gradN().extent_int(0);
+    const int f  = fePrs_->gradN().extent_int(1);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int p  = fePrs_->gradN().extent_int(2);
+    const int d  = fePrs_->gradN().extent_int(3);
+    // Initialize hessians.
+    hess = scalar_view("hess", c, fc, f);
+    // Apply Dirichlet conditions
+    scalar_view l0_coeff("l0_coeff", c, f);
+    Kokkos::deep_copy(l0_coeff, l_coeff);
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        if (i==0) /* inflow condition */ {}
+        else if (i==1) /* outflow condition */ {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numBdryDofs = fpidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                l0_coeff(cidx,fpidx_[j][l]) = static_cast<Real>(0);
+              }
+            }
+          }
+        }
+        else if (useNoSlip_ && i==2) /* do-nothing condition */ {}
+      }
+    }
+    // Evaluate/interpolate finite element fields on cells.
+    scalar_view valZ("valZ", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view gradL("gradL", c, p, d);
+    scalar_view alphaL("alphaL", c, fc, p, d);
+    fePrs_->evaluateGradient(gradL, l0_coeff);
+    feCtrl_->evaluateValue(valZ, z_coeff);
+    perm_->compute(alpha, valZ, fePrs_->cubPts(), 1);
+    /*** Evaluate weak form of the Hessian. ***/
+    // Multiply velocity with alpha
+    for (int j = 0; j < c; ++j) {
+      for (int k = 0; k < fc; ++k) {
+        for (int l = 0; l < p; ++l) {
+          for (int m = 0; m < d; ++m) {
+            alphaL(j,k,l,m) = alpha(j,l) * (feCtrl_->N())(j,k,l) * gradL(j,l,m);
+          }
+        }
+      }
+    }
+    //multiplyByRadius(alphaL,fePrs_->cubPts(),true);
+    fst::integrate(hess,alphaL,fePrs_->gradNdetJ(),false);
+  }
+
+  void Hessian_13(std::vector<scalar_view> & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> Hessian_13 is zero!");
+  }
+
+  void Hessian_21(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c  = fePrs_->gradN().extent_int(0);
+    const int f  = fePrs_->gradN().extent_int(1);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int p  = fePrs_->gradN().extent_int(2);
+    const int d  = fePrs_->gradN().extent_int(3);
+    // Initialize hessians.
+    hess = scalar_view("hess", c, f, fc);
+    // Apply Dirichlet conditions
+    scalar_view l0_coeff("l0_coeff", c, f);
+    Kokkos::deep_copy(l0_coeff, l_coeff);
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        if (i==0) /* inflow condition*/ {}
+        else if (i==1) /* outflow condition */ {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numBdryDofs = fpidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                l0_coeff(cidx,fpidx_[j][l]) = static_cast<Real>(0);
+              }
+            }
+          }
+        }
+        else if (useNoSlip_ && i==2) /* do-nothing condition */ {}
+      }
+    }
+    // Evaluate/interpolate finite element fields on cells.
+    scalar_view valZ("valZ", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view gradL("gradL", c, p, d);
+    scalar_view alphaL("alphaL", c, fc, p, d);
+    fePrs_->evaluateGradient(gradL, l0_coeff);
+    feCtrl_->evaluateValue(valZ, z_coeff);
+    perm_->compute(alpha, valZ, fePrs_->cubPts(), 1);
+    /*** Evaluate weak form of the Hessian. ***/
+    for (int j = 0; j < c; ++j) {
+      for (int k = 0; k < fc; ++k) {
+        for (int l = 0; l < p; ++l) {
+          for (int m = 0; m < d; ++m) {
+            alphaL(j,k,l,m) = alpha(j,l) * feCtrl_->N()(j,k,l) * gradL(j,l,m);
+          }
+        }
+      }
+    }
+    //multiplyByRadius(alphaL,fePrs_->cubPts(),true);
+    fst::integrate(hess,fePrs_->gradNdetJ(),alphaL,false);
+  }
+
+  void Hessian_22(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c  = fePrs_->gradN().extent_int(0);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int p  = fePrs_->gradN().extent_int(2);
+    const int d  = fePrs_->gradN().extent_int(3);
+    // Initialize hessians.
+    hess = scalar_view("hess", c, fc, fc);
+    // Apply Dirichlet conditions
+    scalar_view l0_coeff("l0_coeff", c, p);
+    Kokkos::deep_copy(l0_coeff,l_coeff);
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        if (i==0) /* inflow condition */ {}
+        else if (i==1) /* outflow condition */ {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numBdryDofs = fpidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                l0_coeff(cidx,fpidx_[j][l]) = static_cast<Real>(0);
+              }
+            }
+          }
+        }
+        else if (useNoSlip_ && i==2) /* do-nothing condition */ {}
+      }
+    }
+    // Evaluate/interpolate finite element fields on cells.
+    scalar_view valZ("valZ", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view alphaUL("alphaUL", c, fc, p);
+    scalar_view gradL("gradL", c, p, d);
+    scalar_view gradU("gradU", c, p, d);
+    fePrs_->evaluateGradient(gradL, l0_coeff);
+    fePrs_->evaluateGradient(gradU, u_coeff);
+    feCtrl_->evaluateValue(valZ, z_coeff);
+    perm_->compute(alpha, valZ, fePrs_->cubPts(), 2);
+    // Multiply velocity with alpha
+    Real dot(0);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < fc; ++j) {
+        for (int k = 0; k < p; ++k) {
+          dot = static_cast<Real>(0);
+          for (int l = 0; l < d; ++l) {
+            dot += gradU(i,k,l) * gradL(i,k,l);
+          }
+          alphaUL(i,j,k) = alpha(i,k) * (feCtrl_->N())(i,j,k) * dot;
+        }
+      }
+    }
+    /*** Evaluate weak form of the Hessian. ***/
+    // Velocity equation.
+    //multiplyByRadius(alphaUL,fePrs_->cubPts(),true);
+    fst::integrate(hess,alphaUL,feCtrl_->NdetJ(),false);
+  }
+
+  void Hessian_23(std::vector<scalar_view> & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> Hessian_23 is zero!");
+  }
+
+  void Hessian_31(std::vector<scalar_view> & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> Hessian_31 is zero!");
+  }
+
+  void Hessian_32(std::vector<scalar_view> & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> Hessian_32 is zero!");
+  }
+
+  void Hessian_33(std::vector<std::vector<scalar_view>> & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> Hessian_33 is zero!");
+  }
+
+  void RieszMap_1(scalar_view & riesz) override {
+    // Retrieve dimensions.
+    const int c = fePrs_->gradN().extent_int(0);
+    const int f = fePrs_->gradN().extent_int(1);
+    //const int p = fePrs_->gradN().extent_int(2);
+    //const int d = fePrs_->gradN().extent_int(3);
+    // Initialize Jacobians.
+    riesz = scalar_view("riesz", c, f, f);
+    Kokkos::deep_copy(riesz, fePrs_->stiffMat());
+    rst::add(riesz, fePrs_->massMat());
+    // Evaluate/interpolate finite element fields on cells.
+    //scalar_view valPhi ("valPhi", c, f, p);
+    //scalar_view gradPhi("gradPhi", c, f, p, d);
+    //Kokkos::deep_copy(valPhi, fePrs_->N());
+    //Kokkos::deep_copy(gradPhi, fePrs_->gradN());
+    ///*** Evaluate weak form of the Jacobian. ***/
+    ////multiplyByRadius(gradPhi,fePrs_->cubPts(),true);
+    ////multiplyByRadius(valPhi,fePrs_->cubPts(),true);
+    //fst::integrate(*riesz,
+    //                                              *gradPhi,             // alpha dPhi/dx
+    //                                              *fePrs_->gradNdetJ(), // dPhi/dx
+    //                                              Intrepid::COMP_CPP,
+    //                                              false);
+    //fst::integrate(*riesz,
+    //                                              *valPhi,              // alpha dPhi/dx
+    //                                              *fePrs_->NdetJ(),     // dPhi/dx
+    //                                              Intrepid::COMP_CPP,
+    //                                              true);
+  }
+
+  void RieszMap_2(scalar_view & riesz) override {
+    // Retrieve dimensions.
+    const int c  = feCtrl_->gradN().extent_int(0);
+    const int f  = feCtrl_->gradN().extent_int(1);
+    //const int p  = feCtrl_->gradN().extent_int(2);
+    // Initialize Jacobians.
+    riesz = scalar_view("riesz", c, f, f);
+    rst::add(riesz, feCtrl_->massMat());
+    // Evaluate on FE basis.
+    //scalar_view valPsi;
+    //valPsi  = scalar_view(c, f, p);
+    //*valPsi = *feCtrl_->N();
+    ///*** Evaluate weak form of the residual. ***/
+    ////multiplyByRadius(valPsi,feCtrl_->cubPts(),true);
+    //fst::integrate(*riesz,
+    //                                              *valPsi,           // Psi
+    //                                              *feCtrl_->NdetJ(), // Psi
+    //                                              Intrepid::COMP_CPP,
+    //                                              false);
+  }
+
+  std::vector<basis_ptr> getFields() override {
+    return basisPtrs_;
+  }
+
+  std::vector<basis_ptr> getFields2() override {
+    return basisPtrsCtrl_;
+  }
+
+  void setCellNodes(const scalar_view &volCellNodes,
+                    const std::vector<std::vector<scalar_view>> &bdryCellNodes,
+                    const std::vector<std::vector<std::vector<int>>> &bdryCellLocIds) override {
+    volCellNodes_ = volCellNodes;
+    bdryCellNodes_ = bdryCellNodes;
+    bdryCellLocIds_ = bdryCellLocIds;
+    // Finite element definition.
+    fePrs_  = ROL::makePtr<fe_type>(volCellNodes_,basisPtrPrs_,cellCub_);
+    feCtrl_ = ROL::makePtr<fe_type>(volCellNodes_,basisPtrCtrl_,cellCub_,false);
+    fpidx_  = fePrs_->getBoundaryDofs();
+    // Construct control boundary FE
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      fePrsBdry_.resize(numSideSets);
+      feCtrlBdry_.resize(numSideSets);
+      for (int i = 0; i < numSideSets; ++i) {
+        int numLocSides = bdryCellNodes[i].size();
+        fePrsBdry_[i].resize(numLocSides);
+        feCtrlBdry_[i].resize(numLocSides);
+        for (int j = 0; j < numLocSides; ++j) {
+          if (bdryCellNodes[i][j] != scalar_view()) {
+            fePrsBdry_[i][j]  = ROL::makePtr<fe_type>(bdryCellNodes[i][j],basisPtrPrs_,bdryCub_,j);
+            feCtrlBdry_[i][j] = ROL::makePtr<fe_type>(bdryCellNodes[i][j],basisPtrCtrl_,bdryCub_,j);
+          }
+        }
+      }
+    }
+  }
+
+  void setFieldPattern(const std::vector<std::vector<int>> &fieldPattern,
+                       const std::vector<std::vector<int>> &fieldPattern2) override {
+    fieldPattern_ = fieldPattern;
+    fieldInfo_    = ROL::makePtr<FieldUtils::FieldInfo>(numFields_,numDofs_,numFieldDofs_,fieldPattern_);
+    fieldPatternCtrl_ = fieldPattern2;
+    fieldInfoCtrl_    = ROL::makePtr<FieldUtils::FieldInfo>(numFieldsCtrl_,numDofsCtrl_,numFieldDofsCtrl_,fieldPatternCtrl_);
+  }
+
+  void printData(std::string tag,
+                 const scalar_view u_coeff,
+                 const scalar_view z_coeff = scalar_view(),
+                 const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c = fePrs_->gradN().extent_int(0);
+    const int p = fePrs_->gradN().extent_int(2);
+    const int d = fePrs_->gradN().extent_int(3);
+    // Evaluate/interpolate finite element fields on cells.
+    scalar_view gradP("gradP", c, p, d);
+    scalar_view AgradP("AgradP", c, p, d);
+    scalar_view valCtrl("valCtrl", c, p);
+    scalar_view alpha("alpha", c, p);
+    fePrs_->evaluateGradient(gradP, u_coeff);
+    feCtrl_->evaluateValue(valCtrl, z_coeff);
+    perm_->compute(alpha, valCtrl, fePrs_->cubPts(), 0);
+    fst::scalarMultiplyDataData(AgradP,alpha,gradP);
+    // Print to velocity file
+    std::stringstream nameVel;
+    nameVel << "velocity" << tag << ".txt";
+    std::ofstream fileVel;
+    fileVel.open(nameVel.str());
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        fileVel << std::scientific << std::setprecision(15);
+        for (int k = 0; k < d; ++k) fileVel << std::setw(25) << (fePrs_->cubPts())(i,j,k);
+        for (int k = 0; k < d; ++k) fileVel << std::setw(25) << -AgradP(i,j,k);
+        fileVel << std::endl;
+      }
+    }
+    fileVel.close();
+    // Print to permeability file
+    std::stringstream namePer;
+    namePer << "permeability" << tag << ".txt";
+    std::ofstream filePer;
+    filePer.open(namePer.str());
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        filePer << std::scientific << std::setprecision(15);
+        for (int k = 0; k < d; ++k) filePer << std::setw(25) << (fePrs_->cubPts())(i,j,k);
+        filePer << std::setw(25) << dynvisco_*alpha(i,j);
+        filePer << std::endl;
+      }
+    }
+    filePer.close();
+  }
+
+  void printCellAverages(std::string tag,
+                 const scalar_view u_coeff,
+                 const scalar_view z_coeff = scalar_view(),
+                 const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c = fePrs_->gradN().extent_int(0);
+    const int p = fePrs_->gradN().extent_int(2);
+    const int d = fePrs_->gradN().extent_int(3);
+    // Evaluate/interpolate finite element fields on cells.
+    scalar_view gradP("gradP", c, p, d);
+    scalar_view AgradP("AgradP", c, p, d);
+    scalar_view velX("velX", c, p);
+    scalar_view velY("velY", c, p);
+    scalar_view velZ("velZ", c, p);
+    scalar_view valCtrl("valCtrl", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view onesVec("onesVec", c, p);
+    scalar_view cellVol("cellVol", c);
+    scalar_view avgX("avgX", c);
+    scalar_view avgY("avgY", c);
+    scalar_view avgZ("avgZ", c);
+    scalar_view avgP("avgP", c);
+    fePrs_->evaluateGradient(gradP, u_coeff);
+    feCtrl_->evaluateValue(valCtrl, z_coeff);
+    Kokkos::deep_copy(onesVec, static_cast<Real>(1));
+    perm_->compute(alpha, valCtrl, fePrs_->cubPts(), 0);
+    fst::scalarMultiplyDataData(AgradP,alpha,gradP);
+    // Extract velocity components.
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        velX(i,j) = -AgradP(i,j,0);
+        velY(i,j) = -AgradP(i,j,1);
+        velZ(i,j) = -AgradP(i,j,2);
+      }
+    }
+    // Compute cell volumes.
+    fst::integrate(cellVol,onesVec,onesVec,false);
+    // Compute velocity X cell average.
+    fst::integrate(avgX,velX,onesVec,false);
+    for (int i = 0; i < c; ++i) avgX(i) /= cellVol(i);
+    // Compute velocity Y cell average.
+    fst::integrate(avgY,velY,onesVec,false);
+    for (int i = 0; i < c; ++i) avgY(i) /= cellVol(i);
+    // Compute velocity Z cell average.
+    fst::integrate(avgZ,velZ,onesVec,false);
+    for (int i = 0; i < c; ++i) avgZ(i) /= cellVol(i);
+    // Compute permeability cell average.
+    fst::integrate(avgP,alpha,onesVec,false);
+    for (int i = 0; i < c; ++i) avgP(i) /= cellVol(i);
+    rst::scale(avgP, dynvisco_);
+
+    // Print to velocity file
+    std::stringstream nameVel;
+    nameVel << "velocity" << tag << ".txt";
+    std::ofstream fileVel;
+    fileVel.open(nameVel.str());
+    for (int i = 0; i < c; ++i) {
+      fileVel << std::scientific << std::setprecision(15);
+      fileVel << std::setw(25) << avgX(i);
+      fileVel << std::setw(25) << avgY(i);
+      fileVel << std::setw(25) << avgZ(i);
+      fileVel << std::endl;
+    }
+    fileVel.close();
+    // Print to permeability file
+    std::stringstream namePer;
+    namePer << "permeability" << tag << ".txt";
+    std::ofstream filePer;
+    filePer.open(namePer.str());
+    for (int i = 0; i < c; ++i) {
+      filePer << std::scientific << std::setprecision(15);
+      filePer << std::setw(25) << avgP(i);
+      filePer << std::endl;
+    }
+    filePer.close();
+  }
+
+  const ROL::Ptr<fe_type> getPressureFE(void) const {
+    return fePrs_;
+  }
+
+  const ROL::Ptr<fe_type> getControlFE(void) const {
+    return feCtrl_;
+  }
+
+  const std::vector<ROL::Ptr<fe_type>> getPressureBdryFE(const int sideset = -1) const {
+    int side = (sideset < 0 || sideset > 3) ? 0 : sideset;
+    return fePrsBdry_[side];
+  }
+
+  const std::vector<ROL::Ptr<fe_type>> getControlBdryFE(const int sideset = -1) const {
+    int side = (sideset < 0 || sideset > 3) ? 0 : sideset;
+    return feCtrlBdry_[side];
+  }
+
+  const std::vector<std::vector<int>> getBdryCellLocIds(const int sideset = -1) const {
+    int side = (sideset < 0 || sideset > 3) ? 0 : sideset;
+    return bdryCellLocIds_[side];
+  }
+
+  const ROL::Ptr<Permeability<Real,DeviceType>> getPermeability(void) const {
+    return perm_;
+  }
+
+  const ROL::Ptr<const FieldUtils::FieldInfo> getStateFieldInfo(void) const {
+    return fieldInfo_;
+  }
+
+  const ROL::Ptr<const FieldUtils::FieldInfo> getControlFieldInfo(void) const {
+    return fieldInfoCtrl_;
+  }
+
+}; // PDE_Darcy
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/flow-opt/3dim/models/darcy/permeabilityK.hpp b/packages/rol/example/PDE-OPT/flow-opt/3dim/models/darcy/permeabilityK.hpp
new file mode 100644
index 000000000000..acfc748947b5
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/flow-opt/3dim/models/darcy/permeabilityK.hpp
@@ -0,0 +1,93 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+#ifndef PERMEABILITYK_HPP
+#define PERMEABILITYK_HPP
+
+#include <vector>
+#include "ROL_ParameterList.hpp"
+#include "ROL_Ptr.hpp"
+#include "../../../../TOOLS/feK.hpp"
+
+template<class Real, class DeviceType>
+class Permeability {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+
+private:
+  Real dRadius_, viscosity_, minPerm_, maxPerm_;
+  int type_;
+
+  Real value(const Real z) const {
+    if (type_==1) {
+      const Real a(minPerm_/viscosity_), b(std::log(maxPerm_/minPerm_));
+      return a * std::exp(b * z);
+    }
+    else {
+      const Real a(minPerm_/viscosity_), b(maxPerm_/viscosity_);
+      return a + z * (b - a);
+    }
+  }
+
+  Real deriv1(const Real z) const {
+    if (type_==1) {
+      const Real a(minPerm_/viscosity_), b(std::log(maxPerm_/minPerm_));
+      return a * b * std::exp(b * z);
+    }
+    else {
+      const Real a(minPerm_/viscosity_), b(maxPerm_/viscosity_);
+      return b - a;
+    }
+  }
+
+  Real deriv2(const Real z) const {
+    if (type_==1) {
+      const Real a(minPerm_/viscosity_), b(std::log(maxPerm_/minPerm_));
+      return a * b * b * std::exp(b * z);
+    }
+    else {
+      return static_cast<Real>(0);
+    }
+  }
+
+public:
+  Permeability(ROL::ParameterList &list) {
+    bool useDarcy = list.sublist("Problem").get("Use Darcy Flow",true);
+    dRadius_   = list.sublist("Problem").get("Diffuser Radius",5.0);        // mm
+    viscosity_ = list.sublist("Problem").get("Dynamic Viscosity", 0.84e-8); // kg/mm-s
+    minPerm_   = list.sublist("Problem").get("Minimum Permeability", 3e-7); // mm^2
+    maxPerm_   = list.sublist("Problem").get("Maximum Permeability", 3e-6); // mm^2
+    if (useDarcy) dRadius_ += static_cast<Real>(10);
+    type_      = list.sublist("Problem").get("Parametrization Type", 0);
+    // type_ = 0 ... K = z kmin + (1-z) kmax
+    // type_ = 1 ... K = exp((1-z) log(kmin) + z log(kmax))
+  }
+
+  void compute(scalar_view &alpha, const scalar_view z, const scalar_view pts, int deriv=0) const {
+    const Real tol = std::sqrt(ROL::ROL_EPSILON<Real>());
+    const Real zero(0), one(1);
+    const int c = pts.extent_int(0);
+    const int p = pts.extent_int(1);
+    const int d = pts.extent_int(2);
+    Real weight(0), norm(0);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        norm = zero;
+        for (int k = 0; k < d; ++k) norm += pts(i,j,k)*pts(i,j,k);
+        weight = (std::sqrt(norm) <= dRadius_ + tol ? one : zero);
+        // Compute spatially dependent viscosity
+        if (deriv==1)      alpha(i,j) = weight * deriv1(z(i,j));
+        else if (deriv==2) alpha(i,j) = weight * deriv2(z(i,j));
+        else               alpha(i,j) = weight * value(z(i,j));
+      }
+    }
+  }
+};
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/CMakeLists.txt b/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/CMakeLists.txt
index 06b19a92f1ce..27bc1b65b1b6 100644
--- a/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/CMakeLists.txt
@@ -1,6 +1,6 @@
 ADD_SUBDIRECTORY(models)
 
-TRIBITS_COPY_FILES_TO_BINARY_DIR(
+ROL_COPY_FILES_TO_BINARY_DIR(
   ProjectAxisymmetricDataCopy
   SOURCE_FILES
     mesh/axi-diffuser-tri-clip.jou
diff --git a/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/brinkman/CMakeLists.txt b/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/brinkman/CMakeLists.txt
index 30694d14463c..293163ca3442 100644
--- a/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/brinkman/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/brinkman/CMakeLists.txt
@@ -11,13 +11,13 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_01
     SOURCES example_01.cpp
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     ProjectAxisymmetricBrinkmanDataCopy
     SOURCE_FILES
       input.xml
@@ -27,5 +27,34 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
   )
 
+ENDIF()
+
+IF(${PROJECT_NAME}_ENABLE_Intrepid2 AND
+   ${PROJECT_NAME}_ENABLE_Ifpack2   AND
+   ${PROJECT_NAME}_ENABLE_MueLu     AND
+   ${PROJECT_NAME}_ENABLE_Amesos2   AND
+   ${PROJECT_NAME}_ENABLE_Tpetra )
+
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/teuchos/src/sol)
+
+  # Need ROL_TpetraMultiVector.hpp
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
+
+  ROL_ADD_EXECUTABLE(
+    example_01_Kokkos
+    SOURCES example_01K.cpp
+    ADD_DIR_TO_NAME
+  )
+
+  ROL_COPY_FILES_TO_BINARY_DIR(
+    ProjectAxisymmetricBrinkmanDataCopyK
+    SOURCE_FILES
+      input.xml
+      plotBrinkman.m
+      plotmesh.m
+    SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}"
+    DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
+  )
 
 ENDIF()
diff --git a/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/brinkman/example_01.cpp b/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/brinkman/example_01.cpp
index 7bf0f8078bb8..7fbd9af8ecac 100644
--- a/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/brinkman/example_01.cpp
+++ b/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/brinkman/example_01.cpp
@@ -13,8 +13,7 @@
 
 #include "Teuchos_Comm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -50,7 +49,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int>> comm
     = Tpetra::getDefaultComm();
   const int myRank = comm->getRank();
diff --git a/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/brinkman/example_01K.cpp b/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/brinkman/example_01K.cpp
new file mode 100644
index 000000000000..112f1f4f40ff
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/brinkman/example_01K.cpp
@@ -0,0 +1,231 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  example_01.cpp
+    \brief Shows how to solve the Brinkman porosity optimization problem.
+*/
+
+#include "Teuchos_Comm.hpp"
+#include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
+
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Version.hpp"
+
+#include <iostream>
+#include <algorithm>
+
+#include "ROL_TpetraMultiVector.hpp"
+#include "ROL_Reduced_Objective_SimOpt.hpp"
+#include "ROL_Bounds.hpp"
+#include "ROL_Solver.hpp"
+#include "ROL_SingletonVector.hpp"
+#include "ROL_ConstraintFromObjective.hpp"
+
+#include "../../../../TOOLS/meshreaderK.hpp"
+#include "../../../../TOOLS/pdeconstraintK.hpp"
+#include "../../../../TOOLS/pdeobjectiveK.hpp"
+#include "../../../../TOOLS/pdevectorK.hpp"
+#include "../../../../TOOLS/integralconstraintK.hpp"
+
+#include "pde_brinkmanK.hpp"
+#include "obj_brinkmanK.hpp"
+
+//#include "fenv.h"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+
+int main(int argc, char *argv[]) {
+  //feenableexcept(FE_DIVBYZERO | FE_OVERFLOW);
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+
+  /*** Initialize communicator. ***/
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  Kokkos::ScopeGuard kokkosScope (argc, argv);
+  ROL::Ptr<const Teuchos::Comm<int>> comm
+    = Tpetra::getDefaultComm();
+  const int myRank = comm->getRank();
+  if ((iprint > 0) && (myRank == 0))
+    outStream = ROL::makePtrFromRef(std::cout);
+  else
+    outStream = ROL::makePtrFromRef(bhs);
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+
+    /*** Read in XML input ***/
+    std::string filename = "input.xml";
+    auto parlist = ROL::getParametersFromXmlFile(filename);
+
+    bool output = parlist->sublist("SimOpt").sublist("Solve").get("Output Iteration History",false);
+    output = (iprint > 0) && (myRank==0) && output;
+    parlist->sublist("SimOpt").sublist("Solve").set("Output Iteration History",output);
+
+    /*** Initialize main data structure. ***/
+    auto meshMgr = ROL::makePtr<MeshReader<RealT,DeviceT>>(*parlist);
+    // Initialize PDE describing Navier-Stokes equations.
+    auto pde = ROL::makePtr<PDE_NavierStokes<RealT,DeviceT>>(*parlist);
+    auto con = ROL::makePtr<PDE_Constraint<RealT,DeviceT>>(pde,meshMgr,comm,*parlist,*outStream);
+    // Cast the constraint and get the assembler.
+    auto assembler = con->getAssembler();
+    con->setSolveParameters(*parlist);
+
+    // Create state vectors
+    bool useParamVar = parlist->sublist("Problem").get("Use Optimal Constant Velocity",false);
+    int dim = 2;
+    auto u_ptr = assembler->createStateVector();    u_ptr->randomize();
+    auto p_ptr = assembler->createStateVector();    p_ptr->randomize();
+    auto r_ptr = assembler->createResidualVector(); r_ptr->putScalar(0.0);
+    auto up = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(u_ptr,pde,assembler,*parlist);
+    auto pp = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(p_ptr,pde,assembler,*parlist);
+    auto rp = ROL::makePtr<PDE_DualSimVector<RealT,DeviceT>>(r_ptr,pde,assembler,*parlist);
+    // Create control vectors
+    auto z_ptr = assembler->createControlVector();  z_ptr->randomize();
+    ROL::Ptr<ROL::Vector<RealT>>            zp;
+    ROL::Ptr<std::vector<RealT>>            z0_ptr;
+    ROL::Ptr<ROL::StdVector<RealT>>         z0p;
+    ROL::Ptr<ROL::TpetraMultiVector<RealT>> z1p;
+    if (useParamVar) {
+      z0_ptr = ROL::makePtr<std::vector<RealT>>(dim);
+      z0p = ROL::makePtr<ROL::StdVector<RealT>>(z0_ptr);
+      z1p = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(z_ptr,pde,assembler,*parlist);
+    }
+    if (useParamVar)
+      zp = ROL::makePtr<PDE_OptVector<RealT>>(z1p,z0p,myRank);
+    else
+      zp = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(z_ptr,pde,assembler,*parlist);
+    // Create combined vector
+    auto xp = ROL::makePtr<ROL::Vector_SimOpt<RealT>>(up,zp);
+
+    // Initialize quadratic objective function.
+    //auto qoi = ROL::makePtr<QoI_Velocity_NavierStokes<RealT,DeviceT>>(*parlist,
+    //                                                                  pde->getVelocityFE(),
+    //                                                                  pde->getPressureFE(),
+    //                                                                  pde->getVelocityBdryFE(4),
+    //                                                                  pde->getBdryCellLocIds(4),
+    //                                                                  pde->getStateFieldInfo());
+    auto qoi = ROL::makePtr<QoI_VelocityTracking_NavierStokes<RealT,DeviceT>>(*parlist,
+                                                                              pde->getVelocityFE(),
+                                                                              pde->getPressureFE(),
+                                                                              pde->getStateFieldInfo());
+    auto obj = ROL::makePtr<PDE_Objective<RealT,DeviceT>>(qoi,assembler);
+    auto robj = ROL::makePtr<ROL::Reduced_Objective_SimOpt<RealT>>(obj,con,up,zp,pp,true,false);
+
+    // Build bound constraint
+    ROL::Ptr<ROL::Vector<RealT>> lp, hp;
+    if (useParamVar) {
+      auto l0p = ROL::makePtr<ROL::StdVector<RealT>>(dim,ROL::ROL_NINF<RealT>());
+      auto h0p = ROL::makePtr<ROL::StdVector<RealT>>(dim,ROL::ROL_INF<RealT>());
+      auto l1_ptr = assembler->createControlVector();
+      auto h1_ptr = assembler->createControlVector();
+      auto l1p = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(l1_ptr,pde,assembler,*parlist);
+      auto h1p = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(h1_ptr,pde,assembler,*parlist);
+      l1p->setScalar(0.0);
+      h1p->setScalar(1.0);
+      lp = ROL::makePtr<PDE_OptVector<RealT>>(l1p,l0p,myRank);
+      hp = ROL::makePtr<PDE_OptVector<RealT>>(h1p,h0p,myRank);
+    }
+    else {
+      lp = zp->clone(); lp->setScalar(0.0);
+      hp = zp->clone(); hp->setScalar(1.0);
+    }
+    auto bnd = ROL::makePtr<ROL::Bounds<RealT>>(lp, hp);
+    // Build optimization problem
+    auto optProb = ROL::makePtr<ROL::Problem<RealT>>(robj, zp);
+    optProb->addBoundConstraint(bnd);
+    optProb->finalize(false,true,*outStream);
+
+    // Run derivative checks
+    bool checkDeriv = parlist->sublist("Problem").get("Check derivatives",false);
+    if ( checkDeriv ) {
+      auto rup = up->clone(); rup->randomize(-1.0,1.0);
+      auto rzp = zp->clone(); rzp->randomize( 0.0,1.0);
+      auto rpp = pp->clone(); rpp->randomize(-1.0,1.0);
+      auto dup = up->clone(); dup->randomize(-1.0,1.0);
+      auto dzp = zp->clone(); dzp->randomize( 0.0,1.0);
+      con->checkApplyJacobian_1(*rup,*rzp,*dup,*rup,true,*outStream);
+      con->checkApplyJacobian_2(*rup,*rzp,*dzp,*rup,true,*outStream);
+      con->checkInverseJacobian_1(*rup,*rup,*rup,*rzp,true,*outStream);
+      con->checkInverseAdjointJacobian_1(*rup,*rup,*rup,*rzp,true,*outStream);
+      con->checkApplyAdjointHessian_11(*rup,*rzp,*rpp,*dup,*rup,true,*outStream);
+      con->checkApplyAdjointHessian_21(*rup,*rzp,*rpp,*dzp,*rup,true,*outStream);
+      con->checkApplyAdjointHessian_12(*rup,*rzp,*rpp,*dup,*rzp,true,*outStream);
+      con->checkApplyAdjointHessian_22(*rup,*rzp,*rpp,*dzp,*rzp,true,*outStream);
+      obj->checkGradient_1(*rup,*rzp,*dup,true,*outStream);
+      obj->checkGradient_2(*rup,*rzp,*dzp,true,*outStream);
+      obj->checkHessVec_11(*rup,*rzp,*dup,true,*outStream);
+      obj->checkHessVec_12(*rup,*rzp,*dzp,true,*outStream);
+      obj->checkHessVec_21(*rup,*rzp,*dup,true,*outStream);
+      obj->checkHessVec_22(*rup,*rzp,*dzp,true,*outStream);
+      robj->checkGradient(*rzp,*dzp,true,*outStream);
+      robj->checkHessVec(*rzp,*dzp,true,*outStream);
+      //optProb->check(*outStream);
+    }
+
+    // Solve optimization problem
+    zp->setScalar(0.5);
+    up->zero(); pp->zero();
+    bool opt = parlist->sublist("Problem").get("Solve Optimization Problem",true);
+    if (opt) {
+      std::ifstream infile("control.txt");
+      if (infile.good())
+        assembler->inputTpetraVector(z_ptr,"control.txt");
+      if (useParamVar) {
+        std::ifstream infile0; infile0.open("target.txt");
+        if (infile0.good()) {
+          for (int i = 0; i < dim; ++i) infile0 >> (*z0_ptr)[i];
+          infile0.close();
+        }
+      }
+      ROL::Solver<RealT> optSolver(optProb, *parlist);
+      optSolver.solve(*outStream);
+      con->outputTpetraVector(z_ptr,"control.txt");
+      if (useParamVar) {
+        std::ofstream outfile; outfile.open("target.txt");
+        for (const auto x : *z0_ptr) {
+          outfile << std::scientific << std::setprecision(16);
+          outfile << x << std::endl;
+        }
+        outfile.close();
+      }
+    }
+
+    // Output
+    assembler->printMeshData(*outStream);
+    RealT tol(1.e-8);
+    Teuchos::Array<RealT> res(1,0);
+    con->solve(*rp,*up,*zp,tol);
+    con->outputTpetraVector(u_ptr,"state.txt");
+    con->value(*rp,*up,*zp,tol);
+    r_ptr->norm2(res.view(0,1));
+    *outStream << "Residual Norm: " << res[0] << std::endl;
+    assembler->printDataPDE(pde,u_ptr,z_ptr);
+    errorFlag += (res[0] > 1.e-6 ? 1 : 0);
+    //con->outputTpetraData();
+
+    // Get a summary from the time monitor.
+    Teuchos::TimeMonitor::summarize();
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/brinkman/impermeabilityK.hpp b/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/brinkman/impermeabilityK.hpp
new file mode 100644
index 000000000000..255f08b9d48e
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/brinkman/impermeabilityK.hpp
@@ -0,0 +1,106 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+#ifndef IMPERMEABILITYK_HPP
+#define IMPERMEABILITYK_HPP
+
+#include <vector>
+#include "ROL_ParameterList.hpp"
+#include "ROL_Ptr.hpp"
+#include "../../../../TOOLS/feK.hpp"
+
+template<class Real, class DeviceType>
+class Impermeability {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+
+private:
+  Real dRadius_, viscosity_, minPerm_, maxPerm_;
+  int type_;
+
+  Real value(const Real z) const {
+    if (type_==1) {
+      const Real a(viscosity_/minPerm_), b(viscosity_/maxPerm_);
+      return b + z * (a - b);
+    }
+    else if (type_==2) {
+      const Real a(-std::log(minPerm_)), b(-std::log(maxPerm_)-a);
+      return viscosity_ * std::exp(a + b * z);
+    }
+    else {
+      const Real a(minPerm_), b(maxPerm_ - minPerm_);
+      return viscosity_ / (a + b * z);
+    }
+  }
+
+  Real deriv1(const Real z) const {
+    if (type_==1) {
+      const Real a(viscosity_/minPerm_), b(viscosity_/maxPerm_);
+      return a - b;
+    }
+    else if (type_==2) {
+      const Real a(-std::log(minPerm_)), b(-std::log(maxPerm_)-a);
+      return viscosity_ * b * std::exp(a + b * z);
+    }
+    else {
+      const Real two(2), a(minPerm_), b(maxPerm_ - minPerm_);
+      return -viscosity_ * b / std::pow(a + b * z, two);
+    }
+  }
+
+  Real deriv2(const Real z) const {
+    if (type_==1) {
+      return static_cast<Real>(0);
+    }
+    else if (type_==2) {
+      const Real a(-std::log(minPerm_)), b(-std::log(maxPerm_)-a);
+      return viscosity_ * b * b * std::exp(a + b * z);
+    }
+    else {
+      const Real two(2), three(3), a(minPerm_), b(maxPerm_ - minPerm_);
+      return two * viscosity_ * std::pow(b, two) / std::pow(a + b * z, three);
+    }
+  }
+
+public:
+  Impermeability(ROL::ParameterList &list) {
+    bool useDarcy = list.sublist("Problem").get("Use Darcy Flow",true);
+    dRadius_   = list.sublist("Problem").get("Diffuser Radius",5.0);        // mm
+    viscosity_ = list.sublist("Problem").get("Dynamic Viscosity", 0.84e-8); // kg/mm-s
+    minPerm_   = list.sublist("Problem").get("Minimum Permeability", 3e-7); // mm^2
+    maxPerm_   = list.sublist("Problem").get("Maximum Permeability", 3e-6); // mm^2
+    if (useDarcy) dRadius_ += static_cast<Real>(10);
+    type_      = list.sublist("Problem").get("Parametrization Type", 0);
+    // type_ = 0 ... K = z kmin + (1-z) kmax
+    // type_ = 1 ... K = 1/(z/kmin + (1-z)/kmax)
+    // type_ = 2 ... K = exp((1-z) log(kmin) + z log(kmax))
+  }
+
+  void compute(scalar_view &alpha, const scalar_view z, const scalar_view pts, int deriv=0) const {
+    const Real tol = std::sqrt(ROL::ROL_EPSILON<Real>());
+    const Real zero(0), one(1);
+    const int c = pts.extent_int(0);
+    const int p = pts.extent_int(1);
+    const int d = pts.extent_int(2);
+    Real weight(0), norm(0);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        norm = zero;
+        for (int k = 0; k < d; ++k) norm += pts(i,j,k)*pts(i,j,k);
+        weight = (std::sqrt(norm) <= dRadius_ + tol ? one : zero);
+        // Compute spatially dependent viscosity
+        if (deriv==1)      alpha(i,j) = weight * deriv1(z(i,j));
+        else if (deriv==2) alpha(i,j) = weight * deriv2(z(i,j));
+        else               alpha(i,j) = weight * value(z(i,j));
+      }
+    }
+  }
+};
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/brinkman/obj_brinkmanK.hpp b/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/brinkman/obj_brinkmanK.hpp
new file mode 100644
index 000000000000..a1e1df9b60db
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/brinkman/obj_brinkmanK.hpp
@@ -0,0 +1,2266 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  obj.hpp
+    \brief Provides the interface for local (cell-based) objective function computations.
+*/
+
+#ifndef OBJ_BRINKMANK_HPP
+#define OBJ_BRINKMANK_HPP
+
+#include "../../../../TOOLS/qoiK.hpp"
+#include "pde_brinkmanK.hpp"
+#include "impermeabilityK.hpp"
+
+template <class Real, class DeviceType>
+class QoI_Vorticity_NavierStokes : public QoI<Real, DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type> feVel_, fePrs_;
+  const ROL::Ptr<const FieldUtils::FieldInfo> fieldInfo_;
+  scalar_view weight_;
+
+  const Real eps_;
+
+  Real weightFunc(const std::vector<Real> & x) const {
+    return static_cast<Real>(1);
+  }
+
+public:
+  QoI_Vorticity_NavierStokes(const ROL::Ptr<fe_type> &feVel,
+                             const ROL::Ptr<fe_type> &fePrs,
+                             const ROL::Ptr<const FieldUtils::FieldInfo> &fieldInfo)
+    : feVel_(feVel), fePrs_(fePrs), fieldInfo_(fieldInfo), eps_(std::sqrt(ROL::ROL_EPSILON<Real>())) {
+    int c = feVel_->cubPts().extent_int(0);
+    int p = feVel_->cubPts().extent_int(1);
+    int d = feVel_->cubPts().extent_int(2);
+    std::vector<Real> pt(d);
+    weight_ = scalar_view("weight",c,p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k)
+          pt[k] = (feVel_->cubPts())(i,j,k);
+        weight_(i,j) = weightFunc(pt);
+      }
+    }
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = u_coeff.extent_int(0);
+    int p = feVel_->cubPts().extent_int(1);
+    int d = feVel_->cubPts().extent_int(2);
+    // Initialize output val
+    val = scalar_view("val", c);
+    // Get components of the control
+    std::vector<scalar_view> U;
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    // Evaluate on FE basis
+    scalar_view gradUX_eval("gradUX_eval", c, p, d);
+    scalar_view gradUY_eval("gradUY_eval", c, p, d);
+    feVel_->evaluateGradient(gradUX_eval, U[0]);
+    feVel_->evaluateGradient(gradUY_eval, U[1]);
+    // Compute curl
+    scalar_view curlU_eval("curlU_eval", c, p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        curlU_eval(i,j) = gradUY_eval(i,j,0) - gradUX_eval(i,j,1);
+      }
+    }
+    // Multiply by weight
+    scalar_view weighted_curlU_eval("curlU_eval", c, p);
+    fst::scalarMultiplyDataData(weighted_curlU_eval,weight_,curlU_eval);
+    // Compute L2 norm squared
+    feVel_->computeIntegral(val,curlU_eval,weighted_curlU_eval,false);
+    rst::scale(val,static_cast<Real>(0.5));
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = u_coeff.extent_int(0);
+    int fv = feVel_->N().extent_int(1);
+    int fp = fePrs_->N().extent_int(1);
+    int p = feVel_->cubPts().extent_int(1);
+    int d = feVel_->cubPts().extent_int(2);
+    // Initialize output grad
+    scalar_view velUX_grad("velUX_grad", c, fv);
+    scalar_view velUY_grad("velUY_grad", c, fv);
+    scalar_view presU_grad("presU_grad", c, fp);
+    std::vector<scalar_view> G;
+    G.resize(fieldInfo_->numFields);
+    G[0] = velUX_grad;
+    G[1] = velUY_grad;
+    G[2] = presU_grad;
+    // Get components of the control
+    std::vector<scalar_view> U;
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    // Evaluate on FE basis
+    scalar_view gradUX_eval("gradUX_eval", c, p, d);
+    scalar_view gradUY_eval("gradUY_eval", c, p, d);
+    feVel_->evaluateGradient(gradUX_eval, U[0]);
+    feVel_->evaluateGradient(gradUY_eval, U[1]);
+    // Compute curl
+    scalar_view curlU_eval("curlU_eval", c, p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        curlU_eval(i,j) = weight_(i,j)*(gradUY_eval(i,j,0) - gradUX_eval(i,j,1));
+      }
+    }
+    // Build local gradient of state tracking term
+    fst::integrate(velUX_grad,curlU_eval,feVel_->DNDdetJ(1),false);
+    rst::scale(velUX_grad,static_cast<Real>(-1));
+    fst::integrate(velUY_grad,curlU_eval,feVel_->DNDdetJ(0),false);
+
+    FieldUtils::combineFieldCoeff<Real>(grad, G, fieldInfo_);
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Vorticity_NavierStokes::gradient_2 is zero.");
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c  = z_coeff.extent_int(0);
+    int p  = feVel_->cubPts().extent_int(1);
+    int fv = feVel_->N().extent_int(1);
+    int fp = fePrs_->N().extent_int(1);
+    int d = feVel_->cubPts().extent_int(2);
+    // Initialize output grad
+    scalar_view velVX_grad("velVX_grad", c, fv);
+    scalar_view velVY_grad("velVY_grad", c, fv);
+    scalar_view presV_grad("presV_grad", c, fp);
+    std::vector<scalar_view> G;
+    G.resize(fieldInfo_->numFields);
+    G[0] = velVX_grad;
+    G[1] = velVY_grad;
+    G[2] = presV_grad;
+    // Get components of the control
+    std::vector<scalar_view> V;
+    FieldUtils::splitFieldCoeff<Real>(V, v_coeff, fieldInfo_);
+    // Evaluate on FE basis
+    scalar_view gradVX_eval("gradVX_eval", c, p, d);
+    scalar_view gradVY_eval("gradVY_eval", c, p, d);
+    feVel_->evaluateGradient(gradVX_eval, V[0]);
+    feVel_->evaluateGradient(gradVY_eval, V[1]);
+    // Compute curl
+    scalar_view curlV_eval("curlV_eval", c, p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        curlV_eval(i,j) = weight_(i,j)*(gradVY_eval(i,j,0) - gradVX_eval(i,j,1));
+      }
+    }
+    // Build local gradient of state tracking term
+    fst::integrate(velVX_grad,curlV_eval,feVel_->DNDdetJ(1),false);
+    rst::scale(velVX_grad,static_cast<Real>(-1));
+    fst::integrate(velVY_grad,curlV_eval,feVel_->DNDdetJ(0),false);
+
+    FieldUtils::combineFieldCoeff<Real>(hess, G, fieldInfo_, fieldInfo_);
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Vorticity_NavierStokes::HessVec_12 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Vorticity_NavierStokes::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Vorticity_NavierStokes::HessVec_22 is zero.");
+  }
+
+}; // QoI_Vorticity_NavierStokes
+
+template <class Real, class DeviceType>
+class QoI_Circulation_NavierStokes : public QoI<Real, DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type> feVel_, fePrs_;
+  const ROL::Ptr<const FieldUtils::FieldInfo> fieldInfo_;
+  scalar_view weight_;
+
+  const Real eps_;
+
+  Real weightFunc(const std::vector<Real> & x) const {
+    return static_cast<Real>(1);
+  }
+
+public:
+  QoI_Circulation_NavierStokes(const ROL::Ptr<fe_type> &feVel,
+                               const ROL::Ptr<fe_type> &fePrs,
+                               const ROL::Ptr<const FieldUtils::FieldInfo> &fieldInfo)
+    : feVel_(feVel), fePrs_(fePrs), fieldInfo_(fieldInfo), eps_(std::sqrt(ROL::ROL_EPSILON<Real>())) {
+    int c = feVel_->cubPts().extent_int(0);
+    int p = feVel_->cubPts().extent_int(1);
+    int d = feVel_->cubPts().extent_int(2);
+    std::vector<Real> pt(d);
+    weight_ = scalar_view("weight",c,p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k)
+          pt[k] = (feVel_->cubPts())(i,j,k);
+        weight_(i,j) = weightFunc(pt);
+      }
+    }
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = u_coeff.extent_int(0);
+    int p = feVel_->cubPts().extent_int(1);
+    int d = feVel_->cubPts().extent_int(2);
+    // Initialize output val
+    val = scalar_view("val", c);
+    // Get components of the control
+    std::vector<scalar_view> U;
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    // Evaluate on FE basis
+    scalar_view gradUX_eval("gradUX_eval", c, p, d);
+    scalar_view gradUY_eval("gradUY_eval", c, p, d);
+    feVel_->evaluateGradient(gradUX_eval, U[0]);
+    feVel_->evaluateGradient(gradUY_eval, U[1]);
+    // Compute curl
+    scalar_view curlU_eval("curlU_eval", c, p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j)
+        curlU_eval(i,j)   = gradUY_eval(i,j,0) - gradUX_eval(i,j,1);
+    }
+    // Compute circulation
+    feVel_->computeIntegral(val,curlU_eval,weight_,false);
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = u_coeff.extent_int(0);
+    int fv = feVel_->N().extent_int(1);
+    int fp = fePrs_->N().extent_int(1);
+    // Initialize output grad
+    scalar_view velUX_grad("velUX_grad", c, fv);
+    scalar_view velUY_grad("velUY_grad", c, fv);
+    scalar_view presU_grad("presU_grad", c, fp);
+    std::vector<scalar_view> G;
+    G.resize(fieldInfo_->numFields);
+    G[0] = ROL::makePtrFromRef(velUX_grad);
+    G[1] = ROL::makePtrFromRef(velUY_grad);
+    G[2] = ROL::makePtrFromRef(presU_grad);
+    // Build local gradient of state tracking term
+    fst::integrate(velUX_grad,weight_,feVel_->DNDdetJ(1),false);
+    rst::scale(velUX_grad,static_cast<Real>(-1));
+    fst::integrate(velUY_grad,weight_,feVel_->DNDdetJ(0),false);
+
+    FieldUtils::combineFieldCoeff<Real>(grad, G, fieldInfo_);
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Circulation_NavierStokes::gradient_2 is zero.");
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Circulation_NavierStokes::HessVec_11 is zero.");
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Circulation_NavierStokes::HessVec_12 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Circulation_NavierStokes::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Circulation_NavierStokes::HessVec_22 is zero.");
+  }
+
+}; // QoI_Circulation_NavierStokes
+
+template <class Real, class DeviceType>
+class QoI_Directional_NavierStokes : public QoI<Real, DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type> feVel_, fePrs_;
+  const ROL::Ptr<const FieldUtils::FieldInfo> fieldInfo_;
+  scalar_view weight_;
+
+  const Real eps_;
+  std::vector<Real> dir_;
+
+  Real weightFunc(const std::vector<Real> & x) const {
+    return static_cast<Real>(1);
+  }
+
+public:
+  QoI_Directional_NavierStokes(ROL::ParameterList &list,
+                               const ROL::Ptr<fe_type> &feVel,
+                               const ROL::Ptr<fe_type> &fePrs,
+                               const ROL::Ptr<const FieldUtils::FieldInfo> &fieldInfo)
+    : feVel_(feVel), fePrs_(fePrs), fieldInfo_(fieldInfo), eps_(std::sqrt(ROL::ROL_EPSILON<Real>())) {
+    const int c = feVel_->gradN().extent_int(0);
+    const int p = feVel_->gradN().extent_int(2);
+    const int d = feVel_->gradN().extent_int(3);
+    std::vector<Real> pt(d);
+    weight_ = scalar_view("weight",c,p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k)
+          pt[k] = (feVel_->cubPts())(i,j,k);
+        weight_(i,j) = weightFunc(pt);
+      }
+    }
+    dir_ = ROL::getArrayFromStringParameter<Real>(list, "Direction");
+    Real norm(0);
+    for (const auto e : dir_) norm += e*e;
+    norm = std::sqrt(norm);
+    for (const auto e : dir_) std::cout << e << std::endl;
+    for (auto & e : dir_) e /= norm;
+    for (const auto e : dir_) std::cout << e << std::endl;
+    //for (int i = 0; i < d; ++i) dir_[i] /= norm;
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c = feVel_->gradN().extent_int(0);
+    const int p = feVel_->gradN().extent_int(2);
+    const int d = feVel_->gradN().extent_int(3);
+    // Initialize output val
+    val = scalar_view("val", c);
+    // Get components of the control
+    std::vector<scalar_view> U;
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    // Evaluate on FE basis
+    scalar_view Uval("Uval", c, p);
+    scalar_view dot("dot", c, p);
+    scalar_view Upar("Upar", c, p, d);
+    scalar_view Uort("Uort", c, p, d);
+    for (int i = 0; i < d; ++i) {
+      Kokkos::deep_copy(Uval, scalar_view(0));
+      feVel_->evaluateValue(Uval, U[i]);
+      for (int j = 0; j < c; ++j) {
+        for (int k = 0; k < p; ++k) {
+          dot(j,k)   += dir_[i] * Uval(j,k);
+          Upar(j,k,i) = std::sqrt(weight_(j,k)) * dir_[i];
+          Uort(j,k,i) = std::sqrt(weight_(j,k)) * Uval(j,k);
+        }
+      }
+    }
+    for (int i = 0; i < d; ++i) {
+      for (int j = 0; j < c; ++j) {
+        for (int k = 0; k < p; ++k) {
+          Upar(j,k,i) *= dot(j,k);
+          Uort(j,k,i) -= Upar(j,k,i);
+          Upar(j,k,i)  = std::min(static_cast<Real>(0),Upar(j,k,i));
+        }
+      }
+    }
+    // Compute L2 norm squared
+    feVel_->computeIntegral(val,Upar,Upar,false);
+    feVel_->computeIntegral(val,Uort,Uort,true);
+    rst::scale(val,static_cast<Real>(0.5));
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fv = feVel_->gradN().extent_int(1);
+    const int fp = fePrs_->gradN().extent_int(1);
+    const int p  = feVel_->gradN().extent_int(2);
+    const int d  = feVel_->gradN().extent_int(3);
+    // Initialize output grad
+    std::vector<scalar_view> G;
+    G.resize(fieldInfo_->numFields);
+    for (int i = 0; i < d; ++i)
+      G[i] = scalar_view("grad", c, fv);
+    G[d] = scalar_view("grad", c, fp);
+    // Get components of the state
+    std::vector<scalar_view> U;
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    // Evaluate on FE basis
+    std::vector<scalar_view> Upar(d), Uort(d);
+    scalar_view Uval("Uval", c, p);
+    scalar_view dot("dot", c, p);
+    for (int i = 0; i < d; ++i) {
+      Kokkos::deep_copy(Uval, static_cast<Real>(0));
+      Upar[i] = scalar_view("Upar", c, p);
+      Uort[i] = scalar_view("Uort", c, p);
+      feVel_->evaluateValue(Uval, U[i]);
+      for (int j = 0; j < c; ++j) {
+        for (int k = 0; k < p; ++k) {
+          dot(j,k)    += dir_[i] * Uval(j,k);
+          Upar[i](j,k) = std::sqrt(weight_(j,k)) * dir_[i];
+          Uort[i](j,k) = std::sqrt(weight_(j,k)) * Uval(j,k);
+        }
+      }
+    }
+    for (int i = 0; i < d; ++i) {
+      for (int j = 0; j < c; ++j) {
+        for (int k = 0; k < p; ++k) {
+          (Upar[i])(j,k) *= dot(j,k);
+          (Uort[i])(j,k) -= (Upar[i])(j,k);
+          (Upar[i])(j,k)  = std::min(static_cast<Real>(0),(Upar[i])(j,k));
+        }
+      }
+    }
+    // Build local gradient of state tracking term
+    for (int i = 0; i < d; ++i) {
+      fst::integrate(G[i],Upar[i],feVel_->NdetJ(),false);
+      fst::integrate(G[i],Uort[i],feVel_->NdetJ(), true);
+    }
+    FieldUtils::combineFieldCoeff<Real>(grad, G, fieldInfo_);
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Directional_NavierStokes::gradient_2 is zero.");
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c  = z_coeff.extent_int(0);
+    int p  = feVel_->cubPts().extent_int(1);
+    int fv = feVel_->N().extent_int(1);
+    int fp = fePrs_->N().extent_int(1);
+    // Initialize output grad
+    scalar_view velVX_grad("velVX_grad", c, fv);
+    scalar_view velVY_grad("velVY_grad", c, fv);
+    scalar_view presV_grad("presV_grad", c, fp);
+    std::vector<scalar_view> G;
+    G.resize(fieldInfo_->numFields);
+    G[0] = velVX_grad;
+    G[1] = velVY_grad;
+    G[2] = presV_grad;
+    // Get components of the control
+    std::vector<scalar_view> U, V;
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(V, v_coeff, fieldInfo_);
+    // Evaluate on FE basis
+    scalar_view UX("UX", c, p);
+    scalar_view UY("UY", c, p);
+    scalar_view VX("VX", c, p);
+    scalar_view VY("VY", c, p);
+    scalar_view WVX("WVX", c, p);
+    scalar_view WVY("WVY", c, p);
+    feVel_->evaluateValue(UX, U[0]);
+    feVel_->evaluateValue(UY, U[1]);
+    feVel_->evaluateValue(VX, V[0]);
+    feVel_->evaluateValue(VY, V[1]);
+    // Compute negative part of x-velocity
+    const Real zero(0), one(1);
+    Real scale(0);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        scale = (UY(i,j) >= zero) ? one : zero;
+        WVX(i,j) =         weight_(i,j) * VX(i,j);
+        WVY(i,j) = scale * weight_(i,j) * VY(i,j);
+      }
+    }
+    // Build local gradient of state tracking term
+    fst::integrate(velVX_grad,WVX,feVel_->NdetJ(),false);
+    fst::integrate(velVY_grad,WVY,feVel_->NdetJ(),false);
+
+    FieldUtils::combineFieldCoeff<Real>(hess, G, fieldInfo_, fieldInfo_);
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Directional_NavierStokes::HessVec_12 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Directional_NavierStokes::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Directional_NavierStokes::HessVec_22 is zero.");
+  }
+
+}; // QoI_Directional_NavierStokes
+
+template <class Real, class DeviceType>
+class QoI_Pressure_NavierStokes : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type> feVel_, fePrs_;
+  const ROL::Ptr<const FieldUtils::FieldInfo> fieldInfo_;
+  scalar_view weight_;
+
+  const Real width_;
+  const Real outletHeight_;
+  const Real eps_;
+
+  Real weightFunc(const std::vector<Real> & x) const {
+    //return (x[1]<=outletHeight_+eps_ && std::abs(x[0]-width_)<=width_-0.6) ? static_cast<Real>(-1) : static_cast<Real>(0);
+    return static_cast<Real>(1);
+  }
+
+public:
+  QoI_Pressure_NavierStokes(const ROL::Ptr<fe_type> &feVel,
+                            const ROL::Ptr<fe_type> &fePrs,
+                            const ROL::Ptr<const FieldUtils::FieldInfo> &fieldInfo)
+    : feVel_(feVel), fePrs_(fePrs), fieldInfo_(fieldInfo),
+      width_(12.7), outletHeight_(0.1*1.6936),
+      eps_(std::sqrt(ROL::ROL_EPSILON<Real>())) {
+    const int c = feVel_->cubPts().extent_int(0);
+    const int p = feVel_->cubPts().extent_int(1);
+    const int d = feVel_->cubPts().extent_int(2);
+    std::vector<Real> pt(d);
+    weight_ = scalar_view("weight",c,p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k)
+          pt[k] = (feVel_->cubPts())(i,j,k);
+        weight_(i,j) = weightFunc(pt);
+      }
+    }
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c = fePrs_->gradN().extent_int(0);
+    const int p = fePrs_->gradN().extent_int(2);
+    const int d = fePrs_->gradN().extent_int(3);
+    // Initialize output val
+    val = scalar_view("val", c);
+    // Get components of the control
+    std::vector<scalar_view> U;
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    // Evaluate on FE basis
+    scalar_view P("P", c, p);
+    scalar_view WP("WP", c, p);
+    fePrs_->evaluateValue(P, U[d]);
+    // Scale the pressure
+    fst::scalarMultiplyDataData(WP,weight_,P);
+    feVel_->computeIntegral(val,P,WP,false);
+    rst::scale(val,static_cast<Real>(0.5));
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c  = fePrs_->gradN().extent_int(0);
+    const int fv = feVel_->gradN().extent_int(1);
+    const int fp = fePrs_->gradN().extent_int(1);
+    const int p  = fePrs_->gradN().extent_int(2);
+    const int d  = fePrs_->gradN().extent_int(3);
+    // Initialize output grad
+    std::vector<scalar_view> G;
+    G.resize(fieldInfo_->numFields);
+    for (int i = 0; i < d; ++i)
+      G[i] = scalar_view("grad", c, fv);
+    G[d] = scalar_view("grad", c, fp);
+    // Get components of the control
+    std::vector<scalar_view> U;
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    // Evaluate on FE basis
+    scalar_view P("P", c, p);
+    scalar_view WP("WP", c, p);
+    fePrs_->evaluateValue(P, U[d]);
+    // Scale pressure
+    fst::scalarMultiplyDataData(WP,weight_,P);
+    fst::integrate(G[d],WP,fePrs_->NdetJ(),false);
+    FieldUtils::combineFieldCoeff<Real>(grad, G, fieldInfo_);
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Pressure_NavierStokes::gradient_2 is zero.");
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c  = fePrs_->gradN().extent_int(0);
+    const int fv = feVel_->gradN().extent_int(1);
+    const int fp = fePrs_->gradN().extent_int(1);
+    const int p  = fePrs_->gradN().extent_int(2);
+    const int d  = fePrs_->gradN().extent_int(3);
+    // Initialize output grad
+    std::vector<scalar_view> H;
+    H.resize(fieldInfo_->numFields);
+    for (int i = 0; i < d; ++i)
+      H[i] = scalar_view("hess", c, fv);
+    H[d] = scalar_view("hess", c, fp);
+    // Get components of the control
+    std::vector<scalar_view> V;
+    FieldUtils::splitFieldCoeff(V, v_coeff, fieldInfo_);
+    // Evaluate on FE basis
+    scalar_view P("P", c, p);
+    scalar_view WP("WP", c, p);
+    fePrs_->evaluateValue(P, V[d]);
+    // Scale pressure
+    fst::scalarMultiplyDataData(WP,weight_,P);
+    fst::integrate(H[d],WP,fePrs_->NdetJ(),false);
+    FieldUtils::combineFieldCoeff<Real>(hess, H, fieldInfo_, fieldInfo_);
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Pressure_NavierStokes::HessVec_12 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Pressure_NavierStokes::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Pressure_NavierStokes::HessVec_22 is zero.");
+  }
+
+}; // QoI_Pressure_NavierStokes
+
+template <class Real, class DeviceType>
+class QoI_Velocity_NavierStokes : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type>                     feVel_, fePrs_;
+  const std::vector<ROL::Ptr<fe_type>>        feVelBdry_;
+  const std::vector<std::vector<int>>         bdryCellLocIds_;
+  const ROL::Ptr<const FieldUtils::FieldInfo> fieldInfo_;
+  std::vector<Real>                           target_;
+  bool                                        onlyAxial_;
+
+  scalar_view getBoundaryCoeff(const scalar_view cell_coeff, int locSideId) const {
+    std::vector<int> bdryCellLocId = bdryCellLocIds_[locSideId];
+    const int numCellsSide = bdryCellLocId.size();
+    const int f = feVel_->N().extent_int(1);
+    
+    scalar_view bdry_coeff("bdry_coeff", numCellsSide, f);
+    for (int i = 0; i < numCellsSide; ++i) {
+      for (int j = 0; j < f; ++j)
+        bdry_coeff(i, j) = cell_coeff(bdryCellLocId[i], j);
+    }
+    return bdry_coeff;
+  }
+
+public:
+  QoI_Velocity_NavierStokes(ROL::ParameterList   &list,
+                            const ROL::Ptr<fe_type> &feVel,
+                            const ROL::Ptr<fe_type> &fePrs,
+                            const std::vector<ROL::Ptr<fe_type>> &feVelBdry,
+                            const std::vector<std::vector<int>>   &bdryCellLocIds,
+                            const ROL::Ptr<const FieldUtils::FieldInfo> &fieldInfo)
+    : feVel_(feVel), fePrs_(fePrs),
+      feVelBdry_(feVelBdry), bdryCellLocIds_(bdryCellLocIds),
+      fieldInfo_(fieldInfo) {
+    target_.clear(); target_.resize(2);
+    target_[0] = list.sublist("Problem").get("Target Radial Velocity",0.0);
+    target_[1] = list.sublist("Problem").get("Target Axial Velocity",-15.0);
+    onlyAxial_ = list.sublist("Problem").get("Only Use Axial Velocity",false);
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c = feVel_->gradN().extent_int(0);
+    const int d = feVel_->gradN().extent_int(3);
+    // Get components of the control
+    std::vector<scalar_view> U;
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    // Initialize output val
+    val = scalar_view("val", c);
+    // Compute cost integral
+    std::vector<Real> target = (z_param == ROL::nullPtr) ? target_ : *z_param;
+    const int numLocSides = bdryCellLocIds_.size();
+    for (int l = 0; l < numLocSides; ++l) {
+      const int numCellsSide  = bdryCellLocIds_[l].size();
+      if ( numCellsSide ) {
+        const int numCubPerSide = feVelBdry_[l]->cubPts().extent_int(1);
+        scalar_view valU_eval("valU_eval", numCellsSide, numCubPerSide);
+        scalar_view intVal("intVal", numCellsSide);
+        for (int k = 0; k < d; ++k) {
+          if ((k==0 && !onlyAxial_) || k==1) {
+            scalar_view u_coeff_bdry = getBoundaryCoeff(U[k], l);
+	    Kokkos::deep_copy(valU_eval, static_cast<Real>(0));
+            feVelBdry_[l]->evaluateValue(valU_eval, u_coeff_bdry);
+            for (int i = 0; i < numCellsSide; ++i) {
+              for (int j = 0; j < numCubPerSide; ++j)
+                valU_eval(i,j) -= target[k];
+            }
+            feVelBdry_[l]->computeIntegral(intVal,valU_eval,valU_eval,true);
+          }
+        }
+        // Add to integral value
+        for (int i = 0; i < numCellsSide; ++i) {
+          int cidx = bdryCellLocIds_[l][i];
+          val(cidx) += static_cast<Real>(0.5)*intVal(i);
+        }
+      }
+    }
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fv = feVel_->gradN().extent_int(1);
+    const int fp = fePrs_->gradN().extent_int(1);
+    const int d  = feVel_->gradN().extent_int(3);
+    // Initialize output grad
+    std::vector<scalar_view> G;
+    G.resize(fieldInfo_->numFields);
+    for (int i = 0; i < d; ++i)
+      G[i] = scalar_view("grad", c, fv);
+    G[d] = scalar_view("grad", c, fp);
+    // Get components of the state
+    std::vector<scalar_view> U;
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    // Compute cost integral
+    std::vector<Real> target = (z_param == ROL::nullPtr) ? target_ : *z_param;
+    const int numLocSides = bdryCellLocIds_.size();
+    for (int l = 0; l < numLocSides; ++l) {
+      const int numCellsSide  = bdryCellLocIds_[l].size();
+      if ( numCellsSide ) {
+        const int numCubPerSide = feVelBdry_[l]->cubPts().extent_int(1);
+        scalar_view valU_eval("valU_eval", numCellsSide, numCubPerSide);
+        scalar_view intGrad("intGrad", numCellsSide, fv);
+        for (int k = 0; k < d; ++k) {
+          if ((k==0 && !onlyAxial_) || k==1) {
+            scalar_view u_coeff_bdry = getBoundaryCoeff(U[k], l);
+	    Kokkos::deep_copy(valU_eval, static_cast<Real>(0));
+            feVelBdry_[l]->evaluateValue(valU_eval, u_coeff_bdry);
+            for (int i = 0; i < numCellsSide; ++i) {
+              for (int j = 0; j < numCubPerSide; ++j)
+                valU_eval(i,j) -= target[k];
+            }
+            // Compute gradient of squared L2-norm of diff
+            fst::integrate(intGrad,valU_eval,feVelBdry_[l]->NdetJ(),false);
+            // Add to integral value
+            for (int i = 0; i < numCellsSide; ++i) {
+              int cidx = bdryCellLocIds_[l][i];
+              for (int j = 0; j < fv; ++j)
+                (G[k])(cidx,j) += intGrad(i,j);
+            }
+          }
+        }
+      }
+    }
+    FieldUtils::combineFieldCoeff(grad, G, fieldInfo_);
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Velocity_NavierStokes::gradient_2 is zero.");
+  }
+
+  std::vector<Real> gradient_3(std::vector<scalar_view> & grad,
+                               const scalar_view u_coeff,
+                               const scalar_view z_coeff = scalar_view(),
+                               const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    if (z_param != ROL::nullPtr) {
+      // Get relevant dimensions
+      const int c = feVel_->gradN().extent_int(0);
+      const int d = feVel_->gradN().extent_int(3);
+      // Get components of the control
+      std::vector<scalar_view> U;
+      FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+      // Initialize output val
+      std::vector<Real> g_param(d,static_cast<Real>(0));
+      grad.clear(); grad.resize(d);
+      for (int k = 0; k < d; ++k)
+        grad[k] = scalar_view("grad", c);
+      // Compute cost integral
+      const int numLocSides = bdryCellLocIds_.size();
+      for (int l = 0; l < numLocSides; ++l) {
+        const int numCellsSide  = bdryCellLocIds_[l].size();
+        if ( numCellsSide ) {
+          const int numCubPerSide = feVelBdry_[l]->cubPts().extent_int(1);
+          std::vector<scalar_view> intVal(d);
+          scalar_view valU_eval("valU_eval", numCellsSide, numCubPerSide);
+          scalar_view weight("weight", numCellsSide, numCubPerSide);
+          for (int k = 0; k < d; ++k) {
+            intVal[k] = scalar_view("intVal", numCellsSide);
+            if ((k==0 && !onlyAxial_) || k==1) {
+              scalar_view u_coeff_bdry = getBoundaryCoeff(U[k], l);
+	      Kokkos::deep_copy(valU_eval, static_cast<Real>(0));
+              feVelBdry_[l]->evaluateValue(valU_eval, u_coeff_bdry);
+              for (int i = 0; i < numCellsSide; ++i) {
+                for (int j = 0; j < numCubPerSide; ++j) {
+                  weight(i,j) = static_cast<Real>(1);
+                  valU_eval(i,j) *= static_cast<Real>(-1);
+                  valU_eval(i,j) += (*z_param)[k];
+                }
+              }
+              feVelBdry_[l]->computeIntegral(intVal[k],weight,valU_eval,true);
+            }
+          }
+          // Add to integral value
+          for (int k = 0; k < d; ++k) {
+            for (int i = 0; i < numCellsSide; ++i) {
+              int cidx = bdryCellLocIds_[l][i];
+              (grad[k])(cidx) += (intVal[k])(i);
+            }
+          }
+        }
+      }
+      return g_param;
+    }
+    else {
+      throw Exception::Zero(">>> QoI_Velocity_NavierStokes::gradient_3 is zero.");
+    }
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fv = feVel_->gradN().extent_int(1);
+    const int fp = fePrs_->gradN().extent_int(1);
+    const int d  = feVel_->gradN().extent_int(3);
+    // Initialize output grad
+    std::vector<scalar_view> H;
+    H.resize(fieldInfo_->numFields);
+    for (int i = 0; i < d; ++i)
+      H[i] = scalar_view("hess", c, fv);
+    H[d] = scalar_view("hess", c, fp);
+    // Get components of the direction
+    std::vector<scalar_view> V;
+    FieldUtils::splitFieldCoeff<Real>(V, v_coeff, fieldInfo_);
+    // Compute cost integral
+    const int numLocSides = bdryCellLocIds_.size();
+    for (int l = 0; l < numLocSides; ++l) {
+      const int numCellsSide  = bdryCellLocIds_[l].size();
+      if ( numCellsSide ) {
+        const int numCubPerSide = feVelBdry_[l]->cubPts().extent_int(1);
+        scalar_view valV_eval("valV_eval", numCellsSide, numCubPerSide);
+        scalar_view intHess("intHess", numCellsSide, fv);
+        for (int k = 0; k < d; ++k) {
+          if ((k==0 && !onlyAxial_) || k==1) {
+            scalar_view v_coeff_bdry = getBoundaryCoeff(V[k], l);
+	    Kokkos::deep_copy(valV_eval, static_cast<Real>(0));
+            feVelBdry_[l]->evaluateValue(valV_eval, v_coeff_bdry);
+            // Compute hessian of squared L2-norm of diff
+            fst::integrate(intHess,valV_eval,feVelBdry_[l]->NdetJ(),false);
+            // Add to integral value
+            for (int i = 0; i < numCellsSide; ++i) {
+              int cidx = bdryCellLocIds_[l][i];
+              for (int j = 0; j < fv; ++j) {
+                (H[k])(cidx,j) += intHess(i,j);
+              }
+            }
+          }
+        }
+      }
+    }
+    FieldUtils::combineFieldCoeff<Real>(hess, H, fieldInfo_);
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Velocity_NavierStokes::HessVec_12 is zero.");
+  }
+
+  void HessVec_13(scalar_view & hess,
+                  const ROL::Ptr<const std::vector<Real>> & v_param,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    if (z_param != ROL::nullPtr) {
+      // Get relevant dimensions
+      const int c  = feVel_->gradN().extent_int(0);
+      const int fv = feVel_->gradN().extent_int(1);
+      const int fp = fePrs_->gradN().extent_int(1);
+      const int d  = feVel_->gradN().extent_int(3);
+      // Initialize output grad
+      std::vector<scalar_view> H;
+      H.resize(fieldInfo_->numFields);
+      for (int i = 0; i < d; ++i)
+        H[i] = scalar_view("hess", c, fv);
+      H[d] = scalar_view("hess", c, fp);
+      // Compute cost integral
+      const int numLocSides = bdryCellLocIds_.size();
+      for (int l = 0; l < numLocSides; ++l) {
+        const int numCellsSide  = bdryCellLocIds_[l].size();
+        if ( numCellsSide ) {
+          const int numCubPerSide = feVelBdry_[l]->cubPts().extent_int(1);
+          scalar_view valU_eval("valU_eval", numCellsSide, numCubPerSide);
+          scalar_view intHess("intHess", numCellsSide, fv);
+          for (int k = 0; k < d; ++k) {
+            if ((k==0 && !onlyAxial_) || k==1) {
+              for (int i = 0; i < numCellsSide; ++i) {
+                for (int j = 0; j < numCubPerSide; ++j) {
+                  valU_eval(i,j) = -(*v_param)[k];
+                }
+              }
+              // Compute gradient of squared L2-norm of diff
+              fst::integrate(intHess,valU_eval,feVelBdry_[l]->NdetJ(),false);
+              // Add to integral value
+              for (int i = 0; i < numCellsSide; ++i) {
+                int cidx = bdryCellLocIds_[l][i];
+                for (int j = 0; j < fv; ++j)
+                  (H[k])(cidx,j) += intHess(i,j);
+              }
+            }
+          }
+        }
+      }
+      FieldUtils::combineFieldCoeff(hess, H, fieldInfo_);
+    }
+    else {
+      throw Exception::NotImplemented(">>> HessVec_13 not implemented.");
+    }
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Velocity_NavierStokes::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Velocity_NavierStokes::HessVec_22 is zero.");
+  }
+
+  void HessVec_23(scalar_view & hess,
+                  const ROL::Ptr<const std::vector<Real>> & v_param,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Velocity_NavierStokes::HessVec_23 is zero.");
+  }
+
+  std::vector<Real> HessVec_31(std::vector<scalar_view> & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    if (z_param != ROL::nullPtr) {
+      // Get relevant dimensions
+      const int c = feVel_->gradN().extent_int(0);
+      const int d = feVel_->gradN().extent_int(3);
+      // Get components of the state
+      std::vector<scalar_view> V;
+      FieldUtils::splitFieldCoeff<Real>(V, v_coeff, fieldInfo_);
+      // Initialize output val
+      std::vector<Real> h_param(d,static_cast<Real>(0));
+      hess.clear(); hess.resize(d);
+      for (int k = 0; k < d; ++k)
+        hess[k] = scalar_view("hess", c);
+      // Compute cost integral
+      const int numLocSides = bdryCellLocIds_.size();
+      for (int l = 0; l < numLocSides; ++l) {
+        const int numCellsSide  = bdryCellLocIds_[l].size();
+        if ( numCellsSide ) {
+          const int numCubPerSide = feVelBdry_[l]->cubPts().extent_int(1);
+          std::vector<scalar_view> intVal(d);
+          scalar_view valV_eval("valV_eval", numCellsSide, numCubPerSide);
+          scalar_view weight("weight", numCellsSide, numCubPerSide);
+          for (int k = 0; k < d; ++k) {
+            intVal[k] = scalar_view("intVal", numCellsSide);
+            if ((k==0 && !onlyAxial_) || k==1) {
+              scalar_view v_coeff_bdry = getBoundaryCoeff(V[k], l);
+	      Kokkos::deep_copy(valV_eval, static_cast<Real>(0));
+              feVelBdry_[l]->evaluateValue(valV_eval, v_coeff_bdry);
+              for (int i = 0; i < numCellsSide; ++i) {
+                for (int j = 0; j < numCubPerSide; ++j) {
+                  weight(i,j) = static_cast<Real>(1);
+                  valV_eval(i,j) *= static_cast<Real>(-1);
+                }
+              }
+              feVelBdry_[l]->computeIntegral(intVal[k],weight,valV_eval,true);
+            }
+          }
+          // Add to integral value
+          for (int k = 0; k < d; ++k) {
+            for (int i = 0; i < numCellsSide; ++i) {
+              int cidx = bdryCellLocIds_[l][i];
+              (hess[k])(cidx) += (intVal[k])(i);
+            }
+          }
+        }
+      }
+      return h_param;
+    }
+    else {
+      throw Exception::Zero(">>> QoI_Velocity_NavierStokes::HessVec_31 is zero.");
+    }
+  }
+
+  std::vector<Real> HessVec_32(std::vector<scalar_view> & hess,
+                               const scalar_view v_coeff,
+                               const scalar_view u_coeff,
+                               const scalar_view z_coeff = scalar_view(),
+                               const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> HessVec_32 is zero.");
+  }
+
+  std::vector<Real> HessVec_33(std::vector<scalar_view> & hess,
+                               const ROL::Ptr<const std::vector<Real>> & v_param,
+                               const scalar_view u_coeff,
+                               const scalar_view z_coeff = scalar_view(),
+                               const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    if (z_param != ROL::nullPtr) {
+      // Get relevant dimensions
+      const int c = feVel_->gradN().extent_int(0);
+      const int d = feVel_->gradN().extent_int(3);
+      // Get components of the control
+      std::vector<scalar_view> U;
+      FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+      // Initialize output val
+      std::vector<Real> h_param(d,static_cast<Real>(0));
+      hess.clear(); hess.resize(d);
+      for (int k = 0; k < d; ++k)
+        hess[k] = scalar_view("hess", c);
+      // Compute cost integral
+      const int numLocSides = bdryCellLocIds_.size();
+      for (int l = 0; l < numLocSides; ++l) {
+        const int numCellsSide  = bdryCellLocIds_[l].size();
+        if ( numCellsSide ) {
+          const int numCubPerSide = feVelBdry_[l]->cubPts().extent_int(1);
+          std::vector<scalar_view> intVal(d);
+          scalar_view valU_eval("valU_eval", numCellsSide, numCubPerSide);
+          scalar_view weight("weight", numCellsSide, numCubPerSide);
+          for (int k = 0; k < d; ++k) {
+            intVal[k] = scalar_view("intVal", numCellsSide);
+            if ((k==0 && !onlyAxial_) || k==1) {
+              Kokkos::deep_copy(valU_eval, static_cast<Real>(0));
+              for (int i = 0; i < numCellsSide; ++i) {
+                for (int j = 0; j < numCubPerSide; ++j) {
+                  weight(i,j) = static_cast<Real>(1);
+                  valU_eval(i,j) = (*v_param)[k];
+                }
+              }
+              feVelBdry_[l]->computeIntegral(intVal[k],weight,valU_eval,true);
+            }
+          }
+          // Add to integral value
+          for (int k = 0; k < d; ++k) {
+            for (int i = 0; i < numCellsSide; ++i) {
+              int cidx = bdryCellLocIds_[l][i];
+              (hess[k])(cidx) += (intVal[k])(i);
+            }
+          }
+        }
+      }
+      return h_param;
+    }
+    else {
+      throw Exception::Zero(">>> QoI_Velocity_NavierStokes::HessVec_33 is zero.");
+    }
+  }
+
+}; // QoI_Velocity_NavierStokes
+
+
+template <class Real, class DeviceType>
+class QoI_VelocityTracking_NavierStokes : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type>                     feVel_, fePrs_;
+  const ROL::Ptr<const FieldUtils::FieldInfo> fieldInfo_;
+  scalar_view                                 target_, weight_;
+  Real                                        rad_, yvel_, frac_, twpow_;
+  bool                                        onlyAxial_;
+
+  Real xTarget(const std::vector<Real> &x) const {
+    const Real X = x[0], Y = x[1];
+    //return xWeight(x) ? -X*Y/(rad_*rad_-Y*Y) : zero;
+    //return xWeight(x) ? -X*Y/std::sqrt(rad_*rad_-Y*Y) : zero;
+    //return polyWeight(x) * (-X*Y/std::sqrt(rad_*rad_-Y*Y));
+    //return -X*Y/std::sqrt(rad_*rad_-Y*Y);
+    return -X*Y/((rad_*rad_-Y*Y)*(rad_*rad_-Y*Y));
+  }
+
+  Real yTarget(const std::vector<Real> &x) const {
+    const Real one(1), Y = x[1];
+    //return yWeight(x) ? one : zero;
+    //return yWeight(x) ? std::sqrt(rad_*rad_-Y*Y) : zero;
+    //return polyWeight(x) * std::sqrt(rad_*rad_-Y*Y);
+    //return std::sqrt(rad_*rad_-Y*Y);
+    return one/(rad_*rad_-Y*Y);
+  }
+
+  Real xWeight(const std::vector<Real> &x) const {
+    return yWeight(x);
+  }
+
+  Real yWeight(const std::vector<Real> &x) const {
+    //const Real zero(0), one(1), Y = x[1];
+    //return (std::abs(Y) <= frac_*rad_ ? one : zero);
+    return polyWeight(x);
+  }
+
+  Real polyWeight(const std::vector<Real> &x) const {
+    const Real zero(0), one(1), Y = x[1], p = twpow_;
+    const Real yTOP = 9.976339196;
+    const Real yBOT = -yTOP;
+    Real val = 0, at = 0, bt = 0;
+    at = one / std::pow(-yTOP,p);
+    bt = one / std::pow(-yBOT,p);
+    if (Y > zero) val = at*std::pow(Y-yTOP,p);
+    else          val = bt*std::pow(Y-yBOT,p);
+    //std::cout << Y << "  " << val << std::endl;
+    return val;
+  }
+
+public:
+  QoI_VelocityTracking_NavierStokes(ROL::ParameterList                      &list,
+                                    const ROL::Ptr<fe_type>                    &feVel,
+                                    const ROL::Ptr<fe_type>                    &fePrs,
+                                    const ROL::Ptr<const FieldUtils::FieldInfo> &fieldInfo)
+    : feVel_(feVel), fePrs_(fePrs), fieldInfo_(fieldInfo) {
+    rad_         = list.sublist("Problem").get("Diffuser Radius",5.0);
+    yvel_        = list.sublist("Problem").get("Target Axial Velocity",15.0);
+    frac_        = list.sublist("Problem").get("Integration Domain Fraction",0.95);
+    onlyAxial_   = list.sublist("Problem").get("Only Use Axial Velocity",false);
+    twpow_       = list.sublist("Problem").get("Target Weighting Power",0.0);
+    Real xWScal  = list.sublist("Problem").get("Radial Tracking Scale",1.0);
+    Real yWScal  = list.sublist("Problem").get("Axial Tracking Scale",1.0);
+    bool useNorm = list.sublist("Problem").get("Use Normalized Misfit",false);
+    useNorm = onlyAxial_ ? false : useNorm;
+    xWScal  = onlyAxial_ ? static_cast<Real>(0) : xWScal;
+    const int c = fePrs_->gradN().extent_int(0);
+    const int p = fePrs_->gradN().extent_int(2);
+    target_ = scalar_view("target",c,p,2);
+    weight_ = scalar_view("weight",c,p,2);
+    std::vector<Real> x(2);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        x[0] = (feVel_->cubPts())(i,j,0);
+        x[1] = (feVel_->cubPts())(i,j,1);
+        target_(i,j,0) = xTarget(x);
+        target_(i,j,1) = yTarget(x);
+        if (useNorm && yWeight(x)) {
+          xWScal = static_cast<Real>(1)
+                  /(std::pow(target_(i,j,0),2) + std::pow(target_(i,j,1),2));
+          yWScal = xWScal;
+        }
+        weight_(i,j,0) = x[0] * xWScal * xWeight(x);
+        weight_(i,j,1) = x[0] * yWScal * yWeight(x);
+      }
+    }
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c = fePrs_->gradN().extent_int(0);
+    const int p = fePrs_->gradN().extent_int(2);
+    const int d = fePrs_->gradN().extent_int(3);
+    Real yvel = (z_param == ROL::nullPtr) ? yvel_ : (*z_param)[0];
+    // Initialize output val
+    val = scalar_view("val", c);
+    // Get components of the velocity
+    std::vector<scalar_view> U;
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    // Compute cost integral
+    scalar_view vel("vel",c,p,d);
+    scalar_view wvel("wvel",c,p,d);
+    scalar_view velx("velx",c,p);
+    scalar_view vely("vely",c,p);
+    feVel_->evaluateValue(velx, U[0]);
+    feVel_->evaluateValue(vely, U[1]);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        vel(i,j,0)  = velx(i,j)-yvel*target_(i,j,0);
+        vel(i,j,1)  = vely(i,j)-yvel*target_(i,j,1);
+        wvel(i,j,0) = weight_(i,j,0)*vel(i,j,0);
+        wvel(i,j,1) = weight_(i,j,1)*vel(i,j,1);
+      }
+    }
+
+    feVel_->computeIntegral(val,vel,wvel);
+    // Scale by one half
+    rst::scale(val,static_cast<Real>(0.5));
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c  = fePrs_->gradN().extent_int(0);
+    const int fv = feVel_->gradN().extent_int(1);
+    const int fp = fePrs_->gradN().extent_int(1);
+    const int p  = fePrs_->gradN().extent_int(2);
+    const int d  = fePrs_->gradN().extent_int(3);
+    Real yvel = (z_param == ROL::nullPtr) ? yvel_ : (*z_param)[0];
+    // Initialize output grad
+    std::vector<scalar_view> G;
+    G.resize(fieldInfo_->numFields);
+    for (int i = 0; i < d; ++i)
+      G[i] = scalar_view("grad", c, fv);
+    G[d] = scalar_view("grad", c, fp);
+    // Get components of the velocity
+    std::vector<scalar_view> U;
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    // Compute cost integral
+    scalar_view velx("velx",c,p);
+    scalar_view vely("vely",c,p);
+    scalar_view wvelx("wvelx",c,p);
+    scalar_view wvely("wvely",c,p);
+    feVel_->evaluateValue(velx, U[0]);
+    feVel_->evaluateValue(vely, U[1]);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        wvelx(i,j) = weight_(i,j,0)*(velx(i,j)-yvel*target_(i,j,0));
+        wvely(i,j) = weight_(i,j,1)*(vely(i,j)-yvel*target_(i,j,1));
+      }
+    }
+
+    fst::integrate(G[0],wvelx,feVel_->NdetJ(),false);
+    fst::integrate(G[1],wvely,feVel_->NdetJ(),false);
+    FieldUtils::combineFieldCoeff<Real>(grad, G, fieldInfo_);
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> gradient_2 is zero.");
+  }
+
+  std::vector<Real> gradient_3(std::vector<scalar_view> & grad,
+                               const scalar_view u_coeff,
+                               const scalar_view z_coeff = scalar_view(),
+                               const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    if (z_param != ROL::nullPtr) {
+      // Get relevant dimensions
+      const int c = fePrs_->gradN().extent_int(0);
+      const int p = fePrs_->gradN().extent_int(2);
+      const int d = fePrs_->gradN().extent_int(3);
+      Real yvel = (z_param == ROL::nullPtr) ? yvel_ : (*z_param)[0];
+      // Initialize output val
+      std::vector<Real> g_param(d,static_cast<Real>(0));
+      grad.clear(); grad.resize(d);
+      for (int i = 0; i < d; ++i)
+        grad[i] = scalar_view("grad", c);
+      // Get components of the velocity
+      std::vector<scalar_view> U;
+      FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+      // Compute cost integral
+      scalar_view vel("vel",c,p,d);
+      scalar_view velx("velx",c,p);
+      scalar_view vely("vely",c,p);
+      feVel_->evaluateValue(velx, U[0]);
+      feVel_->evaluateValue(vely, U[1]);
+      for (int i = 0; i < c; ++i) {
+        for (int j = 0; j < p; ++j) {
+          vel(i,j,0)  = -weight_(i,j,0)*(velx(i,j)-yvel*target_(i,j,0));
+          vel(i,j,1)  = -weight_(i,j,1)*(vely(i,j)-yvel*target_(i,j,1));
+        }
+      }
+
+      feVel_->computeIntegral(grad[0],vel,target_);
+      
+      return g_param;
+    }
+    else {
+      throw Exception::Zero(">>> gradient_3 is zero.");
+    }
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c  = fePrs_->gradN().extent_int(0);
+    const int fv = feVel_->gradN().extent_int(1);
+    const int fp = fePrs_->gradN().extent_int(1);
+    const int p  = fePrs_->gradN().extent_int(2);
+    const int d  = fePrs_->gradN().extent_int(3);
+    // Initialize output grad
+    std::vector<scalar_view> H;
+    H.resize(fieldInfo_->numFields);
+    for (int i = 0; i < d; ++i)
+      H[i] = scalar_view("hess", c, fv);
+    H[d] = scalar_view("hess", c, fp);
+    // Get components of the velocity
+    std::vector<scalar_view> V;
+    FieldUtils::splitFieldCoeff<Real>(V, v_coeff, fieldInfo_);
+    // Compute cost integral
+    scalar_view velx("velx", c,p);
+    scalar_view vely("vely", c,p);
+    scalar_view wvelx("wvelx", c,p);
+    scalar_view wvely("wvely", c,p);
+    feVel_->evaluateValue(velx, V[0]);
+    feVel_->evaluateValue(vely, V[1]);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        wvelx(i,j)  = weight_(i,j,0)*velx(i,j);
+        wvely(i,j)  = weight_(i,j,1)*vely(i,j);
+      }
+    }
+
+    fst::integrate(H[0],wvelx,feVel_->NdetJ(),false);
+    fst::integrate(H[1],wvely,feVel_->NdetJ(),false);
+    FieldUtils::combineFieldCoeff<Real>(hess, H, fieldInfo_);
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> HessVec_12 is zero.");
+  }
+
+  void HessVec_13(scalar_view & hess,
+                  const ROL::Ptr<const std::vector<Real>> & v_param,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    if (z_param != ROL::nullPtr) {
+      // Get relevant dimensions
+      const int c  = fePrs_->gradN().extent_int(0);
+      const int fv = feVel_->gradN().extent_int(1);
+      const int fp = fePrs_->gradN().extent_int(1);
+      const int p  = fePrs_->gradN().extent_int(2);
+      const int d  = fePrs_->gradN().extent_int(3);
+      // Initialize output val
+      std::vector<scalar_view> H;
+      H.resize(fieldInfo_->numFields);
+      for (int i = 0; i < d; ++i)
+        H[i] = scalar_view("hess", c, fv);
+      H[d] = scalar_view("hess", c, fp);
+      // Compute cost integral
+      scalar_view wvelx("wvelx", c,p);
+      scalar_view wvely("wvely", c,p);
+
+      for (int i = 0; i < c; ++i) {
+        for (int j = 0; j < p; ++j) {
+          wvelx(i,j) = -weight_(i,j,0)*target_(i,j,0)*(*v_param)[0];
+          wvely(i,j) = -weight_(i,j,1)*target_(i,j,1)*(*v_param)[0];
+        }
+      }
+
+      fst::integrate(H[0],wvelx,feVel_->NdetJ(),false);
+      fst::integrate(H[1],wvely,feVel_->NdetJ(),false);
+      FieldUtils::combineFieldCoeff<Real>(hess, H, fieldInfo_);
+    }
+    else {
+      throw Exception::Zero(">>> HessVec_13 is zero.");
+    }
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> HessVec_22 is zero.");
+  }
+
+  void HessVec_23(scalar_view & hess,
+                  const ROL::Ptr<const std::vector<Real>> & v_param,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> HessVec_23 is zero.");
+  }
+
+  std::vector<Real> HessVec_31(std::vector<scalar_view> & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    if (z_param != ROL::nullPtr) {
+      // Get relevant dimensions
+      const int c = fePrs_->gradN().extent_int(0);
+      const int p = fePrs_->gradN().extent_int(2);
+      const int d = fePrs_->gradN().extent_int(3);
+      // Initialize output val
+      std::vector<Real> h_param(d,static_cast<Real>(0));
+      hess.clear(); hess.resize(d);
+      for (int k = 0; k < d; ++k)
+        hess[k] = scalar_view("hess", c);
+      // Get components of the velocity
+      std::vector<scalar_view> V;
+      FieldUtils::splitFieldCoeff<Real>(V, v_coeff, fieldInfo_);
+      // Compute cost integral
+      scalar_view velx("velx",c,p);
+      scalar_view vely("vely",c,p);
+      scalar_view wvel("wvel",c,p,d);
+      feVel_->evaluateValue(velx, V[0]);
+      feVel_->evaluateValue(vely, V[1]);
+      for (int i = 0; i < c; ++i) {
+        for (int j = 0; j < p; ++j) {
+          wvel(i,j,0) = -weight_(i,j,0)*velx(i,j);
+          wvel(i,j,1) = -weight_(i,j,1)*vely(i,j);
+        }
+      }
+
+      feVel_->computeIntegral(hess[0],wvel,target_);
+
+      return h_param;
+    }
+    else {
+      throw Exception::Zero(">>> HessVec_31 is zero.");
+    }
+  }
+
+  std::vector<Real> HessVec_32(std::vector<scalar_view> & hess,
+                               const scalar_view v_coeff,
+                               const scalar_view u_coeff,
+                               const scalar_view z_coeff = scalar_view(),
+                               const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> HessVec_32 is zero.");
+  }
+
+  std::vector<Real> HessVec_33(std::vector<scalar_view> & hess,
+                               const ROL::Ptr<const std::vector<Real>> & v_param,
+                               const scalar_view u_coeff,
+                               const scalar_view z_coeff = scalar_view(),
+                               const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    if (z_param != ROL::nullPtr) {
+      // Get relevant dimensions
+      const int c = fePrs_->gradN().extent_int(0);
+      const int p = fePrs_->gradN().extent_int(2);
+      const int d = fePrs_->gradN().extent_int(3);
+      // Initialize output val
+      std::vector<Real> h_param(d,static_cast<Real>(0));
+      hess.clear(); hess.resize(d);
+      for (int k = 0; k < d; ++k)
+        hess[k] = scalar_view("hess", c);
+      // Compute cost integral
+      scalar_view wtarget("wtarget",c,p,d);
+      for (int i = 0; i < c; ++i) {
+        for (int j = 0; j < p; ++j) {
+          for (int k = 0; k < d; ++k) {
+            wtarget(i,j,k) = weight_(i,j,k)*target_(i,j,k);
+          }
+        }
+      }
+      feVel_->computeIntegral(hess[0],wtarget,target_);
+      rst::scale(hess[0],(*v_param)[0]);
+      
+      return h_param;
+    }
+    else {
+      throw Exception::Zero(">>> HessVec_33 is zero.");
+    }
+  }
+
+}; // QoI_Velocity_Darcy2
+
+
+template <class Real, class DeviceType>
+class QoI_Power_NavierStokes : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type> feVel_, fePrs_, feCtrl_;
+  const ROL::Ptr<const FieldUtils::FieldInfo> fieldInfo_, fieldInfoCtrl_;
+
+  ROL::Ptr<Impermeability<Real,DeviceType>> imp_;
+  Real viscosity_;
+
+public:
+  QoI_Power_NavierStokes(ROL::ParameterList                      &list,
+                         const ROL::Ptr<fe_type>                    &feVel,
+                         const ROL::Ptr<fe_type>                    &fePrs,
+                         const ROL::Ptr<fe_type>                    &feCtrl,
+                         const ROL::Ptr<const FieldUtils::FieldInfo> &fieldInfo,
+                         const ROL::Ptr<const FieldUtils::FieldInfo> &fieldInfoCtrl)
+    : feVel_(feVel), fePrs_(fePrs), feCtrl_(feCtrl), fieldInfo_(fieldInfo), fieldInfoCtrl_(fieldInfoCtrl) {
+    viscosity_ = list.sublist("Problem").get("Viscosity",5e-3);
+    imp_ = ROL::makePtr<Impermeability<Real,DeviceType>>(list);
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c = feVel_->gradN().extent_int(0);
+    const int p = feVel_->gradN().extent_int(2);
+    const int d = feVel_->gradN().extent_int(3);
+    // Initialize output val
+    val = scalar_view("val", c);
+    // Get components of the control
+    std::vector<scalar_view> U, Z;
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoCtrl_);
+    // Evaluate on FE basis
+    scalar_view Zval("Zval", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view Uval("Uval", c, p);
+    scalar_view aUval("aUval", c, p);
+    scalar_view Ugrad("Ugrad", c, p, d);
+    scalar_view nUgrad("nUgrad", c, p, d);
+    feCtrl_->evaluateValue(Zval, Z[0]);
+    imp_->compute(alpha, Zval, feVel_->cubPts(), 0); 
+    for (int i = 0; i < d; ++i) {
+      Kokkos::deep_copy(Uval, static_cast<Real>(0));
+      Kokkos::deep_copy(Ugrad, static_cast<Real>(0));
+      Kokkos::deep_copy(aUval, static_cast<Real>(0));
+      Kokkos::deep_copy(nUgrad, static_cast<Real>(0));
+      // Evaluate on FE basis
+      feVel_->evaluateValue(Uval, U[i]);
+      feVel_->evaluateGradient(Ugrad, U[i]);
+      // Scale
+      fst::scalarMultiplyDataData(aUval, alpha, Uval);
+      rst::scale(nUgrad, Ugrad, viscosity_);
+      // Integrate
+      feVel_->computeIntegral(val, nUgrad, Ugrad, true);
+      feVel_->computeIntegral(val, aUval, Uval, true);
+    }
+    rst::scale(val, static_cast<Real>(0.5));
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fv = feVel_->gradN().extent_int(1);
+    const int fp = fePrs_->gradN().extent_int(1);
+    const int p  = feVel_->gradN().extent_int(2);
+    const int d  = feVel_->gradN().extent_int(3);
+    // Initialize output grad
+    std::vector<scalar_view> G;
+    G.resize(fieldInfo_->numFields);
+    for (int i = 0; i < d; ++i)
+      G[i] = scalar_view("grad", c, fv);
+    G[d] = scalar_view("grad", c, fp);
+    // Get components of the control
+    std::vector<scalar_view> U, Z;
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoCtrl_);
+    // Create storage
+    scalar_view Zval("Zval", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view Uval("Uval", c, p);
+    scalar_view aUval("aUval", c, p);
+    scalar_view Ugrad("Ugrad", c, p, d);
+    feCtrl_->evaluateValue(Zval, Z[0]);
+    imp_->compute(alpha, Zval, feVel_->cubPts(), 0); 
+    for (int i = 0; i < d; ++i) {
+      Kokkos::deep_copy(Uval, static_cast<Real>(0));
+      Kokkos::deep_copy(Ugrad, static_cast<Real>(0));
+      Kokkos::deep_copy(aUval, static_cast<Real>(0));
+      // Evaluate on FE basis
+      feVel_->evaluateValue(Uval, U[i]);
+      feVel_->evaluateGradient(Ugrad, U[i]);
+      // Scale
+      fst::scalarMultiplyDataData(aUval, alpha, Uval);
+      rst::scale(Ugrad, viscosity_);
+      // Integrate
+      fst::integrate(G[i],Ugrad,feVel_->gradNdetJ(),false);
+      fst::integrate(G[i],aUval,feVel_->NdetJ(),true);
+    }
+    FieldUtils::combineFieldCoeff<Real>(grad, G, fieldInfo_);
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int p  = feVel_->gradN().extent_int(2);
+    const int d  = feVel_->gradN().extent_int(3);
+    // Initialize output grad
+    std::vector<scalar_view> G;
+    G.resize(fieldInfoCtrl_->numFields);
+    G[0] = scalar_view("grad", c, fc);
+    // Get components of the control
+    std::vector<scalar_view> U, Z;
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoCtrl_);
+    // Create storage
+    scalar_view Zval("Zval", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view Uval("Uval", c, p);
+    scalar_view aUval("aUval", c, p);
+    scalar_view aU2val("aU2val", c, p);
+    feCtrl_->evaluateValue(Zval, Z[0]);
+    imp_->compute(alpha, Zval, feVel_->cubPts(), 1); 
+    for (int i = 0; i < d; ++i) {
+      Kokkos::deep_copy(Uval, static_cast<Real>(0));
+      Kokkos::deep_copy(aUval, static_cast<Real>(0));
+      Kokkos::deep_copy(aU2val, static_cast<Real>(0));
+      // Evaluate on FE basis
+      feVel_->evaluateValue(Uval, U[i]);
+      // Scale
+      fst::scalarMultiplyDataData(aUval, alpha, Uval);
+      fst::scalarMultiplyDataData(aU2val, aUval, Uval);
+      // Integrate
+      fst::integrate(G[0],aU2val,feCtrl_->NdetJ(),true);
+    }
+    rst::scale(G[0], static_cast<Real>(0.5));
+    FieldUtils::combineFieldCoeff<Real>(grad, G, fieldInfoCtrl_);
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fv = feVel_->gradN().extent_int(1);
+    const int fp = fePrs_->gradN().extent_int(1);
+    const int p  = feVel_->gradN().extent_int(2);
+    const int d  = feVel_->gradN().extent_int(3);
+    // Initialize output grad
+    std::vector<scalar_view> H;
+    H.resize(fieldInfo_->numFields);
+    for (int i = 0; i < d; ++i)
+      H[i] = scalar_view("hess", c, fv);
+    H[d] = scalar_view("hess", c, fp);
+    // Get components of the control
+    std::vector<scalar_view> V, Z;
+    FieldUtils::splitFieldCoeff<Real>(V, v_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoCtrl_);
+    // Create storage
+    scalar_view Zval("Zval", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view Vval("Vval", c, p);
+    scalar_view aVval("aVval", c, p);
+    scalar_view Vgrad("Vgrad", c, p, d);
+    feCtrl_->evaluateValue(Zval, Z[0]);
+    imp_->compute(alpha, Zval, feVel_->cubPts(), 0); 
+    for (int i = 0; i < d; ++i) {
+      Kokkos::deep_copy(Vval, static_cast<Real>(0));
+      Kokkos::deep_copy(Vgrad, static_cast<Real>(0));
+      Kokkos::deep_copy(aVval, static_cast<Real>(0));
+      // Evaluate on FE basis
+      feVel_->evaluateValue(Vval, V[i]);
+      feVel_->evaluateGradient(Vgrad, V[i]);
+      // Scale
+      fst::scalarMultiplyDataData(aVval, alpha, Vval);
+      rst::scale(Vgrad, viscosity_);
+      // Integrate
+      fst::integrate(H[i],Vgrad,feVel_->gradNdetJ(),false);
+      fst::integrate(H[i],aVval,feVel_->NdetJ(),true);
+    }
+    FieldUtils::combineFieldCoeff<Real>(hess, H, fieldInfo_);
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fv = feVel_->gradN().extent_int(1);
+    const int fp = fePrs_->gradN().extent_int(1);
+    const int p  = feVel_->gradN().extent_int(2);
+    const int d  = feVel_->gradN().extent_int(3);
+    // Initialize output grad
+    std::vector<scalar_view> H;
+    H.resize(fieldInfo_->numFields);
+    for (int i = 0; i < d; ++i)
+      H[i] = scalar_view("hess", c, fv);
+    H[d] = scalar_view("hess", c, fp);
+    // Get components of the control
+    std::vector<scalar_view> V, U, Z;
+    FieldUtils::splitFieldCoeff<Real>(V, v_coeff, fieldInfoCtrl_);
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoCtrl_);
+    // Create storage
+    scalar_view Zval("Zval", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view Uval("Uval", c, p);
+    scalar_view aUval("Uval", c, p);
+    scalar_view Vval("Vval", c, p);
+    scalar_view aVval("Vval", c, p);
+    feCtrl_->evaluateValue(Vval, V[0]);
+    feCtrl_->evaluateValue(Zval, Z[0]);
+    imp_->compute(alpha, Zval, feVel_->cubPts(), 1); 
+    fst::scalarMultiplyDataData(aVval, alpha, Vval);
+    for (int i = 0; i < d; ++i) {
+      Kokkos::deep_copy(Uval, static_cast<Real>(0));
+      Kokkos::deep_copy(aUval, static_cast<Real>(0));
+      // Evaluate on FE basis
+      feVel_->evaluateValue(Uval, U[i]);
+      // Scale
+      fst::scalarMultiplyDataData(aUval, aVval, Uval);
+      // Integrate
+      fst::integrate(H[i],aUval,feVel_->NdetJ(),false);
+    }
+    FieldUtils::combineFieldCoeff<Real>(hess, H, fieldInfo_);
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int p  = feVel_->gradN().extent_int(2);
+    const int d  = feVel_->gradN().extent_int(3);
+    // Initialize output grad
+    std::vector<scalar_view> H;
+    H.resize(fieldInfoCtrl_->numFields);
+    H[0] = scalar_view("hess", c, fc);
+    // Get components of the control
+    std::vector<scalar_view> V, U, Z;
+    FieldUtils::splitFieldCoeff<Real>(V, v_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoCtrl_);
+    // Create storage
+    scalar_view Zval("Zval", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view Uval("Uval", c, p);
+    scalar_view aUval("aUval", c, p);
+    scalar_view aUVval("aUVval", c, p);
+    scalar_view Vval("Vval", c, p);
+    feCtrl_->evaluateValue(Zval, Z[0]);
+    imp_->compute(alpha, Zval, feVel_->cubPts(), 1); 
+    for (int i = 0; i < d; ++i) {
+      Kokkos::deep_copy(Vval, static_cast<Real>(0));
+      Kokkos::deep_copy(Uval, static_cast<Real>(0));
+      Kokkos::deep_copy(aUval, static_cast<Real>(0));
+      Kokkos::deep_copy(aUVval, static_cast<Real>(0));
+      // Evaluate on FE basis
+      feVel_->evaluateValue(Vval, V[i]);
+      feVel_->evaluateValue(Uval, U[i]);
+      // Scale
+      fst::scalarMultiplyDataData(aUval, alpha, Uval);
+      fst::scalarMultiplyDataData(aUVval, aUval, Vval);
+      // Integrate
+      fst::integrate(H[0],aUVval,feCtrl_->NdetJ(),true);
+    }
+    FieldUtils::combineFieldCoeff<Real>(hess, H, fieldInfoCtrl_);
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int p  = feVel_->gradN().extent_int(2);
+    const int d  = feVel_->gradN().extent_int(3);
+    // Initialize output grad
+    std::vector<scalar_view> H;
+    H.resize(fieldInfoCtrl_->numFields);
+    H[0] = scalar_view(c, fc);
+    // Get components of the control
+    std::vector<scalar_view> V, U, Z;
+    FieldUtils::splitFieldCoeff<Real>(V, v_coeff, fieldInfoCtrl_);
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoCtrl_);
+    // Create storage
+    scalar_view Zval("Zval", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view Uval("Uval", c, p);
+    scalar_view aUval("aUval", c, p);
+    scalar_view aU2val("aU2val", c, p);
+    scalar_view Vval("Vval", c, p);
+    scalar_view aVval("aVval", c, p);
+    feCtrl_->evaluateValue(Vval, V[0]);
+    feCtrl_->evaluateValue(Zval, Z[0]);
+    imp_->compute(alpha, Zval, feVel_->cubPts(), 2); 
+    fst::scalarMultiplyDataData(aVval, alpha, Vval);
+    for (int i = 0; i < d; ++i) {
+      Kokkos::deep_copy(Uval, static_cast<Real>(0));
+      Kokkos::deep_copy(aUval, static_cast<Real>(0));
+      Kokkos::deep_copy(aU2val, static_cast<Real>(0));
+      // Evaluate on FE basis
+      feVel_->evaluateValue(Uval, U[i]);
+      // Scale
+      fst::scalarMultiplyDataData(aUval, aVval, Uval);
+      fst::scalarMultiplyDataData(aU2val, aUval, Uval);
+      // Integrate
+      fst::integrate(H[0],aU2val,feCtrl_->NdetJ(),true);
+    }
+    rst::scale(H[0], static_cast<Real>(0.5));
+    FieldUtils::combineFieldCoeff<Real>(hess, H, fieldInfoCtrl_);
+  }
+
+}; // QoI_Power_NavierStokes
+
+template <class Real, class DeviceType>
+class QoI_State_NavierStokes : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  ROL::Ptr<QoI<Real,DeviceType>> qoi_;
+
+public:
+  QoI_State_NavierStokes(ROL::ParameterList &parlist,
+                         const ROL::Ptr<fe_type> &feVel,
+                         const ROL::Ptr<fe_type> &fePrs,
+                         const ROL::Ptr<const FieldUtils::FieldInfo> &fieldInfo) {
+    std::string stateObj = parlist.sublist("Problem").get("Objective type","Vorticity");
+    if ( stateObj != "Vorticity"
+      && stateObj != "Circulation"
+      && stateObj != "Directional"
+      && stateObj != "Pressure"
+      && stateObj != "Velocity"
+      && stateObj != "VelocityTracking"
+      && stateObj != "Power" ) {
+      throw Exception::NotImplemented(">>> (QoI_State_NavierStokes): Unknown objective type."); 
+    }
+    if ( stateObj == "Vorticity" )
+      qoi_ = ROL::makePtr<QoI_Vorticity_NavierStokes<Real,DeviceType>>(feVel,fePrs,fieldInfo);
+    else if ( stateObj == "Directional" )
+      qoi_ = ROL::makePtr<QoI_Directional_NavierStokes<Real,DeviceType>>(parlist,feVel,fePrs,fieldInfo);
+    else if ( stateObj == "Pressure" )
+      qoi_ = ROL::makePtr<QoI_Pressure_NavierStokes<Real,DeviceType>>(feVel,fePrs,fieldInfo);
+    else if ( stateObj == "Velocity" )
+      throw Exception::NotImplemented(">>> (QoI_State_NavierStokes): Incorrect Constructor.");
+    else if ( stateObj == "VelocityTracking" )
+      qoi_ = ROL::makePtr<QoI_VelocityTracking_NavierStokes<Real,DeviceType>>(parlist,feVel,fePrs,fieldInfo);
+    else if ( stateObj == "Power" )
+      throw Exception::NotImplemented(">>> (QoI_State_NavierStokes): Incorrect Constructor.");
+    else
+      qoi_ = ROL::makePtr<QoI_Circulation_NavierStokes<Real,DeviceType>>(feVel,fePrs,fieldInfo);
+  }
+  QoI_State_NavierStokes(ROL::ParameterList   &list,
+                         const ROL::Ptr<fe_type> &feVel,
+                         const ROL::Ptr<fe_type> &fePrs,
+                         const std::vector<ROL::Ptr<fe_type>> &feVelBdry,
+                            const std::vector<std::vector<int>>   &bdryCellLocIds,
+                            const ROL::Ptr<const FieldUtils::FieldInfo> &fieldInfo) {
+    std::string stateObj = list.sublist("Problem").get("Objective type","Vorticity");
+    if ( stateObj != "Velocity" ) {
+      throw Exception::NotImplemented(">>> (QoI_State_NavierStokes): Incorrect Constructor.");
+    }
+    qoi_ = ROL::makePtr<QoI_Velocity_NavierStokes<Real,DeviceType>>(list,feVel,fePrs,feVelBdry,bdryCellLocIds,fieldInfo);
+  }
+  QoI_State_NavierStokes(ROL::ParameterList &list,
+                         const ROL::Ptr<fe_type> &feVel,
+                         const ROL::Ptr<fe_type> &fePrs,
+                         const ROL::Ptr<fe_type> &feCtrl,
+                         const ROL::Ptr<const FieldUtils::FieldInfo> &fieldInfo,
+                         const ROL::Ptr<const FieldUtils::FieldInfo> &fieldInfoCtrl) {
+    std::string stateObj = list.sublist("Problem").get("Objective type","Vorticity");
+    if ( stateObj != "Power" ) {
+      throw Exception::NotImplemented(">>> (QoI_State_NavierStokes): Incorrect Constructor.");
+    }
+    qoi_ = ROL::makePtr<QoI_Power_NavierStokes<Real,DeviceType>>(feVel,fePrs,feCtrl,fieldInfo,fieldInfoCtrl);
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    return qoi_->value(val, u_coeff, z_coeff, z_param);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    qoi_->gradient_1(grad, u_coeff, z_coeff, z_param);
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    qoi_->gradient_2(grad, u_coeff, z_coeff, z_param);
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    qoi_->HessVec_11(hess, v_coeff, u_coeff, z_coeff, z_param);
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    qoi_->HessVec_12(hess, v_coeff, u_coeff, z_coeff, z_param);
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    qoi_->HessVec_21(hess, v_coeff, u_coeff, z_coeff, z_param);
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    qoi_->HessVec_22(hess, v_coeff, u_coeff, z_coeff, z_param);
+  }
+
+};
+
+template <class Real, class DeviceType>
+class QoI_Volume_NavierStokes : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type> fe_;
+  const ROL::Ptr<const FieldUtils::FieldInfo> fieldInfo_;
+
+  scalar_view weight_;
+  Real volFraction_;
+
+public:
+  QoI_Volume_NavierStokes(ROL::ParameterList &list,
+                          const ROL::Ptr<fe_type> &fe,
+                          const ROL::Ptr<const FieldUtils::FieldInfo> &fieldInfo)
+    : fe_(fe), fieldInfo_(fieldInfo) {
+    const Real tol = std::sqrt(ROL::ROL_EPSILON<Real>());
+    const Real zero(0), one(1);
+    volFraction_   = list.sublist("Problem").get("Volume Fraction",0.5);
+    Real outHeight = list.sublist("Problem").get("Outlet Height",1.5*1.6936);
+    const int c = fe_->cubPts().extent_int(0);
+    const int p = fe_->cubPts().extent_int(1);
+    const int d = fe_->cubPts().extent_int(2);
+    std::vector<Real> pt(d);
+    weight_ = scalar_view("weight", c, p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k)
+          pt[k] = (fe_->cubPts())(i,j,k);
+        weight_(i,j) = (pt[1] <= outHeight+tol ? one : zero);
+      }
+    }
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c = fe_->cubPts().extent_int(0);
+    const int p = fe_->cubPts().extent_int(1);
+    // Initialize output val
+    val = scalar_view("val", c);
+    // Get components of the control
+    std::vector<scalar_view> Z;
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfo_);
+    // Evaluate on FE basis
+    scalar_view Z0("Z0", c, p);
+    fe_->evaluateValue(Z0, Z[0]);
+    // Integrate the density minus volume fraction
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j)
+        Z0(i,j) -= volFraction_;
+    }
+    fe_->computeIntegral(val,weight_,Z0,true);
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Volume_NavierStokes::gradient_1 is zero.");
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c = fe_->cubPts().extent_int(0);
+    const int f = fe_->N().extent_int(1);
+    // Initialize output grad
+    std::vector<scalar_view> G;
+    G.resize(fieldInfo_->numFields);
+    G[0] = scalar_view("grad", c, f);
+    // Integrate density
+    fst::integrate(G[0],weight_,fe_->NdetJ(),false);
+    FieldUtils::combineFieldCoeff(grad, G, fieldInfo_);
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Volume_NavierStokes::HessVec_11 is zero.");
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Volume_NavierStokes::HessVec_12 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Volume_NavierStokes::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Volume_NavierStokes::HessVec_22 is zero.");
+  }
+
+}; // QoI_Volume_NavierStokes
+
+template <class Real>
+class StdObjective_NavierStokes : public ROL::StdObjective<Real> {
+private:
+  std::string stateObj_;
+
+public:
+  StdObjective_NavierStokes(ROL::ParameterList &parlist) {
+    stateObj_ = parlist.sublist("Problem").get("Objective type","Vorticity");
+    if ( stateObj_ != "Vorticity"
+      && stateObj_ != "Circulation"
+      && stateObj_ != "Directional"
+      && stateObj_ != "Pressure"
+      && stateObj_ != "Velocity"
+      && stateObj_ != "VelocityTracking"
+      && stateObj_ != "Power" ) {
+      throw Exception::NotImplemented(">>> (StdObjective_NavierStokes): Unknown objective type."); 
+    }
+  }
+
+  Real value(const std::vector<Real> &x, Real &tol) {
+    Real val(0);
+    if ( stateObj_ == "Vorticity"
+      || stateObj_ == "Directional"
+      || stateObj_ == "Pressure"
+      || stateObj_ == "Velocity"
+      || stateObj_ == "VelocityTracking"
+      || stateObj_ == "Power" ) {
+      val = x[0];
+    }
+    else {
+      val = static_cast<Real>(0.5)*x[0]*x[0];
+    }
+    return val;
+  }
+
+  void gradient(std::vector<Real> &g, const std::vector<Real> &x, Real &tol) {
+    const Real one(1);
+    if ( stateObj_ == "Vorticity"
+      || stateObj_ == "Directional"
+      || stateObj_ == "Pressure"
+      || stateObj_ == "Velocity"
+      || stateObj_ == "VelocityTracking"
+      || stateObj_ == "Power" ) {
+      g[0] = one;
+    }
+    else {
+      g[0] = x[0];
+    }
+  }
+
+  void hessVec(std::vector<Real> &hv, const std::vector<Real> &v, const std::vector<Real> &x, Real &tol) {
+    const Real zero(0);
+    if ( stateObj_ == "Vorticity"
+      || stateObj_ == "Directional"
+      || stateObj_ == "Pressure"
+      || stateObj_ == "Velocity"
+      || stateObj_ == "VelocityTracking"
+      || stateObj_ == "Power" ) {
+      hv[0] = zero;
+    }
+    else {
+      hv[0] = v[0];
+    }
+  }
+
+}; // OBJ_SCALAR
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/brinkman/pde_brinkmanK.hpp b/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/brinkman/pde_brinkmanK.hpp
new file mode 100644
index 000000000000..b027b4518c33
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/brinkman/pde_brinkmanK.hpp
@@ -0,0 +1,1246 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  pde_brinkman.hpp
+    \brief Implements the local PDE interface for the Brinkman porosity optimization problem.
+
+    Implementes the weak form of the Navier-Stokes-Brinkman equations
+    \f[
+       \begin{aligned}
+         -\nabla\cdot(\mu\nabla u) + \rho (u\cdot\nabla) u + \nabla p &= -\mu K(z)^{-1}u \\
+         \nabla\cdot u &=0
+    \f]
+*/
+
+#ifndef PDE_BRINKMANK_HPP
+#define PDE_BRINKMANK_HPP
+
+#include "../../../../TOOLS/pdeK.hpp"
+#include "../../../../TOOLS/feK.hpp"
+#include "../../../../TOOLS/fieldhelperK.hpp"
+#include "impermeabilityK.hpp"
+
+#include "Intrepid2_HVOL_C0_FEM.hpp"
+#include "Intrepid2_HGRAD_QUAD_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_QUAD_C2_FEM.hpp"
+#include "Intrepid2_HGRAD_TRI_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_TRI_C2_FEM.hpp"
+#include "Intrepid2_DefaultCubatureFactory.hpp"
+#include "Intrepid2_FunctionSpaceTools.hpp"
+#include "Intrepid2_CellTools.hpp"
+
+#include "ROL_Ptr.hpp"
+
+template <class Real, class DeviceType>
+class PDE_NavierStokes : public PDE<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  // Finite element basis information
+  basis_ptr basisPtrVel_, basisPtrPrs_, basisPtrCtrl_;
+  std::vector<basis_ptr> basisPtrs_, basisPtrsCtrl_;
+  // Cell cubature information
+  ROL::Ptr<Intrepid2::Cubature<DeviceType,Real,Real>> cellCub_, bdryCub_;
+  // Cell node information
+  scalar_view volCellNodes_;
+  std::vector<std::vector<scalar_view>> bdryCellNodes_;
+  std::vector<std::vector<std::vector<int>>> bdryCellLocIds_;
+  // Finite element definition
+  ROL::Ptr<fe_type> feVel_, fePrs_, feCtrl_;
+  std::vector<std::vector<ROL::Ptr<fe_type>>> feVelBdry_;
+  // Local degrees of freedom on boundary, for each side of the reference cell (first index).
+  std::vector<std::vector<int>> fvidx_, fpidx_;
+  // Coordinates of degrees freedom on boundary cells.
+  // Indexing:  [sideset number][local side id](cell number, value at dof)
+  std::vector<std::vector<scalar_view>> bdryCellDofValues_;
+  // Field pattern, offsets, etc.
+  std::vector<std::vector<int>> fieldPattern_;  // local Field/DOF pattern; set from DOF manager 
+  int numFields_;                               // number of fields (equations in the PDE)
+  int numDofs_;                                 // total number of degrees of freedom for all (local) fields
+  std::vector<int> offset_;                     // for each field, a counting offset
+  std::vector<int> numFieldDofs_;               // for each field, number of degrees of freedom
+  std::vector<std::vector<int>> fieldPatternCtrl_;  // local Field/DOF pattern; set from DOF manager 
+  int numFieldsCtrl_;                               // number of fields (equations in the PDE)
+  int numDofsCtrl_;                                 // total number of degrees of freedom for all (local) fields
+  std::vector<int> offsetCtrl_;                     // for each field, a counting offset
+  std::vector<int> numFieldDofsCtrl_;               // for each field, number of degrees of freedom
+  
+  // Problem parameters.
+  Real Patm_, density_, viscosity_, inVelocity_;
+  bool useStokes_, useNoSlip_, usePresOut_;
+
+  ROL::Ptr<Impermeability<Real,DeviceType>> imp_;
+  ROL::Ptr<FieldHelper<Real,DeviceType>> fieldHelper_;
+  ROL::Ptr<FieldUtils::FieldInfo> fieldInfo_, fieldInfoCtrl_;
+
+  // Extract velocity coefficients on boundary.
+  scalar_view getBoundaryCoeff(const scalar_view cell_coeff, int sideSet, int cell) const {
+    std::vector<int> bdryCellLocId = bdryCellLocIds_[sideSet][cell];
+    const int numCellsSide = bdryCellLocId.size();
+    const int f = basisPtrVel_->getCardinality();
+    
+    scalar_view bdry_coeff("bdry_coeff", numCellsSide, f);
+    for (int i = 0; i < numCellsSide; ++i) {
+      for (int j = 0; j < f; ++j) {
+        bdry_coeff(i, j) = cell_coeff(bdryCellLocId[i], j);
+      }
+    }
+    return bdry_coeff;
+  }
+
+  Real DirichletFunc(const std::vector<Real> & coords, int sideset, int locSideId, int dir) const {
+    const std::vector<Real> param = PDE<Real,DeviceType>::getParameter();
+    Real val(0);
+    // sideset == 0: Inflow
+    // sideset == 1: No Slip
+    // sideset == 2: Outflow
+    // sideset == 3: No Normal
+    if (sideset==0) val = (dir==1 ? inVelocity_ : static_cast<Real>(0));
+    return val;
+  }
+
+  void computeDirichlet(void) {
+    // Compute Dirichlet values at DOFs.
+    int d = basisPtrVel_->getBaseCellTopology().getDimension();
+    int numSidesets = bdryCellLocIds_.size();
+    bdryCellDofValues_.resize(numSidesets);
+    for (int i=0; i<numSidesets; ++i) {
+      int numLocSides = bdryCellLocIds_[i].size();
+      bdryCellDofValues_[i].resize(numLocSides);
+      for (int j=0; j<numLocSides; ++j) {
+        int c = bdryCellLocIds_[i][j].size();
+        int f = basisPtrVel_->getCardinality();
+        bdryCellDofValues_[i][j] = scalar_view("bdryCellDofValues", c, f, d);
+        scalar_view coords("coords", c, f, d);
+        if (c > 0) {
+          feVel_->computeDofCoords(coords, bdryCellNodes_[i][j]);
+        }
+        for (int k=0; k<c; ++k) {
+          for (int l=0; l<f; ++l) {
+            std::vector<Real> dofpoint(d);
+            //std::cout << "Sideset " << i << " LocalSide " << j << "  Cell " << k << "  Field " << l << "  Coord ";
+            for (int m=0; m<d; ++m) {
+              dofpoint[m] = coords(k, l, m);
+              //std::cout << dofpoint[m] << "  ";
+            }
+
+            for (int m=0; m<d; ++m) {
+              (bdryCellDofValues_[i][j])(k, l, m) = DirichletFunc(dofpoint, i, j, m);
+              //std::cout << "  " << m << "-Value " << DirichletFunc(dofpoint, i, j, m);
+            }
+            //std::cout << std::endl;
+          }
+        }
+      }
+    }
+  }
+
+  Real viscosityFunc(const std::vector<Real> & coords) const {
+    return viscosity_;
+  }
+
+  void computeViscosity(scalar_view &nu) const {
+    int c = feVel_->gradN().extent_int(0);
+    int p = feVel_->gradN().extent_int(2);
+    int d = feVel_->gradN().extent_int(3);
+    std::vector<Real> pt(d);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for ( int k = 0; k < d; ++k) {
+          pt[k] = (feVel_->cubPts())(i,j,k);
+        }
+        // Compute spatially dependent viscosity
+        nu(i,j) = viscosityFunc(pt);
+      }
+    }
+  }
+
+  void multiplyByRadius(scalar_view &input, bool isField, bool useReciprocal = false) const {
+    int c = feVel_->gradN().extent_int(0);
+    int p = feVel_->gradN().extent_int(2);
+    scalar_view r("r",c,p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        if (!useReciprocal) r(i,j) = (feVel_->cubPts())(i,j,0);
+        else                r(i,j) = 1/(feVel_->cubPts())(i,j,0);
+      }
+    }
+    scalar_view in(input);
+    if (!isField)
+      fst::scalarMultiplyDataData(input,r,in);
+    else
+      fst::scalarMultiplyDataField(input,r,in);
+  }
+
+public:
+  PDE_NavierStokes(ROL::ParameterList &parlist) {
+    // Finite element fields.
+    int cubDegree     = parlist.sublist("Problem").get("Cubature Degree",4);
+    int bdryCubDegree = parlist.sublist("Problem").get("Boundary Cubature Degree",2);
+    int probDim       = parlist.sublist("Problem").get("Problem Dimension",2);
+    int basisDegCtrl  = parlist.sublist("Problem").get("Density Basis Degree",1);
+    std::string elemType = parlist.sublist("Problem").get("Element Type","QUAD");
+    TEUCHOS_TEST_FOR_EXCEPTION(probDim!=2, std::invalid_argument,
+      ">>> rol/example/PDE-OPT/flow-opt/axisymmetric/models/brinkman/pde_brinkman.hpp: Problem dimension is not 2!");
+    if (elemType == "TRI") {
+      basisPtrVel_ = ROL::makePtr<Intrepid2::Basis_HGRAD_TRI_C2_FEM<DeviceType,Real,Real>>();
+      basisPtrPrs_ = ROL::makePtr<Intrepid2::Basis_HGRAD_TRI_C1_FEM<DeviceType,Real,Real>>();
+    }
+    else {
+      basisPtrVel_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C2_FEM<DeviceType,Real,Real>>();
+      basisPtrPrs_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C1_FEM<DeviceType,Real,Real>>();
+    }
+    basisPtrs_.clear();
+    basisPtrs_.resize(probDim,basisPtrVel_); // Velocity component
+    basisPtrs_.push_back(basisPtrPrs_);      // Pressure component
+    shards::CellTopology cellType = basisPtrs_[0]->getBaseCellTopology(); // get the cell type from any basis
+    if (basisDegCtrl == 1) {
+      if (elemType == "TRI")
+        basisPtrCtrl_ = ROL::makePtr<Intrepid2::Basis_HGRAD_TRI_C1_FEM<DeviceType,Real,Real>>();
+      else
+        basisPtrCtrl_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C1_FEM<DeviceType,Real,Real>>();
+    }
+    else
+      basisPtrCtrl_ = ROL::makePtr<Intrepid2::Basis_HVOL_C0_FEM<DeviceType,Real,Real>>(cellType);
+    basisPtrsCtrl_.clear();
+    basisPtrsCtrl_.push_back(basisPtrCtrl_); // Control component
+    // Quadrature rules.
+    Intrepid2::DefaultCubatureFactory cubFactory;                            // create cubature factory
+    cellCub_ = cubFactory.create<DeviceType,Real,Real>(cellType, cubDegree); // create default cubature
+
+    shards::CellTopology bdryCellType = cellType.getCellTopologyData(probDim-1, 0);
+    bdryCub_ = cubFactory.create<DeviceType,Real,Real>(bdryCellType, bdryCubDegree);
+
+    // Other problem parameters.
+    Real dynvisco = parlist.sublist("Problem").get("Dynamic Viscosity", 0.84e-8); // kg/mm-s
+    Real inRadius = parlist.sublist("Problem").get("Inlet Radius", 0.6875);       // mm
+    Real Reynolds = parlist.sublist("Problem").get("Reynolds Number", 50.0);
+    Patm_       = static_cast<Real>(101.325);
+    density_    = parlist.sublist("Problem").get("Fluid Density", 8.988e-11);     // kg/mm^3
+    viscosity_  = parlist.sublist("Problem").get("Effective Viscosity", 0.84e-8); // kg/mm-s
+    inVelocity_ = Reynolds * dynvisco / (static_cast<Real>(2) * inRadius * density_);
+    useStokes_  = parlist.sublist("Problem").get("Use Stokes", false);
+    useNoSlip_  = parlist.sublist("Problem").get("Use No Slip", true);
+    usePresOut_ = parlist.sublist("Problem").get("Use Pressure Outflow",false);
+    imp_        = ROL::makePtr<Impermeability<Real,DeviceType>>(parlist);
+
+    numDofs_ = 0;
+    numFields_ = basisPtrs_.size();
+    offset_.resize(numFields_);
+    numFieldDofs_.resize(numFields_);
+    for (int i=0; i<numFields_; ++i) {
+      offset_[i] = i==0 ? 0 : offset_[i-1] + basisPtrs_[i-1]->getCardinality();
+      numFieldDofs_[i] = basisPtrs_[i]->getCardinality();
+      numDofs_ += numFieldDofs_[i];
+    }
+    numDofsCtrl_ = 0;
+    numFieldsCtrl_ = basisPtrsCtrl_.size();
+    offsetCtrl_.resize(numFieldsCtrl_);
+    numFieldDofsCtrl_.resize(numFieldsCtrl_);
+    for (int i=0; i<numFieldsCtrl_; ++i) {
+      offsetCtrl_[i] = i==0 ? 0 : offsetCtrl_[i-1] + basisPtrsCtrl_[i-1]->getCardinality();
+      numFieldDofsCtrl_[i] = basisPtrsCtrl_[i]->getCardinality();
+      numDofsCtrl_ += numFieldDofsCtrl_[i];
+    }
+  }
+
+  void residual(scalar_view & res,
+                const scalar_view u_coeff,
+                const scalar_view z_coeff = scalar_view(),
+                const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fv = feVel_->gradN().extent_int(1);
+    const int fp = fePrs_->gradN().extent_int(1);
+    const int p  = feVel_->gradN().extent_int(2);
+    const int d  = feVel_->gradN().extent_int(3);
+    // Initialize residuals.
+    std::vector<scalar_view> R;
+    R.resize(numFields_);
+    for (int i = 0; i < d; ++i)
+      R[i] = scalar_view("res", c, fv);
+    R[d] = scalar_view("res", c, fp);
+    // Split coefficients into components.
+    std::vector<scalar_view> U, Z;
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoCtrl_);
+    // Evaluate/interpolate finite element fields on cells.
+    std::vector<scalar_view> nuGradVel(d), valVel(d), gradVel(d), valVelDotgradVel(d), alphaVel(d);
+    scalar_view nu("nu", c, p);
+    scalar_view valVel0("valVel0", c, p, d);
+    scalar_view valPres("valPres", c, p);
+    scalar_view divVel("divVel", c, p);
+    scalar_view valCtrl("valCtrl", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view nuVel0("nuVel0", c, p);
+    computeViscosity(nu);
+    feCtrl_->evaluateValue(valCtrl, Z[0]);
+    imp_->compute(alpha, valCtrl, feVel_->cubPts(), 0);
+    for (int i = 0; i < d; ++i) {
+      nuGradVel[i]        = scalar_view("nuGradVel", c, p, d);
+      valVel[i]           = scalar_view("valVel", c, p);
+      gradVel[i]          = scalar_view("gradVel", c, p, d);
+      valVelDotgradVel[i] = scalar_view("valVelDotgradVel", c, p);
+      alphaVel[i]         = scalar_view("alphaVel", c, p);
+      // Evaluate on FE basis
+      feVel_->evaluateValue(valVel[i], U[i]);
+      feVel_->evaluateGradient(gradVel[i], U[i]);
+      // Multiply velocity gradients with viscosity.
+      fst::tensorMultiplyDataData(nuGradVel[i], nu, gradVel[i]);
+      // Multiply velocity with alpha
+      fst::scalarMultiplyDataData(alphaVel[i], alpha, valVel[i]);
+      // Assemble the velocity vector and its divergence.
+      for (int j = 0; j < c; ++j) {
+        for (int k = 0; k < p; ++k) {
+          valVel0(j,k,i) = (valVel[i])(j,k);
+          divVel(j,k)   += (gradVel[i])(j,k,i);
+        }
+      }
+    }
+    fst::scalarMultiplyDataData(nuVel0, nu, valVel[0]);
+    // Compute nonlinear terms in the Navier-Stokes equations.
+    for (int i = 0; i < d; ++i) {
+      fst::dotMultiplyDataData(valVelDotgradVel[i], valVel0, gradVel[i]);
+      rst::scale(valVelDotgradVel[i], density_);
+    }
+    // Negative pressure
+    fePrs_->evaluateValue(valPres, U[d]);
+    rst::scale(valPres, static_cast<Real>(-1));
+    /*** Evaluate weak form of the residual. ***/
+    // Integrate velocity equation.
+    fst::integrate(R[0],valPres,feVel_->NdetJ(),false);
+    multiplyByRadius(nuVel0,false,true);
+    fst::integrate(R[0],nuVel0,feVel_->NdetJ(),true);
+    multiplyByRadius(valPres,false);
+    for (int i = 0; i < d; ++i) {
+      multiplyByRadius(nuGradVel[i],false);
+      fst::integrate(R[i],nuGradVel[i],feVel_->gradNdetJ(),true);
+      if (!useStokes_) {
+        multiplyByRadius(valVelDotgradVel[i],false);
+        fst::integrate(R[i],valVelDotgradVel[i],feVel_->NdetJ(),true);
+      }
+      fst::integrate(R[i],valPres,feVel_->DNDdetJ(i),true);
+      multiplyByRadius(alphaVel[i],false);
+      fst::integrate(R[i],alphaVel[i],feVel_->NdetJ(),true);
+    }
+    // Integrate pressure equation.
+    multiplyByRadius(divVel,false);
+    fst::integrate(R[d],divVel,fePrs_->NdetJ(),false);
+    fst::integrate(R[d],valVel[0],fePrs_->NdetJ(),true);
+    rst::scale(R[d], static_cast<Real>(-1));
+    // Boundary conditions.
+    applyDirichletToResidual(R,U);
+    // Combine the residuals.
+    FieldUtils::combineFieldCoeff<Real>(res, R, fieldInfo_);
+  }
+
+  void Jacobian_1(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fv = feVel_->gradN().extent_int(1);
+    const int fp = fePrs_->gradN().extent_int(1);
+    const int p  = feVel_->gradN().extent_int(2);
+    const int d  = feVel_->gradN().extent_int(3);
+    // Initialize Jacobians.
+    std::vector<std::vector<scalar_view>> J(numFields_);
+    J[d].resize(numFields_);
+    for (int i = 0; i < d; ++i) {
+      J[i].resize(numFields_);
+      for (int j = 0; j < d; ++j)
+        J[i][j] = scalar_view("jac", c, fv, fv);
+      J[i][d] = scalar_view("jac", c, fv, fp);
+      J[d][i] = scalar_view("jac", c, fp, fv);
+    }
+    J[d][d] = scalar_view("jac", c, fp, fp);
+    // Split coefficients into components.
+    std::vector<scalar_view> U, Z;
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoCtrl_);
+    // Evaluate/interpolate finite element fields on cells.
+    std::vector<scalar_view> valVel(d), gradVel(d);
+    std::vector<std::vector<scalar_view>> dVel(d), dVelPhi(d);
+    scalar_view nu("nu", c, p);
+    scalar_view nuGradPhi("nuGradPhi", c, fv, p, d);
+    scalar_view valCtrl("valCtrl", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view alphaPhi("alphaPhi", c, fv, p);
+    scalar_view valVel0("valVel0", c, p, d);
+    scalar_view valVelDotgradPhi("valVelDotgradPhi", c, fv, p);
+    scalar_view nuNVel0("nuNVel0", c, fv, p);
+    scalar_view posPrsN("posPrsN", c, fp, p);
+    scalar_view negPrsN("negPrsN", c, fp, p);
+    computeViscosity(nu);
+    feCtrl_->evaluateValue(valCtrl, Z[0]);
+    imp_->compute(alpha, valCtrl, feVel_->cubPts(), 0);
+    for (int i = 0; i < d; ++i) {
+      valVel[i]           = scalar_view("valVel", c, p);
+      gradVel[i]          = scalar_view("gradVel", c, p, d);
+      feVel_->evaluateValue(valVel[i], U[i]);
+      feVel_->evaluateGradient(gradVel[i], U[i]);
+      for (int j = 0; j < c; ++j) {
+        for (int k = 0; k < p; ++k)
+          valVel0(j,k,i) = (valVel[i])(j,k);
+      }
+      dVel[i].resize(d);
+      dVelPhi[i].resize(d);
+      for (int j = 0; j < d; ++j) {
+        dVel[i][j]    = scalar_view("dVel", c, p);
+        dVelPhi[i][j] = scalar_view("dVelPhi", c, fv, p);
+        for (int k = 0; k < c; ++k) {
+          for (int l = 0; l < p; ++l)
+            (dVel[i][j])(k,l) = (gradVel[i])(k,l,j);
+        }
+      }
+    }
+    fst::scalarMultiplyDataField(nuNVel0, nu, feVel_->N());
+    // Multiply velocity gradients with viscosity.
+    fst::tensorMultiplyDataField(nuGradPhi, nu, feVel_->gradN());
+    // Compute nonlinear terms in the Navier-Stokes equations.
+    fst::dotMultiplyDataField(valVelDotgradPhi, valVel0, feVel_->gradN());
+    rst::scale(valVelDotgradPhi, density_);
+    for (int i = 0; i < d; ++i) {
+      for (int j = 0; j < d; ++j) {
+        fst::scalarMultiplyDataField(dVelPhi[i][j], dVel[i][j], feVel_->N());
+        rst::scale(dVelPhi[i][j], density_);
+      }
+    }
+    // Multiply velocity with alpha
+    fst::scalarMultiplyDataField(alphaPhi, alpha, feVel_->N());
+    // Negative pressure basis.
+    Kokkos::deep_copy(posPrsN,fePrs_->NdetJ());
+    Kokkos::deep_copy(negPrsN,fePrs_->N());
+    rst::scale(negPrsN, static_cast<Real>(-1));
+    /*** Evaluate weak form of the Jacobian. ***/
+    fst::integrate(J[0][d],feVel_->NdetJ(),negPrsN,false);
+    multiplyByRadius(nuNVel0,true,true);
+    fst::integrate(J[0][0],nuNVel0,feVel_->NdetJ(),false);
+    fst::integrate(J[d][0],fePrs_->N(),feVel_->NdetJ(),false);
+    multiplyByRadius(nuGradPhi,true);
+    multiplyByRadius(valVelDotgradPhi,true);
+    multiplyByRadius(alphaPhi,true);
+    multiplyByRadius(negPrsN,true);
+    multiplyByRadius(posPrsN,true);
+    for (int i = 0; i < d; ++i) {
+      // Velocity equation.
+      fst::integrate(J[i][i],nuGradPhi,feVel_->gradNdetJ(),true);
+      fst::integrate(J[i][i],alphaPhi,feVel_->NdetJ(),true);
+      if (!useStokes_) {
+        fst::integrate(J[i][i],feVel_->NdetJ(),valVelDotgradPhi,true);
+        for (int j = 0; j < d; ++j) {
+          multiplyByRadius(dVelPhi[i][j],true);
+          fst::integrate(J[i][j],feVel_->NdetJ(),dVelPhi[i][j],true);
+        }
+      }
+      fst::integrate(J[i][d],feVel_->DNDdetJ(i),negPrsN,true);
+      // Pressure equation.
+      fst::integrate(J[d][i],posPrsN,feVel_->DND(i),true);
+      rst::scale(J[d][i], static_cast<Real>(-1));
+    }
+    // APPLY DIRICHLET CONDITIONS
+    applyDirichletToJacobian1(J);
+    // Combine the jacobians.
+    FieldUtils::combineFieldCoeff(jac, J, fieldInfo_, fieldInfo_);
+  }
+
+
+  void Jacobian_2(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fv = feVel_->gradN().extent_int(1);
+    const int fp = fePrs_->gradN().extent_int(1);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int p  = feVel_->gradN().extent_int(2);
+    const int d  = feVel_->gradN().extent_int(3);
+    // Initialize Jacobians.
+    std::vector<std::vector<scalar_view>> J(numFields_);
+    for (int i = 0; i < d; ++i) {
+      J[i].resize(1);
+      J[i][0] = scalar_view("jac", c, fv, fc);
+    }
+    J[d].resize(1);
+    J[d][0] = scalar_view("jac", c, fp, fc);
+    // Split coefficients into components.
+    std::vector<scalar_view> U, Z;
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoCtrl_);
+    // Evaluate on FE basis.
+    scalar_view valCtrl("valCtrl", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view alphaPhi("alphaPhi", c, fc, p);
+    scalar_view valVel("valVel", c, p);
+    scalar_view alphaU("alphaU", c, fc, p);
+    feCtrl_->evaluateValue(valCtrl, Z[0]);
+    imp_->compute(alpha, valCtrl, feVel_->cubPts(), 1);
+    fst::scalarMultiplyDataField(alphaPhi, alpha, feCtrl_->N());
+    for (int i = 0; i < d; ++i) {
+      Kokkos::deep_copy(alphaU,static_cast<Real>(0));
+      Kokkos::deep_copy(valVel,static_cast<Real>(0));
+      feVel_->evaluateValue(valVel, U[i]);
+      // Multiply velocity with alpha
+      fst::scalarMultiplyDataField(alphaU, valVel, alphaPhi);
+      // Integrate
+      multiplyByRadius(alphaU,true);
+      fst::integrate(J[i][0],feVel_->NdetJ(),alphaU,false);
+    }
+    // APPLY DIRICHLET CONDITIONS
+    applyDirichletToJacobian2(J);
+    // Combine the jacobians.
+    FieldUtils::combineFieldCoeff<Real>(jac, J, fieldInfo_, fieldInfoCtrl_);
+  }
+
+  void Jacobian_3(std::vector<scalar_view> & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> Jacobian_3 is zero!");
+  }
+
+  void Hessian_11(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    if (!useStokes_) {
+      // Retrieve dimensions.
+      const int c  = feVel_->gradN().extent_int(0);
+      const int fv = feVel_->gradN().extent_int(1);
+      const int fp = fePrs_->gradN().extent_int(1);
+      const int p  = feVel_->gradN().extent_int(2);
+      const int d  = feVel_->gradN().extent_int(3);
+      // Initialize hessians.
+      std::vector<std::vector<scalar_view>> H;
+      H.resize(numFields_);
+      H[d].resize(numFields_);
+      for (int i = 0; i < d; ++i) {
+        H[i].resize(numFields_);
+        for (int j = 0; j < d; ++j)
+          H[i][j] = scalar_view("hess", c, fv, fv);
+        H[i][d] = scalar_view("hess", c, fv, fp);
+        H[d][i] = scalar_view("hess", c, fp, fv);
+      }
+      H[d][d] = scalar_view("hess", c, fp, fp);
+      // Split coefficients into components.
+      std::vector<scalar_view> L;
+      FieldUtils::splitFieldCoeff<Real>(L, l_coeff, fieldInfo_);
+      // Apply Dirichlet conditions
+      applyDirichletToMultiplier(L);
+      // Evaluate/interpolate finite element fields on cells.
+      std::vector<scalar_view> Lval(d), LPhi(d);
+      for (int i = 0; i < d; ++i) {
+        Lval[i] = scalar_view("Lval", c, p);
+        LPhi[i] = scalar_view("LPhi", c, fv, p);
+        feVel_->evaluateValue(Lval[i], L[i]);
+        fst::scalarMultiplyDataField(LPhi[i], Lval[i], feVel_->N());
+        rst::scale(LPhi[i], density_);
+        multiplyByRadius(LPhi[i],true);
+      }
+      /*** Evaluate weak form of the Hessian. ***/
+      for (int i = 0; i < d; ++i) {
+        fst::integrate(H[i][i],LPhi[i],feVel_->DNDdetJ(i),false);
+        fst::integrate(H[i][i],feVel_->DNDdetJ(i),LPhi[i],true);
+        for (int j = 0; j < d; ++j) {
+          fst::integrate(H[i][j],feVel_->DNDdetJ(j),LPhi[i],false);
+          fst::integrate(H[i][j],LPhi[j],feVel_->DNDdetJ(i),true);
+        }
+      }
+      // Combine the Hessians.
+      FieldUtils::combineFieldCoeff<Real>(hess, H, fieldInfo_, fieldInfo_);
+    }
+    else {
+      throw Exception::Zero(">>> Hessian_11 is zero!");
+    }
+  }
+
+  void Hessian_12(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fv = feVel_->gradN().extent_int(1);
+    const int fp = fePrs_->gradN().extent_int(1);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int p  = feVel_->gradN().extent_int(2);
+    const int d  = feVel_->gradN().extent_int(3);
+    // Initialize hessians.
+    std::vector<std::vector<scalar_view>> H(1);
+    H[0].resize(numFields_);
+    for (int i = 0; i < d; ++i)
+      H[0][i] = scalar_view("hess", c, fc, fv);
+    H[0][d] = scalar_view("hess", c, fc, fp);
+    // Split coefficients into components.
+    std::vector<scalar_view> L, Z;
+    FieldUtils::splitFieldCoeff<Real>(L, l_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoCtrl_);
+    // Apply Dirichlet conditions
+    applyDirichletToMultiplier(L);
+    // Evaluate/interpolate finite element fields on cells.
+    scalar_view Z0("Z0", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view Lval("Lval", c, p);
+    scalar_view alphaL("alphaL", c, fc, p);
+    feCtrl_->evaluateValue(Z0, Z[0]);
+    imp_->compute(alpha, Z0, feVel_->cubPts(), 1);
+    /*** Evaluate weak form of the Hessian. ***/
+    // Velocity equation.
+    for (int i = 0; i < d; ++i) {
+      // Multiply velocity with alpha
+      Kokkos::deep_copy(Lval, static_cast<Real>(0));
+      Kokkos::deep_copy(alphaL, static_cast<Real>(0));
+      feVel_->evaluateValue(Lval, L[i]);
+      for (int j = 0; j < c; ++j) {
+        for (int k = 0; k < fc; ++k) {
+          for (int l = 0; l < p; ++l) {
+            alphaL(j,k,l) = alpha(j,l) * (feCtrl_->N())(j,k,l) * Lval(j,l);
+          }
+        }
+      }
+      multiplyByRadius(alphaL,true);
+      fst::integrate(H[0][i],alphaL,feVel_->NdetJ(),false);
+    }
+    // Combine the Hessians.
+    FieldUtils::combineFieldCoeff<Real>(hess, H, fieldInfoCtrl_, fieldInfo_);
+  }
+
+  void Hessian_13(std::vector<scalar_view> & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> Hessian_13 is zero!");
+  }
+
+  void Hessian_21(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fv = feVel_->gradN().extent_int(1);
+    const int fp = fePrs_->gradN().extent_int(1);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int p  = feVel_->gradN().extent_int(2);
+    const int d  = feVel_->gradN().extent_int(3);
+    // Initialize hessians.
+    std::vector<std::vector<scalar_view>> H(numFields_);
+    for (int i = 0; i < d; ++i) {
+      H[i].resize(1);
+      H[i][0] = scalar_view("hess", c, fv, fc);
+    }
+    H[d].resize(1);
+    H[d][0] = scalar_view("hess", c, fp, fc);
+    // Split coefficients into components.
+    std::vector<scalar_view> L, Z;
+    FieldUtils::splitFieldCoeff<Real>(L, l_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoCtrl_);
+    // Apply Dirichlet conditions
+    applyDirichletToMultiplier(L);
+    // Evaluate/interpolate finite element fields on cells.
+    scalar_view Z0("Z0", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view Lval("Lval", c, p);
+    scalar_view alphaL("alphaL", c, fc, p);
+    feCtrl_->evaluateValue(Z0, Z[0]);
+    imp_->compute(alpha, Z0, feVel_->cubPts(), 1);
+    /*** Evaluate weak form of the Hessian. ***/
+    // Velocity equation.
+    for (int i = 0; i < d; ++i) {
+      // Multiply velocity with alpha
+      Kokkos::deep_copy(Lval, static_cast<Real>(0));
+      Kokkos::deep_copy(alphaL, static_cast<Real>(0));
+      feVel_->evaluateValue(Lval, L[i]);
+      for (int j = 0; j < c; ++j) {
+        for (int k = 0; k < fc; ++k) {
+          for (int l = 0; l < p; ++l) {
+            alphaL(j,k,l) = alpha(j,l) * (feCtrl_->N())(j,k,l) * Lval(j,l);
+          }
+        }
+      }
+      multiplyByRadius(alphaL,true);
+      fst::integrate(H[i][0],feVel_->NdetJ(),alphaL,false);
+    }
+    // Combine the Hessians.
+    FieldUtils::combineFieldCoeff<Real>(hess, H, fieldInfo_, fieldInfoCtrl_);
+  }
+
+  void Hessian_22(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int p  = feVel_->gradN().extent_int(2);
+    const int d  = feVel_->gradN().extent_int(3);
+    // Initialize hessians.
+    std::vector<std::vector<scalar_view>> H(1);
+    H[0].resize(1);
+    H[0][0] = scalar_view("hess", c, fc, fc);
+    // Split coefficients into components.
+    std::vector<scalar_view> L, U, Z;
+    FieldUtils::splitFieldCoeff<Real>(L, l_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoCtrl_);
+    // Apply Dirichlet conditions
+    applyDirichletToMultiplier(L);
+    // Evaluate/interpolate finite element fields on cells.
+    std::vector<scalar_view> Lval(d), Uval(d);
+    scalar_view Z0("Z0", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view alphaUL("alphaUL", c, fc, p);
+    for (int i = 0; i < d; ++i) {
+      Lval[i] = scalar_view("Lval", c, p);
+      Uval[i] = scalar_view("Uval", c, p);
+      feVel_->evaluateValue(Lval[i], L[i]);
+      feVel_->evaluateValue(Uval[i], U[i]);
+    }
+    feCtrl_->evaluateValue(Z0, Z[0]);
+    imp_->compute(alpha, Z0, feVel_->cubPts(), 2);
+    // Multiply velocity with alpha
+    Real dot(0);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < fc; ++j) {
+        for (int k = 0; k < p; ++k) {
+          dot = static_cast<Real>(0);
+          for (int l = 0; l < d; ++l) {
+            dot += (Uval[l])(i,k) * (Lval[l])(i,k);
+          }
+          alphaUL(i,j,k) = alpha(i,k) * (feCtrl_->N())(i,j,k) * dot;
+        }
+      }
+    }
+    /*** Evaluate weak form of the Hessian. ***/
+    // Velocity equation.
+    multiplyByRadius(alphaUL,true);
+    fst::integrate(H[0][0],alphaUL,feCtrl_->NdetJ(),false);
+    // Combine hessians.
+    FieldUtils::combineFieldCoeff<Real>(hess, H, fieldInfoCtrl_, fieldInfoCtrl_);
+  }
+
+  void Hessian_23(std::vector<scalar_view> & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> Hessian_23 is zero!");
+  }
+
+  void Hessian_31(std::vector<scalar_view> & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> Hessian_31 is zero!");
+  }
+
+  void Hessian_32(std::vector<scalar_view> & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> Hessian_32 is zero!");
+  }
+
+  void Hessian_33(std::vector<std::vector<scalar_view>> & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> Hessian_33 is zero!");
+  }
+
+  void RieszMap_1(scalar_view & riesz) override {
+    // Retrieve dimensions.
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fv = feVel_->gradN().extent_int(1);
+    const int fp = fePrs_->gradN().extent_int(1);
+    const int p  = feVel_->gradN().extent_int(2);
+    const int d  = feVel_->gradN().extent_int(3);
+    // Initialize Riesz maps.
+    std::vector<std::vector<scalar_view>> J(numFields_);
+    J[d].resize(numFields_);
+    for (int i = 0; i < d; ++i) {
+      J[i].resize(numFields_);
+      for (int j = 0; j < d; ++j)
+        J[i][j] = scalar_view("riesz", c, fv, fv);
+      J[i][d] = scalar_view("riesz", c, fv, fp);
+      J[d][i] = scalar_view("riesz", c, fp, fv);
+    }
+    J[d][d] = scalar_view("riesz", c, fp, fp);
+    // Evaluate/interpolate finite element fields on cells.
+    scalar_view valVel("valVel", c, fv, p);
+    scalar_view gradVel("gradVel", c, fv, p, d);
+    scalar_view valPrs("valPrs", c, fp, p);
+    Kokkos::deep_copy(valVel, feVel_->N());
+    Kokkos::deep_copy(gradVel, feVel_->gradN());
+    Kokkos::deep_copy(valPrs, fePrs_->N());
+    /*** Evaluate weak form of the Jacobian. ***/
+    multiplyByRadius(gradVel,true);
+    multiplyByRadius(valVel,true);
+    multiplyByRadius(valPrs,true);
+    for (int i = 0; i < d; ++i) {
+      fst::integrate(J[i][i],gradVel,feVel_->gradNdetJ(),false);
+      fst::integrate(J[i][i],valVel,feVel_->NdetJ(),true);
+    }
+    fst::integrate(J[d][d],valPrs,fePrs_->NdetJ(),false);
+    // Combine Riesz maps.
+    FieldUtils::combineFieldCoeff<Real>(riesz, J, fieldInfo_, fieldInfo_);
+  }
+
+  void RieszMap_2(scalar_view & riesz) override {
+    // Retrieve dimensions.
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int p  = feCtrl_->gradN().extent_int(2);
+    // Initialize residuals.
+    std::vector<std::vector<scalar_view>> J(1);
+    J[0].resize(1);
+    J[0][0] = scalar_view("riesz", c, fc, fc);
+    // Evaluate on FE basis.
+    scalar_view valPsi("valPsi", c, fc, p);
+    Kokkos::deep_copy(valPsi, feCtrl_->N());
+    /*** Evaluate weak form of the residual. ***/
+    multiplyByRadius(valPsi,true);
+    fst::integrate(J[0][0],valPsi,feCtrl_->NdetJ(),false);
+    // Combine Riesz maps.
+    FieldUtils::combineFieldCoeff<Real>(riesz, J, fieldInfoCtrl_, fieldInfoCtrl_);
+  }
+
+  std::vector<basis_ptr> getFields() override {
+    return basisPtrs_;
+  }
+
+  std::vector<basis_ptr> getFields2() override {
+    return basisPtrsCtrl_;
+  }
+
+  void setCellNodes(const scalar_view &volCellNodes,
+                    const std::vector<std::vector<scalar_view>> &bdryCellNodes,
+                    const std::vector<std::vector<std::vector<int>>> &bdryCellLocIds) override {
+    volCellNodes_ = volCellNodes;
+    bdryCellNodes_ = bdryCellNodes;
+    bdryCellLocIds_ = bdryCellLocIds;
+    // Finite element definition.
+    feVel_  = ROL::makePtr<fe_type>(volCellNodes_,basisPtrVel_,cellCub_);
+    fePrs_  = ROL::makePtr<fe_type>(volCellNodes_,basisPtrPrs_,cellCub_);
+    feCtrl_ = ROL::makePtr<fe_type>(volCellNodes_,basisPtrCtrl_,cellCub_,false);
+    fvidx_  = feVel_->getBoundaryDofs();
+    fpidx_  = fePrs_->getBoundaryDofs();
+    // Construct control boundary FE
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      feVelBdry_.resize(numSideSets);
+      for (int i = 0; i < numSideSets; ++i) {
+        int numLocSides = bdryCellNodes[i].size();
+        feVelBdry_[i].resize(numLocSides);
+        for (int j = 0; j < numLocSides; ++j) {
+          if (bdryCellNodes[i][j] != scalar_view()) {
+            feVelBdry_[i][j] = ROL::makePtr<fe_type>(bdryCellNodes[i][j],basisPtrVel_,bdryCub_,j);
+          }
+        }
+      }
+    }
+    computeDirichlet();
+  }
+
+  void setFieldPattern(const std::vector<std::vector<int>> &fieldPattern,
+                       const std::vector<std::vector<int>> &fieldPattern2) override {
+    fieldPattern_ = fieldPattern;
+    fieldInfo_    = ROL::makePtr<FieldUtils::FieldInfo>(numFields_,numDofs_,numFieldDofs_,fieldPattern_);
+    fieldPatternCtrl_ = fieldPattern2;
+    fieldInfoCtrl_    = ROL::makePtr<FieldUtils::FieldInfo>(numFieldsCtrl_,numDofsCtrl_,numFieldDofsCtrl_,fieldPatternCtrl_);
+    fieldHelper_ = ROL::makePtr<FieldHelper<Real,DeviceType>>(numFields_, numDofs_, numFieldDofs_, fieldPattern_);
+  }
+
+  const ROL::Ptr<fe_type> getVelocityFE(void) const {
+    return feVel_;
+  }
+
+  const ROL::Ptr<fe_type> getPressureFE(void) const {
+    return fePrs_;
+  }
+
+  const ROL::Ptr<fe_type> getControlFE(void) const {
+    return feCtrl_;
+  }
+
+  const std::vector<ROL::Ptr<fe_type>> getVelocityBdryFE(const int sideset = -1) const {
+    int side = sideset;
+    if ( sideset < 0 || sideset > 4 ) {
+      //side = (useDirichletControl_) ? 6 : 10;
+      side = 0;
+    }
+    return feVelBdry_[side];
+  }
+
+  const std::vector<std::vector<int>> getBdryCellLocIds(const int sideset = -1) const {
+    int side = sideset;
+    if ( sideset < 0 || sideset > 4 ) {
+      //side = (useDirichletControl_) ? 6 : 10;
+      side = 0;
+    }
+    return bdryCellLocIds_[side];
+  }
+
+  const ROL::Ptr<FieldHelper<Real,DeviceType>> getFieldHelper(void) const {
+    return fieldHelper_;
+  }
+
+  const ROL::Ptr<const FieldUtils::FieldInfo> getStateFieldInfo(void) const {
+    return fieldInfo_;
+  }
+
+  const ROL::Ptr<const FieldUtils::FieldInfo> getControlFieldInfo(void) const {
+    return fieldInfoCtrl_;
+  }
+
+  void printData(std::string tag,
+                 const scalar_view u_coeff,
+                 const scalar_view z_coeff = scalar_view(),
+                 const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c = fePrs_->gradN().extent_int(0);
+    const int p = fePrs_->gradN().extent_int(2);
+    const int d = fePrs_->gradN().extent_int(3);
+    // Evaluate/interpolate finite element fields on cells.
+    scalar_view valCtrl("valCtrl", c, p);
+    scalar_view alpha("alpha", c, p);
+    feCtrl_->evaluateValue(valCtrl, z_coeff);
+    imp_->compute(alpha, valCtrl, fePrs_->cubPts(), 0);
+    // Print to permeability file
+    std::stringstream namePer;
+    namePer << "permeability" << tag << ".txt";
+    std::ofstream filePer;
+    filePer.open(namePer.str());
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        filePer << std::scientific << std::setprecision(15);
+        for (int k = 0; k < d; ++k) filePer << std::setw(25) << (fePrs_->cubPts())(i,j,k);
+        filePer << std::setw(25) << viscosity_/alpha(i,j);
+        filePer << std::endl;
+      }
+    }
+    filePer.close();
+  }
+
+private:
+
+  void applyDirichletToResidual(std::vector<scalar_view> &R,
+                          const std::vector<scalar_view> &U) const {
+    const int d = feVel_->gradN().extent_int(3);
+    const int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      // APPLY DIRICHLET CONDITIONS
+      for (int i = 0; i < numSideSets; ++i) {
+        if (i==0) /* in flow */ {
+          const int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            const int numCellsSide = bdryCellLocIds_[i][j].size();
+            const int numBdryDofs = fvidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              const int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                for (int m=0; m < d; ++m) {
+                  (R[m])(cidx,fvidx_[j][l]) = (U[m])(cidx,fvidx_[j][l]) - (bdryCellDofValues_[i][j])(k,fvidx_[j][l],m);
+                }
+              }
+            }
+          }
+        }
+        else if (i==1 && useNoSlip_) /* no slip */ {
+          const int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            const int numCellsSide = bdryCellLocIds_[i][j].size();
+            const int numBdryDofs = fvidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              const int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                for (int m=0; m < d; ++m) {
+                  (R[m])(cidx,fvidx_[j][l]) = (U[m])(cidx,fvidx_[j][l]);
+                }
+              }
+            }
+          }
+        }
+        else if (i==2 && usePresOut_) /* out flow */ {
+          const int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            const int numCellsSide = bdryCellLocIds_[i][j].size();
+            const int numBdryDofs = fpidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              const int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                (R[d])(cidx,fpidx_[j][l]) = (U[d])(cidx,fpidx_[j][l]) - Patm_;
+              }
+            }
+          }
+        }
+        else if (i==3) /* no normal */ { 
+          const int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            const int numCellsSide = bdryCellLocIds_[i][j].size();
+            const int numBdryDofs = fvidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              const int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                (R[0])(cidx,fvidx_[j][l]) = (U[0])(cidx,fvidx_[j][l]);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  void applyDirichletToJacobian1(std::vector<std::vector<scalar_view>> &J) const {
+    const int fv = feVel_->gradN().extent_int(1);
+    const int fp = fePrs_->gradN().extent_int(1);
+    const int d  = feVel_->gradN().extent_int(3);
+    const int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        if (i==0) /* in flow */ {
+          const int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            const int numCellsSide = bdryCellLocIds_[i][j].size();
+            const int numBdryDofs = fvidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              const int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                for (int m=0; m < fv; ++m) {
+                  for (int n=0; n < d; ++n) {
+                    for (int p=0; p < d; ++p) {
+                      (J[n][p])(cidx,fvidx_[j][l],m) = static_cast<Real>(0);
+                    }
+                    (J[n][n])(cidx,fvidx_[j][l],fvidx_[j][l]) = static_cast<Real>(1);
+                  }
+                }
+                for (int m=0; m < fp; ++m) {
+                  for (int n=0; n < d; ++n) {
+                    (J[n][d])(cidx,fvidx_[j][l],m) = static_cast<Real>(0);
+                  }
+                }
+              }
+            }
+          }
+        }
+        else if (i==1 && useNoSlip_) /* no slip */ {
+          const int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            const int numCellsSide = bdryCellLocIds_[i][j].size();
+            const int numBdryDofs = fvidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              const int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                for (int m=0; m < fv; ++m) {
+                  for (int n=0; n < d; ++n) {
+                    for (int p=0; p < d; ++p) {
+                      (J[n][p])(cidx,fvidx_[j][l],m) = static_cast<Real>(0);
+                    }
+                    (J[n][n])(cidx,fvidx_[j][l],fvidx_[j][l]) = static_cast<Real>(1);
+                  }
+                }
+                for (int m=0; m < fp; ++m) {
+                  for (int n=0; n < d; ++n) {
+                    (J[n][d])(cidx,fvidx_[j][l],m) = static_cast<Real>(0);
+                  }
+                }
+              }
+            }
+          }
+        }
+        else if (i==2 && usePresOut_) /* out flow */ {
+          const int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            const int numCellsSide = bdryCellLocIds_[i][j].size();
+            const int numBdryDofs = fpidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              const int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                for (int m = 0; m < fv; ++m) {
+                  for (int p = 0; p < d; ++p) {
+                    (J[d][p])(cidx,fpidx_[j][l],m) = static_cast<Real>(0);
+                  }
+                }
+                for (int m = 0; m < fp; ++m) {
+                  (J[d][d])(cidx,fpidx_[j][l],m) = static_cast<Real>(0);
+                }
+                (J[d][d])(cidx,fpidx_[j][l],fpidx_[j][l]) = static_cast<Real>(1);
+              }
+            }
+          }
+        }
+        else if ( i==3 ) /* no normal */ {
+          const int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            const int numCellsSide = bdryCellLocIds_[i][j].size();
+            const int numBdryDofs = fvidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              const int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                for (int m=0; m < fv; ++m) {
+                  for (int p=0; p < d; ++p) {
+                    (J[0][p])(cidx,fvidx_[j][l],m) = static_cast<Real>(0);
+                  }
+                  (J[0][0])(cidx,fvidx_[j][l],fvidx_[j][l]) = static_cast<Real>(1);
+                }
+                for (int m=0; m < fp; ++m) {
+                  (J[0][d])(cidx,fvidx_[j][l],m) = static_cast<Real>(0);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  void applyDirichletToJacobian2(std::vector<std::vector<scalar_view>> &J) const {
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int d  = feVel_->gradN().extent_int(3);
+    const int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        if (i==0) /* in flow */ {
+          const int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            const int numCellsSide = bdryCellLocIds_[i][j].size();
+            const int numBdryDofs = fvidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              const int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                for (int n=0; n < d; ++n) {
+                  for (int m=0; m < fc; ++m) {
+                    (J[n][0])(cidx,fvidx_[j][l],m) = static_cast<Real>(0);
+                  }
+                }
+              }
+            }
+          }
+        }
+        else if (i==1 && useNoSlip_) /* no slip */ {
+          const int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            const int numCellsSide = bdryCellLocIds_[i][j].size();
+            const int numBdryDofs = fvidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              const int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                for (int n=0; n < d; ++n) {
+                  for (int m=0; m < fc; ++m) {
+                    (J[n][0])(cidx,fvidx_[j][l],m) = static_cast<Real>(0);
+                  }
+                }
+              }
+            }
+          }
+        }
+        else if (i==2 && usePresOut_) /* out flow */ {}
+        else if ( i==3 ) /* no normal */ {
+          const int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            const int numCellsSide = bdryCellLocIds_[i][j].size();
+            const int numBdryDofs = fvidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              const int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                for (int m=0; m < fc; ++m) {
+                  (J[0][0])(cidx,fvidx_[j][l],m) = static_cast<Real>(0);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  void applyDirichletToMultiplier(std::vector<scalar_view> &L) const {
+    const int d = feVel_->gradN().extent_int(3);
+    const int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        if (i==0) /* in flow */ {
+          const int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            const int numCellsSide = bdryCellLocIds_[i][j].size();
+            const int numBdryDofs = fvidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              const int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                for (int m=0; m < d; ++m) {
+                  (L[m])(cidx,fvidx_[j][l]) = static_cast<Real>(0);
+                }
+              }
+            }
+          }
+        }
+        else if (i==1 && useNoSlip_) /* no slip */ {
+          const int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            const int numCellsSide = bdryCellLocIds_[i][j].size();
+            const int numBdryDofs = fvidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              const int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                for (int m=0; m < d; ++m) {
+                  (L[m])(cidx,fvidx_[j][l]) = static_cast<Real>(0);
+                }
+              }
+            }
+          }
+        }
+        else if (i==2 && usePresOut_) /* out flow */ {
+          const int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            const int numCellsSide = bdryCellLocIds_[i][j].size();
+            const int numBdryDofs = fpidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              const int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                (L[d])(cidx,fpidx_[j][l]) = static_cast<Real>(0);
+              }
+            }
+          }
+        }
+        else if ( i==3 ) /* no normal */ {
+          const int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            const int numCellsSide = bdryCellLocIds_[i][j].size();
+            const int numBdryDofs = fvidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              const int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                (L[0])(cidx,fvidx_[j][l]) = static_cast<Real>(0);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+}; // PDE_NavierStokes
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/darcy/CMakeLists.txt b/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/darcy/CMakeLists.txt
index e6c5d7675e02..f3e83e6e1742 100644
--- a/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/darcy/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/darcy/CMakeLists.txt
@@ -11,13 +11,13 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_01
     SOURCES example_01.cpp
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     ProjectAxisymmetricDarcyDataCopy
     SOURCE_FILES
       input.xml
@@ -27,5 +27,34 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
   )
 
+ENDIF()
+
+IF(${PROJECT_NAME}_ENABLE_Intrepid2 AND
+   ${PROJECT_NAME}_ENABLE_Ifpack2   AND
+   ${PROJECT_NAME}_ENABLE_MueLu     AND
+   ${PROJECT_NAME}_ENABLE_Amesos2   AND
+   ${PROJECT_NAME}_ENABLE_Tpetra )
+
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/teuchos/src/sol)
+
+  # Need ROL_TpetraMultiVector.hpp
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
+
+  ROL_ADD_EXECUTABLE(
+    example_01_Kokkos
+    SOURCES example_01K.cpp
+    ADD_DIR_TO_NAME
+  )
+
+  ROL_COPY_FILES_TO_BINARY_DIR(
+    ProjectAxisymmetricDarcyDataCopyK
+    SOURCE_FILES
+      input.xml
+      plotDarcy.m
+      plotmesh.m
+    SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}"
+    DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
+  )
 
 ENDIF()
diff --git a/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/darcy/example_01.cpp b/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/darcy/example_01.cpp
index c7d411898744..7680be62601e 100644
--- a/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/darcy/example_01.cpp
+++ b/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/darcy/example_01.cpp
@@ -13,8 +13,7 @@
 
 #include "Teuchos_Comm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -47,7 +46,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int>> comm
     = Tpetra::getDefaultComm();
   const int myRank = comm->getRank();
diff --git a/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/darcy/example_01K.cpp b/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/darcy/example_01K.cpp
new file mode 100644
index 000000000000..cb4d5b5e17bc
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/darcy/example_01K.cpp
@@ -0,0 +1,231 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  example_01.cpp
+    \brief Shows how to solve the Darcy porosity optimization problem.
+*/
+
+#include "Teuchos_Comm.hpp"
+#include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
+
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Version.hpp"
+
+#include <iostream>
+#include <algorithm>
+
+#include "ROL_TpetraMultiVector.hpp"
+#include "ROL_Reduced_Objective_SimOpt.hpp"
+#include "ROL_Bounds.hpp"
+#include "ROL_Solver.hpp"
+#include "ROL_SingletonVector.hpp"
+#include "ROL_ConstraintFromObjective.hpp"
+
+#include "../../../../TOOLS/meshreaderK.hpp"
+#include "../../../../TOOLS/pdeconstraintK.hpp"
+#include "../../../../TOOLS/pdeobjectiveK.hpp"
+#include "../../../../TOOLS/pdevectorK.hpp"
+#include "../../../../TOOLS/integralconstraintK.hpp"
+
+#include "pde_darcyK.hpp"
+#include "obj_darcyK.hpp"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+
+int main(int argc, char *argv[]) {
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+
+  /*** Initialize communicator. ***/
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  Kokkos::ScopeGuard kokkosScope (argc, argv);
+  ROL::Ptr<const Teuchos::Comm<int>> comm
+    = Tpetra::getDefaultComm();
+  const int myRank = comm->getRank();
+  if ((iprint > 0) && (myRank == 0))
+    outStream = ROL::makePtrFromRef(std::cout);
+  else
+    outStream = ROL::makePtrFromRef(bhs);
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+
+    /*** Read in XML input ***/
+    std::string filename = "input.xml";
+    auto parlist = ROL::getParametersFromXmlFile(filename);
+
+    bool output = parlist->sublist("SimOpt").sublist("Solve").get("Output Iteration History",false);
+    output = (iprint > 0) && (myRank==0) && output;
+    parlist->sublist("SimOpt").sublist("Solve").set("Output Iteration History",output);
+
+    /*** Initialize main data structure. ***/
+    auto meshMgr = ROL::makePtr<MeshReader<RealT,DeviceT>>(*parlist);
+    // Initialize PDE describing Darcy equations.
+    auto pde = ROL::makePtr<PDE_Darcy<RealT,DeviceT>>(*parlist);
+    auto con = ROL::makePtr<PDE_Constraint<RealT,DeviceT>>(pde,meshMgr,comm,*parlist,*outStream);
+    // Cast the constraint and get the assembler.
+    auto assembler = con->getAssembler();
+    con->setSolveParameters(*parlist);
+
+    // Create state vectors
+    auto u_ptr = assembler->createStateVector(); u_ptr->randomize();
+    auto up = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(u_ptr,pde,assembler,*parlist);
+    auto p_ptr = assembler->createStateVector(); p_ptr->randomize();
+    auto pp = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(p_ptr,pde,assembler,*parlist);
+    auto r_ptr = assembler->createResidualVector(); r_ptr->putScalar(0.0);
+    auto rp = ROL::makePtr<PDE_DualSimVector<RealT,DeviceT>>(r_ptr,pde,assembler,*parlist);
+
+    // Create control vectors
+    bool useParamVar = parlist->sublist("Problem").get("Use Optimal Constant Velocity",false);
+    int dim = 2;
+    auto z_ptr = assembler->createControlVector(); z_ptr->randomize();
+    ROL::Ptr<std::vector<RealT>>            z0_ptr;
+    ROL::Ptr<ROL::StdVector<RealT>>         z0p;
+    ROL::Ptr<ROL::TpetraMultiVector<RealT>> z1p;
+    ROL::Ptr<ROL::Vector<RealT>>            zp;
+    if (useParamVar) {
+      z0_ptr = ROL::makePtr<std::vector<RealT>>(dim);
+      z0p = ROL::makePtr<ROL::StdVector<RealT>>(z0_ptr);
+      z1p = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(z_ptr,pde,assembler,*parlist);
+      zp = ROL::makePtr<PDE_OptVector<RealT>>(z1p,z0p,myRank);
+    }
+    else {
+      zp = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(z_ptr,pde,assembler,*parlist);
+    }
+
+    // Create combined vector
+    auto xp = ROL::makePtr<ROL::Vector_SimOpt<RealT>>(up,zp);
+
+    // Initialize quadratic objective function.
+    //auto qoi  = ROL::makePtr<QoI_Velocity_Darcy<RealT,DeviceT>>(*parlist,
+    //                                                            pde->getPressureFE(),
+    //                                                            pde->getControlFE(),
+    //                                                            pde->getPressureBdryFE(4),
+    //                                                            pde->getControlBdryFE(4),
+    //                                                            pde->getBdryCellLocIds(4),
+    //                                                            pde->getPermeability());
+    auto qoi  = ROL::makePtr<QoI_VelocityTracking_Darcy<RealT,DeviceT>>(*parlist,
+                                                                        pde->getPressureFE(),
+                                                                        pde->getControlFE(),
+                                                                        pde->getPermeability());
+    auto obj  = ROL::makePtr<PDE_Objective<RealT,DeviceT>>(qoi,assembler);
+    auto robj = ROL::makePtr<ROL::Reduced_Objective_SimOpt<RealT>>(obj,con,up,zp,pp,true,false);
+
+    // Build bound constraint
+    ROL::Ptr<ROL::Vector<RealT>> lp, hp;
+    if (useParamVar) {
+      auto l0p = ROL::makePtr<ROL::StdVector<RealT>>(dim,ROL::ROL_NINF<RealT>());
+      auto h0p = ROL::makePtr<ROL::StdVector<RealT>>(dim,ROL::ROL_INF<RealT>());
+      auto l1_ptr = assembler->createControlVector();
+      auto h1_ptr = assembler->createControlVector();
+      auto l1p = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(l1_ptr,pde,assembler,*parlist);
+      auto h1p = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(h1_ptr,pde,assembler,*parlist);
+      l1p->setScalar(0.0);
+      h1p->setScalar(1.0);
+      lp = ROL::makePtr<PDE_OptVector<RealT>>(l1p,l0p,myRank);
+      hp = ROL::makePtr<PDE_OptVector<RealT>>(h1p,h0p,myRank);
+    }
+    else {
+      lp = zp->clone(); lp->setScalar(0.0);
+      hp = zp->clone(); hp->setScalar(1.0);
+    }
+    auto bnd = ROL::makePtr<ROL::Bounds<RealT>>(lp, hp);
+    // Build optimization problem
+    auto optProb = ROL::makePtr<ROL::Problem<RealT>>(robj, zp);
+    optProb->addBoundConstraint(bnd);
+    optProb->finalize(false,true,*outStream);
+
+    // Run derivative checks
+    bool checkDeriv = parlist->sublist("Problem").get("Check derivatives",false);
+    if ( checkDeriv ) {
+      auto rup = up->clone(); rup->randomize(-1.0,1.0);
+      auto rzp = zp->clone(); rzp->randomize( 0.0,1.0);
+      auto rpp = pp->clone(); rpp->randomize(-1.0,1.0);
+      auto dup = up->clone(); dup->randomize(-1.0,1.0);
+      auto dzp = zp->clone(); dzp->randomize( 0.0,1.0);
+      con->checkApplyJacobian_1(*rup,*rzp,*dup,*rup,true,*outStream);
+      con->checkApplyJacobian_2(*rup,*rzp,*dzp,*rup,true,*outStream);
+      con->checkInverseJacobian_1(*rup,*rup,*rup,*rzp,true,*outStream);
+      con->checkInverseAdjointJacobian_1(*rup,*rup,*rup,*rzp,true,*outStream);
+      con->checkApplyAdjointHessian_11(*rup,*rzp,*rpp,*dup,*rup,true,*outStream);
+      con->checkApplyAdjointHessian_21(*rup,*rzp,*rpp,*dzp,*rup,true,*outStream);
+      con->checkApplyAdjointHessian_12(*rup,*rzp,*rpp,*dup,*rzp,true,*outStream);
+      con->checkApplyAdjointHessian_22(*rup,*rzp,*rpp,*dzp,*rzp,true,*outStream);
+      obj->checkGradient_1(*rup,*rzp,*dup,true,*outStream);
+      obj->checkGradient_2(*rup,*rzp,*dzp,true,*outStream);
+      obj->checkHessVec_11(*rup,*rzp,*dup,true,*outStream);
+      obj->checkHessVec_12(*rup,*rzp,*dzp,true,*outStream);
+      obj->checkHessVec_21(*rup,*rzp,*dup,true,*outStream);
+      obj->checkHessVec_22(*rup,*rzp,*dzp,true,*outStream);
+      robj->checkGradient(*rzp,*dzp,true,*outStream);
+      robj->checkHessVec(*rzp,*dzp,true,*outStream);
+      //optProb->check(*outStream);
+    }
+
+    // Solve optimization problem
+    zp->setScalar(0.5);
+    up->zero(); pp->zero();
+    bool opt = parlist->sublist("Problem").get("Solve Optimization Problem",true);
+    if (opt) {
+      std::ifstream infile("control.txt");
+      if (infile.good())
+        assembler->inputTpetraVector(z_ptr,"control.txt");
+      if (useParamVar) {
+        std::ifstream infile0; infile0.open("target.txt");
+        if (infile0.good()) {
+          for (int i = 0; i < dim; ++i) infile0 >> (*z0_ptr)[i];
+          infile0.close();
+        }
+      }
+      ROL::Solver<RealT> optSolver(optProb, *parlist);
+      optSolver.solve(*outStream);
+      con->outputTpetraVector(z_ptr,"control.txt");
+      if (useParamVar) {
+        std::ofstream outfile; outfile.open("target.txt");
+        for (const auto x : *z0_ptr) {
+          outfile << std::scientific << std::setprecision(16);
+          outfile << x << std::endl;
+        }
+        outfile.close();
+      }
+    }
+
+    // Output
+    assembler->printMeshData(*outStream);
+    RealT tol(1.e-8);
+    Teuchos::Array<RealT> res(1,0);
+    con->solve(*rp,*up,*zp,tol);
+    con->outputTpetraVector(u_ptr,"state.txt");
+    con->value(*rp,*up,*zp,tol);
+    r_ptr->norm2(res.view(0,1));
+    *outStream << "Residual Norm: " << res[0] << std::endl;
+    assembler->printDataPDE(pde,u_ptr,z_ptr);
+    errorFlag += (res[0] > 1.e-6 ? 1 : 0);
+    //con->outputTpetraData();
+
+    // Get a summary from the time monitor.
+    Teuchos::TimeMonitor::summarize();
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/darcy/obj_darcyK.hpp b/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/darcy/obj_darcyK.hpp
new file mode 100644
index 000000000000..762d1396fb8d
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/darcy/obj_darcyK.hpp
@@ -0,0 +1,1278 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  obj.hpp
+    \brief Provides the interface for local (cell-based) objective function computations.
+*/
+
+#ifndef OBJ_DARCYK_HPP
+#define OBJ_DARCYK_HPP
+
+#include "../../../../TOOLS/qoiK.hpp"
+#include "pde_darcyK.hpp"
+#include "permeabilityK.hpp"
+
+template <class Real, class DeviceType>
+class QoI_Velocity_Darcy : public QoI<Real, DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type> fePrs_, feCtrl_;
+  const std::vector<ROL::Ptr<fe_type>> fePrsBdry_, feCtrlBdry_;
+  const std::vector<std::vector<int>> bdryCellLocIds_;
+  const ROL::Ptr<Permeability<Real,DeviceType>> perm_;
+  std::vector<Real> target_;
+  bool onlyAxial_;
+
+  scalar_view getBoundaryCoeff(const scalar_view cell_coeff, int locSideId, const ROL::Ptr<fe_type> &fe) const {
+    std::vector<int> bdryCellLocId = bdryCellLocIds_[locSideId];
+    const int numCellsSide = bdryCellLocId.size();
+    const int f = fe->N().extent_int(1);
+    
+    scalar_view bdry_coeff("bdry_coeff", numCellsSide, f);
+    for (int i = 0; i < numCellsSide; ++i) {
+      for (int j = 0; j < f; ++j) {
+        bdry_coeff(i, j) = cell_coeff(bdryCellLocId[i], j);
+      }
+    }
+    return bdry_coeff;
+  }
+
+public:
+  QoI_Velocity_Darcy(ROL::ParameterList &list,
+                      const ROL::Ptr<fe_type> &fePrs,
+                      const ROL::Ptr<fe_type> &feCtrl,
+                      const std::vector<ROL::Ptr<fe_type>> &fePrsBdry,
+                      const std::vector<ROL::Ptr<fe_type>> &feCtrlBdry,
+                      const std::vector<std::vector<int>> &bdryCellLocIds,
+                      const ROL::Ptr<Permeability<Real,DeviceType>> &perm)
+    : fePrs_(fePrs), feCtrl_(feCtrl),
+      fePrsBdry_(fePrsBdry), feCtrlBdry_(feCtrlBdry),
+      bdryCellLocIds_(bdryCellLocIds),
+      perm_(perm) {
+    target_.clear(); target_.resize(2);
+    target_[0] = list.sublist("Problem").get("Target Radial Velocity",0.0);
+    target_[1] = list.sublist("Problem").get("Target Axial Velocity",-15.0);
+    onlyAxial_ = list.sublist("Problem").get("Only Use Axial Velocity",false);
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c = fePrs_->gradN().extent_int(0);
+    const int d = fePrs_->gradN().extent_int(3);
+    // Initialize output val
+    val = scalar_view("val", c);
+    // Compute cost integral
+    std::vector<Real> target = (z_param == ROL::nullPtr) ? target_ : *z_param;
+    const int numLocSides = bdryCellLocIds_.size();
+    for (int l = 0; l < numLocSides; ++l) {
+      const int numCellsSide = bdryCellLocIds_[l].size();
+      if ( numCellsSide ) {
+        const int numCubPerSide = fePrsBdry_[l]->cubPts().extent_int(1);
+        scalar_view valU_eval("valU_eval", numCellsSide, numCubPerSide, d);
+        scalar_view valZ_eval("valZ_eval", numCellsSide, numCubPerSide);
+        scalar_view alpha("alpha", numCellsSide, numCubPerSide);
+        scalar_view intVal("intVal", numCellsSide);
+        scalar_view u_coeff_bdry = getBoundaryCoeff(u_coeff, l, fePrs_);
+        scalar_view z_coeff_bdry = getBoundaryCoeff(z_coeff, l, feCtrl_);
+        fePrsBdry_[l]->evaluateGradient(valU_eval, u_coeff_bdry);
+        feCtrlBdry_[l]->evaluateValue(valZ_eval, z_coeff_bdry);
+        perm_->compute(alpha, valZ_eval, fePrsBdry_[l]->cubPts(), 0);
+        for (int i = 0; i < numCellsSide; ++i) {
+          for (int j = 0; j < numCubPerSide; ++j) {
+            for (int k = 0; k < d; ++k) {
+              valU_eval(i,j,k) *= -alpha(i,j);
+              if (k==0 && onlyAxial_) valU_eval(i,j,k) = static_cast<Real>(0);
+              else                    valU_eval(i,j,k) -= target[k];
+            }
+          }
+        }
+        fePrsBdry_[l]->computeIntegral(intVal,valU_eval,valU_eval,true);
+        // Add to integral value
+        for (int i = 0; i < numCellsSide; ++i) {
+          int cidx = bdryCellLocIds_[l][i];
+          val(cidx) += static_cast<Real>(0.5)*intVal(i);
+        }
+      }
+    }
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c = fePrs_->gradN().extent_int(0);
+    const int f = fePrs_->gradN().extent_int(1);
+    const int d = fePrs_->gradN().extent_int(3);
+    // Initialize output grad
+    grad = scalar_view("grad", c, f);
+    // Compute cost integral
+    std::vector<Real> target = (z_param == ROL::nullPtr) ? target_ : *z_param;
+    const int numLocSides = bdryCellLocIds_.size();
+    for (int l = 0; l < numLocSides; ++l) {
+      const int numCellsSide  = bdryCellLocIds_[l].size();
+      if ( numCellsSide ) {
+        const int numCubPerSide = fePrsBdry_[l]->cubPts().extent_int(1);
+        scalar_view valU_eval("valU_eval", numCellsSide, numCubPerSide, d);
+        scalar_view valZ_eval("valZ_eval", numCellsSide, numCubPerSide);
+        scalar_view alpha("alpha", numCellsSide, numCubPerSide);
+        scalar_view intGrad("intGrad", numCellsSide, f);
+        scalar_view u_coeff_bdry = getBoundaryCoeff(u_coeff, l, fePrs_);
+        scalar_view z_coeff_bdry = getBoundaryCoeff(z_coeff, l, feCtrl_);
+        fePrsBdry_[l]->evaluateGradient(valU_eval, u_coeff_bdry);
+        feCtrlBdry_[l]->evaluateValue(valZ_eval, z_coeff_bdry);
+        perm_->compute(alpha, valZ_eval, fePrsBdry_[l]->cubPts(), 0);
+        for (int i = 0; i < numCellsSide; ++i) {
+          for (int j = 0; j < numCubPerSide; ++j) {
+            for (int k = 0; k < d; ++k) {
+              valU_eval(i,j,k) *= -alpha(i,j);
+              if (k==0 && onlyAxial_) valU_eval(i,j,k) = static_cast<Real>(0);
+              else                    valU_eval(i,j,k) -= target[k];
+              valU_eval(i,j,k) *= -alpha(i,j);
+            }
+          }
+        }
+        // Compute gradient of squared L2-norm of diff
+        fst::integrate(intGrad,valU_eval,fePrsBdry_[l]->gradNdetJ(),false);
+        // Add to integral value
+        for (int i = 0; i < numCellsSide; ++i) {
+          int cidx = bdryCellLocIds_[l][i];
+          for (int j = 0; j < f; ++j) {
+            grad(cidx,j) += intGrad(i,j);
+          }
+        }
+      }
+    }
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c  = fePrs_->gradN().extent_int(0);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int d  = fePrs_->gradN().extent_int(3);
+    // Initialize output grad
+    grad = scalar_view("grad", c, fc);
+    // Compute cost integral
+    std::vector<Real> target = (z_param == ROL::nullPtr) ? target_ : *z_param;
+    const int numLocSides = bdryCellLocIds_.size();
+    for (int l = 0; l < numLocSides; ++l) {
+      const int numCellsSide = bdryCellLocIds_[l].size();
+      if ( numCellsSide ) {
+        const int numCubPerSide = fePrsBdry_[l]->cubPts().extent_int(1);
+        scalar_view valU_eval("valU_eval", numCellsSide, numCubPerSide, d);
+        scalar_view valZ_eval("valZ_eval", numCellsSide, numCubPerSide);
+        scalar_view alpha("alpha", numCellsSide, numCubPerSide);
+        scalar_view dalpha("dalpha", numCellsSide, numCubPerSide);
+        scalar_view integrand("integrand", numCellsSide, numCubPerSide);
+        scalar_view intGrad("intGrad", numCellsSide, fc);
+        scalar_view u_coeff_bdry = getBoundaryCoeff(u_coeff, l, fePrs_);
+        scalar_view z_coeff_bdry = getBoundaryCoeff(z_coeff, l, feCtrl_);
+        fePrsBdry_[l]->evaluateGradient(valU_eval, u_coeff_bdry);
+        feCtrlBdry_[l]->evaluateValue(valZ_eval, z_coeff_bdry);
+        perm_->compute(alpha, valZ_eval, fePrsBdry_[l]->cubPts(), 0);
+        perm_->compute(dalpha, valZ_eval, fePrsBdry_[l]->cubPts(), 1);
+        Real dalphaU(0), misfit(0);
+        for (int i = 0; i < numCellsSide; ++i) {
+          for (int j = 0; j < numCubPerSide; ++j) {
+            for (int k = 0; k < d; ++k) {
+              dalphaU = -dalpha(i,j) * valU_eval(i,j,k);
+              if (k==0 && onlyAxial_) misfit = static_cast<Real>(0);
+              else                    misfit = -alpha(i,j) * valU_eval(i,j,k) - target[k];
+              integrand(i,j) += dalphaU * misfit;
+            }
+          }
+        }
+        // Compute gradient of squared L2-norm of diff
+        fst::integrate(intGrad,integrand,feCtrlBdry_[l]->NdetJ(),false);
+        // Add to integral value
+        for (int i = 0; i < numCellsSide; ++i) {
+          int cidx = bdryCellLocIds_[l][i];
+          for (int j = 0; j < fc; ++j) {
+            grad(cidx,j) += intGrad(i,j);
+          }
+        }
+      }
+    }
+  }
+
+  std::vector<Real> gradient_3(std::vector<scalar_view> & grad,
+                               const scalar_view u_coeff,
+                               const scalar_view z_coeff = scalar_view(),
+                               const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    if (z_param != ROL::nullPtr) {
+      // Get relevant dimensions
+      const int c = fePrs_->gradN().extent_int(0);
+      const int d = fePrs_->gradN().extent_int(3);
+      // Initialize output val
+      std::vector<Real> g_param(d,static_cast<Real>(0));
+      grad.clear(); grad.resize(d);
+      for (int k = 0; k < d; ++k)
+       grad[k] = scalar_view("grad", c);
+      // Compute cost integral
+      const int numLocSides = bdryCellLocIds_.size();
+      for (int l = 0; l < numLocSides; ++l) {
+        const int numCellsSide  = bdryCellLocIds_[l].size();
+        if ( numCellsSide ) {
+          const int numCubPerSide = fePrsBdry_[l]->cubPts().extent_int(1);
+          std::vector<scalar_view> intVal(d);
+          scalar_view valU_eval("valU_eval", numCellsSide, numCubPerSide, d);
+          scalar_view valZ_eval("valZ_eval", numCellsSide, numCubPerSide);
+          scalar_view integrand("integrand", numCellsSide, numCubPerSide);
+          scalar_view weight("weight", numCellsSide, numCubPerSide);
+          scalar_view alpha("alpha", numCellsSide, numCubPerSide);
+          scalar_view u_coeff_bdry = getBoundaryCoeff(u_coeff, l, fePrs_);
+          scalar_view z_coeff_bdry = getBoundaryCoeff(z_coeff, l, feCtrl_);
+          fePrsBdry_[l]->evaluateGradient(valU_eval, u_coeff_bdry);
+          feCtrlBdry_[l]->evaluateValue(valZ_eval, z_coeff_bdry);
+          perm_->compute(alpha, valZ_eval, fePrsBdry_[l]->cubPts(), 0);
+          for (int k = 0; k < d; ++k) {
+            intVal[k] = scalar_view("intVal", numCellsSide);
+            if ((k==0 && !onlyAxial_) || k==1) {
+              Kokkos::deep_copy(integrand,static_cast<Real>(0));
+              for (int i = 0; i < numCellsSide; ++i) {
+                for (int j = 0; j < numCubPerSide; ++j) {
+                  weight(i,j) = static_cast<Real>(1);
+                  integrand(i,j) = (*z_param)[k] + alpha(i,j) * valU_eval(i,j,k);
+                }
+              }
+              fePrsBdry_[l]->computeIntegral(intVal[k],weight,integrand,true);
+            }
+          }
+          // Add to integral value
+          for (int k = 0; k < d; ++k) {
+            for (int i = 0; i < numCellsSide; ++i) {
+              int cidx = bdryCellLocIds_[l][i];
+              (grad[k])(cidx) += (intVal[k])(i);
+            }
+          }
+        }
+      }
+      return g_param;
+    }
+    else {
+      throw Exception::Zero(">>> QoI_Velocity_Darcy::gradient_3 is zero.");
+    }
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c = fePrs_->gradN().extent_int(0);
+    const int f = fePrs_->gradN().extent_int(1);
+    const int d = fePrs_->gradN().extent_int(3);
+    // Initialize output grad
+    hess = scalar_view("hess", c, f);
+    // Compute cost integral
+    const int numLocSides = bdryCellLocIds_.size();
+    for (int l = 0; l < numLocSides; ++l) {
+      const int numCellsSide  = bdryCellLocIds_[l].size();
+      if ( numCellsSide ) {
+        const int numCubPerSide = fePrsBdry_[l]->cubPts().extent_int(1);
+        scalar_view valV_eval("valV_eval", numCellsSide, numCubPerSide, d);
+        scalar_view valZ_eval("valZ_eval", numCellsSide, numCubPerSide);
+        scalar_view alpha("alpha", numCellsSide, numCubPerSide);
+        scalar_view intHess("intHess", numCellsSide, f);
+        scalar_view v_coeff_bdry = getBoundaryCoeff(*v_coeff, l, fePrs_);
+        scalar_view z_coeff_bdry = getBoundaryCoeff(*z_coeff, l, feCtrl_);
+        fePrsBdry_[l]->evaluateGradient(valV_eval, v_coeff_bdry);
+        feCtrlBdry_[l]->evaluateValue(valZ_eval, z_coeff_bdry);
+        perm_->compute(alpha, valZ_eval, fePrsBdry_[l]->cubPts(), 0);
+        for (int i = 0; i < numCellsSide; ++i) {
+          for (int j = 0; j < numCubPerSide; ++j) {
+            for (int k = 0; k < d; ++k) {
+              valV_eval(i,j,k) *= (-alpha(i,j)) * (-alpha(i,j));
+                if (k==0 && onlyAxial_) valV_eval(i,j,k) = static_cast<Real>(0);
+            }
+          }
+        }
+        // Compute hessian of squared L2-norm of diff
+        fst::integrate(intHess,valV_eval,fePrsBdry_[l]->gradNdetJ(),false);
+        // Add to integral value
+        for (int i = 0; i < numCellsSide; ++i) {
+          int cidx = bdryCellLocIds_[l][i];
+          for (int j = 0; j < f; ++j) {
+            hess(cidx,j) += intHess(i,j);
+          }
+        }
+      }
+    }
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c  = fePrs_->gradN().extent_int(0);
+    const int f  = fePrs_->gradN().extent_int(1);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int d  = fePrs_->gradN().extent_int(3);
+    // Initialize output grad
+    hess = scalar_view("hess", c, f);
+    // Compute cost integral
+    std::vector<Real> target = (z_param == ROL::nullPtr) ? target_ : *z_param;
+    const int numLocSides = bdryCellLocIds_.size();
+    for (int l = 0; l < numLocSides; ++l) {
+      const int numCellsSide  = bdryCellLocIds_[l].size();
+      if ( numCellsSide ) {
+        const int numCubPerSide = fePrsBdry_[l]->cubPts().extent_int(1);
+        scalar_view valU_eval("valU_eval", numCellsSide, numCubPerSide, d);
+        scalar_view valZ_eval("valZ_eval", numCellsSide, numCubPerSide);
+        scalar_view valV_eval("valV_eval", numCellsSide, numCubPerSide);
+        scalar_view alpha("alpha", numCellsSide, numCubPerSide);
+        scalar_view dalpha("dalpha", numCellsSide, numCubPerSide);
+        scalar_view integrand("integrand", numCellsSide, numCubPerSide, d);
+        scalar_view intHess("IntHess", numCellsSide, f);
+        scalar_view u_coeff_bdry = getBoundaryCoeff(u_coeff, l, fePrs_);
+        scalar_view z_coeff_bdry = getBoundaryCoeff(z_coeff, l, feCtrl_);
+        scalar_view v_coeff_bdry = getBoundaryCoeff(v_coeff, l, feCtrl_);
+        fePrsBdry_[l]->evaluateGradient(valU_eval, u_coeff_bdry);
+        feCtrlBdry_[l]->evaluateValue(valZ_eval, z_coeff_bdry);
+        feCtrlBdry_[l]->evaluateValue(valV_eval, v_coeff_bdry);
+        perm_->compute(alpha, valZ_eval, fePrsBdry_[l]->cubPts(), 0);
+        perm_->compute(dalpha, valZ_eval, fePrsBdry_[l]->cubPts(), 1);
+        Real dalphaV(0), misfit(0), dmisfit(0);
+        for (int i = 0; i < numCellsSide; ++i) {
+          for (int j = 0; j < numCubPerSide; ++j) {
+            for (int k = 0; k < d; ++k) {
+              dalphaV = -dalpha(i,j) * valV_eval(i,j);
+              if (k==0 && onlyAxial_) {
+                misfit  = static_cast<Real>(0);
+                dmisfit = static_cast<Real>(0);
+              }
+              else {
+                misfit  = -alpha(i,j)*valU_eval(i,j,k) - target[k];
+                dmisfit = -alpha(i,j)*valU_eval(i,j,k);
+              }
+              integrand(i,j,k) = dalphaV * (misfit + dmisfit);
+            }
+          }
+        }
+        // Compute gradient of squared L2-norm of diff
+        fst::integrate(intHess,integrand,fePrsBdry_[l]->gradNdetJ(),false);
+        // Add to integral value
+        for (int i = 0; i < numCellsSide; ++i) {
+          int cidx = bdryCellLocIds_[l][i];
+          for (int j = 0; j < f; ++j) {
+            hess(cidx,j) += intHess(i,j);
+          }
+        }
+      }
+    }
+  }
+
+  void HessVec_13(scalar_view & hess,
+                  const ROL::Ptr<const std::vector<Real>> & v_param,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    if (z_param != ROL::nullPtr) {
+      // Get relevant dimensions
+      const int c = fePrs_->gradN().extent_int(0);
+      const int f = fePrs_->gradN().extent_int(1);
+      const int d = fePrs_->gradN().extent_int(3);
+      // Initialize output grad
+      hess = scalar_view("hess", c, f);
+      // Compute cost integral
+      const int numLocSides = bdryCellLocIds_.size();
+      for (int l = 0; l < numLocSides; ++l) {
+        const int numCellsSide  = bdryCellLocIds_[l].size();
+        if ( numCellsSide ) {
+          const int numCubPerSide = fePrsBdry_[l]->cubPts().extent_int(1);
+          scalar_view weight("weight", numCellsSide, numCubPerSide, d);
+          scalar_view valZ_eval("valZ_eval", numCellsSide, numCubPerSide);
+          scalar_view alpha("alpha", numCellsSide, numCubPerSide);
+          scalar_view intHess("intHess", numCellsSide, f);
+          scalar_view z_coeff_bdry = getBoundaryCoeff(z_coeff, l, feCtrl_);
+          feCtrlBdry_[l]->evaluateValue(valZ_eval, z_coeff_bdry);
+          perm_->compute(alpha, valZ_eval, fePrsBdry_[l]->cubPts(), 0);
+          for (int k = 0; k < d; ++k) {
+            if ((k==0 && !onlyAxial_) || k==1) {
+              for (int i = 0; i < numCellsSide; ++i) {
+                for (int j = 0; j < numCubPerSide; ++j) {
+                  weight(i,j,k) = (*v_param)[k] * (-alpha(i,j));
+                }
+              }
+              // Compute gradient of squared L2-norm of diff
+              fst::integrate(intHess,weight,fePrsBdry_[l]->gradNdetJ(),false);
+              // Add to integral value
+              for (int i = 0; i < numCellsSide; ++i) {
+                int cidx = bdryCellLocIds_[l][i];
+                for (int j = 0; j < f; ++j) {
+                  hess(cidx,j) += intHess(i,j);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    else {
+      throw Exception::NotImplemented(">>> HessVec_13 not implemented.");
+    }
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c  = fePrs_->gradN().extent_int(0);
+    const int f  = fePrs_->gradN().extent_int(1);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int d  = fePrs_->gradN().extent_int(3);
+    // Initialize output grad
+    hess = scalar_view("hess", c, fc);
+    // Compute cost integral
+    std::vector<Real> target = (z_param == ROL::nullPtr) ? target_ : *z_param;
+    const int numLocSides = bdryCellLocIds_.size();
+    for (int l = 0; l < numLocSides; ++l) {
+      const int numCellsSide  = bdryCellLocIds_[l].size();
+      if ( numCellsSide ) {
+        const int numCubPerSide = fePrsBdry_[l]->cubPts().extent_int(1);
+        scalar_view valU_eval("valU_eval", numCellsSide, numCubPerSide, d);
+        scalar_view valZ_eval("valZ_eval", numCellsSide, numCubPerSide);
+        scalar_view valV_eval("valV_eval", numCellsSide, numCubPerSide, d);
+        scalar_view alpha("alpha", numCellsSide, numCubPerSide);
+        scalar_view dalpha("dalpha", numCellsSide, numCubPerSide);
+        scalar_view integrand("integrand", numCellsSide, numCubPerSide);
+        scalar_view intHess("intHess", numCellsSide, fc);
+        scalar_view u_coeff_bdry = getBoundaryCoeff(u_coeff, l, fePrs_);
+        scalar_view z_coeff_bdry = getBoundaryCoeff(z_coeff, l, feCtrl_);
+        scalar_view v_coeff_bdry = getBoundaryCoeff(v_coeff, l, fePrs_);
+        fePrsBdry_[l]->evaluateGradient(valU_eval, u_coeff_bdry);
+        feCtrlBdry_[l]->evaluateValue(valZ_eval, z_coeff_bdry);
+        fePrsBdry_[l]->evaluateGradient(valV_eval, v_coeff_bdry);
+        perm_->compute(alpha, valZ_eval, fePrsBdry_[l]->cubPts(), 0);
+        perm_->compute(dalpha, valZ_eval, fePrsBdry_[l]->cubPts(), 1);
+        Real dalphaV(0), misfit(0), dmisfit(0);
+        for (int i = 0; i < numCellsSide; ++i) {
+          for (int j = 0; j < numCubPerSide; ++j) {
+            for (int k = 0; k < d; ++k) {
+              dalphaV = -dalpha(i,j) * valV_eval(i,j,k);
+              if (k==0 && onlyAxial_) {
+                misfit  = static_cast<Real>(0);
+                dmisfit = static_cast<Real>(0);
+              }
+              else {
+                misfit  = -alpha(i,j)*valU_eval(i,j,k) - target[k];
+                dmisfit = -alpha(i,j)*valU_eval(i,j,k);
+              }
+              integrand(i,j) += dalphaV * (misfit + dmisfit);
+            }
+          }
+        }
+        // Compute gradient of squared L2-norm of diff
+        fst::integrate(intHess,integrand,feCtrlBdry_[l]->NdetJ(),false);
+        // Add to integral value
+        for (int i = 0; i < numCellsSide; ++i) {
+          int cidx = bdryCellLocIds_[l][i];
+          for (int j = 0; j < fc; ++j) {
+            hess(cidx,j) += intHess(i,j);
+          }
+        }
+      }
+    }
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c  = fePrs_->gradN().extent_int(0);
+    const int f  = fePrs_->gradN().extent_int(1);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int d  = fePrs_->gradN().extent_int(3);
+    // Initialize output grad
+    hess = scalar_view("hess", c, fc);
+    // Compute cost integral
+    std::vector<Real> target = (z_param == ROL::nullPtr) ? target_ : *z_param;
+    const int numLocSides = bdryCellLocIds_.size();
+    for (int l = 0; l < numLocSides; ++l) {
+      const int numCellsSide  = bdryCellLocIds_[l].size();
+      if ( numCellsSide ) {
+        const int numCubPerSide = fePrsBdry_[l]->cubPts().extent_int(1);
+        scalar_view valU_eval("valU_eval", numCellsSide, numCubPerSide, d);
+        scalar_view valZ_eval("valZ_eval", numCellsSide, numCubPerSide);
+        scalar_view valV_eval("valV_eval", numCellsSide, numCubPerSide);
+        scalar_view alpha("alpha", numCellsSide, numCubPerSide);
+        scalar_view dalpha("dalpha", numCellsSide, numCubPerSide);
+        scalar_view ddalpha("ddalpha", numCellsSide, numCubPerSide);
+        scalar_view integrand("integrand", numCellsSide, numCubPerSide);
+        scalar_view intHess("intHess", numCellsSide, fc);
+        scalar_view u_coeff_bdry = getBoundaryCoeff(u_coeff, l, fePrs_);
+        scalar_view z_coeff_bdry = getBoundaryCoeff(z_coeff, l, feCtrl_);
+        scalar_view v_coeff_bdry = getBoundaryCoeff(v_coeff, l, feCtrl_);
+        fePrsBdry_[l]->evaluateGradient(valU_eval, u_coeff_bdry);
+        feCtrlBdry_[l]->evaluateValue(valZ_eval, z_coeff_bdry);
+        feCtrlBdry_[l]->evaluateValue(valV_eval, v_coeff_bdry);
+        perm_->compute(alpha, valZ_eval, fePrsBdry_[l]->cubPts(), 0);
+        perm_->compute(dalpha, valZ_eval, fePrsBdry_[l]->cubPts(), 1);
+        perm_->compute(ddalpha, valZ_eval, fePrsBdry_[l]->cubPts(), 2);
+        Real dalphaV(0), misfit(0), dmisfit(0), ddalphaV(0);
+        for (int i = 0; i < numCellsSide; ++i) {
+          for (int j = 0; j < numCubPerSide; ++j) {
+            for (int k = 0; k < d; ++k) {
+              dalphaV  = -dalpha(i,j) * valV_eval(i,j) * valU_eval(i,j,k);
+              ddalphaV = -ddalpha(i,j) * valV_eval(i,j) * valU_eval(i,j,k);
+              if (k==0 && onlyAxial_) {
+                misfit  = static_cast<Real>(0);
+                dmisfit = static_cast<Real>(0);
+              }
+              else {
+                misfit  = -alpha(i,j)*valU_eval(i,j,k) - target[k];
+                dmisfit = -dalpha(i,j)**valU_eval(i,j,k);
+              }
+              integrand(i,j) += dalphaV * dmisfit + ddalphaV * misfit;
+            }
+          }
+        }
+        // Compute gradient of squared L2-norm of diff
+        fst::integrate(intHess,integrand,feCtrlBdry_[l]->NdetJ(),false);
+        // Add to integral value
+        for (int i = 0; i < numCellsSide; ++i) {
+          int cidx = bdryCellLocIds_[l][i];
+          for (int j = 0; j < fc; ++j) {
+            hess(cidx,j) += intHess(i,j);
+          }
+        }
+      }
+    }
+  }
+
+  void HessVec_23(scalar_view & hess,
+                  const ROL::Ptr<const std::vector<Real>> & v_param,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Velocity_Darcy::HessVec_23 is zero.");
+  }
+
+  std::vector<Real> HessVec_31(std::vector<scalar_view> & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    if (z_param != ROL::nullPtr) {
+      // Get relevant dimensions
+      const int c = fePrs_->gradN().extent_int(0);
+      const int d = fePrs_->gradN().extent_int(3);
+      // Initialize output val
+      std::vector<Real> h_param(d,static_cast<Real>(0));
+      hess.clear(); hess.resize(d);
+      for (int k = 0; k < d; ++k)
+        hess[k] = scalar_view("hess", c);
+      // Compute cost integral
+      const int numLocSides = bdryCellLocIds_.size();
+      for (int l = 0; l < numLocSides; ++l) {
+        const int numCellsSide  = bdryCellLocIds_[l].size();
+        if ( numCellsSide ) {
+          const int numCubPerSide = fePrsBdry_[l]->cubPts().extent_int(1);
+          std::vector<scalar_view> intVal(d);
+          scalar_view valV_eval("valV_eval", numCellsSide, numCubPerSide);
+          scalar_view weight("weight", numCellsSide, numCubPerSide);
+          for (int k = 0; k < d; ++k) {
+            intVal[k] = scalar_view("intVal", numCellsSide);
+            if ((k==0 && !onlyAxial_) || k==1) {
+              scalar_view v_coeff_bdry = getBoundaryCoeff(v_coeff, l, fePrs_);
+	      Kokkos::deep_copy(valV_eval,static_cast<Real>(0));
+              fePrsBdry_[l]->evaluateValue(valV_eval, v_coeff_bdry);
+              for (int i = 0; i < numCellsSide; ++i) {
+                for (int j = 0; j < numCubPerSide; ++j) {
+                  weight(i,j) = static_cast<Real>(1);
+                  valV_eval(i,j) *= static_cast<Real>(-1);
+                }
+              }
+              fePrsBdry_[l]->computeIntegral(intVal[k],weight,valV_eval,true);
+            }
+          }
+          // Add to integral value
+          for (int k = 0; k < d; ++k) {
+            for (int i = 0; i < numCellsSide; ++i) {
+              int cidx = bdryCellLocIds_[l][i];
+              (hess[k])(cidx) += (intVal[k])(i);
+            }
+          }
+        }
+      }
+      return h_param;
+    }
+    else {
+      throw Exception::Zero(">>> QoI_Velocity_Darcy::HessVec_31 is zero.");
+    }
+  }
+
+  std::vector<Real> HessVec_32(std::vector<scalar_view> & hess,
+                               const scalar_view v_coeff,
+                               const scalar_view u_coeff,
+                               const scalar_view z_coeff = scalar_view(),
+                               const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> HessVec_32 is zero.");
+  }
+
+  std::vector<Real> HessVec_33(std::vector<scalar_view> & hess,
+                               const ROL::Ptr<const std::vector<Real>> & v_param,
+                               const scalar_view u_coeff,
+                               const scalar_view z_coeff = scalar_view(),
+                               const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    if (z_param != ROL::nullPtr) {
+      // Get relevant dimensions
+      const int c = fePrs_->gradN().extent_int(0);
+      const int d = fePrs_->gradN().extent_int(3);
+      // Initialize output val
+      std::vector<Real> h_param(d,static_cast<Real>(0));
+      hess.clear(); hess.resize(d);
+      for (int k = 0; k < d; ++k)
+        hess[k] = scalar_view("hess", c);
+      // Compute cost integral
+      const int numLocSides = bdryCellLocIds_.size();
+      for (int l = 0; l < numLocSides; ++l) {
+        const int numCellsSide  = bdryCellLocIds_[l].size();
+        if ( numCellsSide ) {
+          const int numCubPerSide = fePrsBdry_[l]->cubPts().extent_int(1);
+          std::vector<scalar_view> intVal(d);
+          scalar_view valU_eval("valU_eval", numCellsSide, numCubPerSide);
+          scalar_view weight("weight", numCellsSide, numCubPerSide);
+          for (int k = 0; k < d; ++k) {
+            intVal[k] = scalar_view("intVal", numCellsSide);
+            if ((k==0 && !onlyAxial_) || k==1) {
+              Kokkos::deep_copy(valU_eval,static_cast<Real>(0));
+              for (int i = 0; i < numCellsSide; ++i) {
+                for (int j = 0; j < numCubPerSide; ++j) {
+                  weight(i,j) = static_cast<Real>(1);
+                  valU_eval(i,j) = (*v_param)[k];
+                }
+              }
+              fePrsBdry_[l]->computeIntegral(intVal[k],weight,valU_eval,true);
+            }
+          }
+          // Add to integral value
+          for (int k = 0; k < d; ++k) {
+            for (int i = 0; i < numCellsSide; ++i) {
+              int cidx = bdryCellLocIds_[l][i];
+              (hess[k])(cidx) += (intVal[k])(i);
+            }
+          }
+        }
+      }
+      return h_param;
+    }
+    else {
+      throw Exception::Zero(">>> QoI_Velocity_Darcy::HessVec_33 is zero.");
+    }
+  }
+
+}; // QoI_Velocity_Darcy
+
+
+template <class Real, class DeviceType>
+class QoI_VelocityTracking_Darcy : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type> fePrs_, feCtrl_;
+  const ROL::Ptr<Permeability<Real,DeviceType>> perm_;
+  scalar_view target_, weight_;
+  Real rad_, yvel_, frac_, twpow_;
+  bool onlyAxial_;
+
+  Real xTarget(const std::vector<Real> &x) const {
+    const Real X = x[0], Y = x[1];
+    //return xWeight(x) ? -X*Y/(rad_*rad_-Y*Y) : zero;
+    //return xWeight(x) ? -X*Y/std::sqrt(rad_*rad_-Y*Y) : zero;
+    //return polyWeight(x) * (-X*Y/std::sqrt(rad_*rad_-Y*Y));
+    //return -X*Y/std::sqrt(rad_*rad_-Y*Y);
+    return -X*Y/((rad_*rad_-Y*Y)*(rad_*rad_-Y*Y));
+  }
+
+  Real yTarget(const std::vector<Real> &x) const {
+    const Real one(1), Y = x[1];
+    //return yWeight(x) ? one : zero;
+    //return yWeight(x) ? std::sqrt(rad_*rad_-Y*Y) : zero;
+    //return polyWeight(x) * std::sqrt(rad_*rad_-Y*Y);
+    //return std::sqrt(rad_*rad_-Y*Y);
+    return one/(rad_*rad_-Y*Y);
+  }
+
+  Real xWeight(const std::vector<Real> &x) const {
+    return yWeight(x);
+  }
+
+  Real yWeight(const std::vector<Real> &x) const {
+    //const Real zero(0), one(1), Y = x[1];
+    //return (std::abs(Y) <= frac_*rad_ ? one : zero);
+    return polyWeight(x);
+  }
+
+  Real polyWeight(const std::vector<Real> &x) const {
+    const Real zero(0), one(1), Y = x[1], p = twpow_;
+    const Real yTOP = 9.976339196;
+    const Real yBOT = -yTOP;
+    Real val = 0, at = 0, bt = 0;
+    at = one / std::pow(-yTOP,p);
+    bt = one / std::pow(-yBOT,p);
+    if (Y > zero) {
+      val = at*std::pow(Y-yTOP,p);
+    } else {
+      val = bt*std::pow(Y-yBOT,p);
+    }
+    //std::cout << Y << "  " << val << std::endl;
+    return val;
+  }
+
+public:
+  QoI_VelocityTracking_Darcy(ROL::ParameterList &list,
+                             const ROL::Ptr<fe_type> &fePrs,
+                             const ROL::Ptr<fe_type> &feCtrl,
+                             const ROL::Ptr<Permeability<Real,DeviceType>> &perm)
+    : fePrs_(fePrs), feCtrl_(feCtrl), perm_(perm) {
+    rad_         = list.sublist("Problem").get("Diffuser Radius",5.0);
+    yvel_        = list.sublist("Problem").get("Target Axial Velocity",15.0);
+    frac_        = list.sublist("Problem").get("Integration Domain Fraction",0.95);
+    onlyAxial_   = list.sublist("Problem").get("Only Use Axial Velocity",false);
+    twpow_       = list.sublist("Problem").get("Target Weighting Power",0.0);
+    Real xWScal  = list.sublist("Problem").get("Radial Tracking Scale",1.0);
+    Real yWScal  = list.sublist("Problem").get("Axial Tracking Scale",1.0);
+    bool useNorm = list.sublist("Problem").get("Use Normalized Misfit",false);
+    useNorm = onlyAxial_ ? false : useNorm;
+    xWScal  = onlyAxial_ ? static_cast<Real>(0) : xWScal;
+    const int c = fePrs_->gradN().extent_int(0);
+    const int p = fePrs_->gradN().extent_int(2);
+    target_ = scalar_view("target",c,p,2);
+    weight_ = scalar_view("weight",c,p,2);
+    std::vector<Real> x(2);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        x[0] = (fePrs_->cubPts())(i,j,0);
+        x[1] = (fePrs_->cubPts())(i,j,1);
+        target_(i,j,0) = xTarget(x);
+        target_(i,j,1) = yTarget(x);
+        if (useNorm && yWeight(x)) {
+          xWScal = static_cast<Real>(1)
+                  /(std::pow(target_(i,j,0),2) + std::pow(target_(i,j,1),2));
+          yWScal = xWScal;
+        }
+        weight_(i,j,0) = x[0] * xWScal * xWeight(x);
+        weight_(i,j,1) = x[0] * yWScal * yWeight(x);
+      }
+    }
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c = fePrs_->gradN().extent_int(0);
+    const int p = fePrs_->gradN().extent_int(2);
+    const int d = fePrs_->gradN().extent_int(3);
+    Real yvel = (z_param == ROL::nullPtr) ? yvel_ : (*z_param)[0];
+    // Initialize output val
+    val = scalar_view("val",c);
+    // Compute cost integral
+    scalar_view gradU("gradU",c,p,d);
+    scalar_view vel("vel",c,p,d);
+    scalar_view wvel("wvel",c,p,d);
+    scalar_view valZ("valZ",c,p);
+    scalar_view alpha("alpha",c,p);
+    fePrs_->evaluateGradient(gradU, u_coeff);
+    feCtrl_->evaluateValue(valZ, z_coeff);
+    perm_->compute(alpha, valZ, fePrs_->cubPts(), 0);
+
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k) {
+          vel(i,j,k)  = alpha(i,j)*gradU(i,j,k)+yvel*target_(i,j,k);
+          wvel(i,j,k) = weight_(i,j,k)*vel(i,j,k);
+        }
+      }
+    }
+
+    fePrs_->computeIntegral(val,vel,wvel);
+    // Scale by one half
+    rst::scale(val,static_cast<Real>(0.5));
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c = fePrs_->gradN().extent_int(0);
+    const int f = fePrs_->gradN().extent_int(1);
+    const int p = fePrs_->gradN().extent_int(2);
+    const int d = fePrs_->gradN().extent_int(3);
+    Real yvel = (z_param == ROL::nullPtr) ? yvel_ : (*z_param)[0];
+    // Initialize output grad
+    grad = scalar_view("grad", c, f);
+    // Compute cost integral
+    scalar_view gradU("gradU",c,p,d);
+    scalar_view awvel("awvel",c,p,d);
+    scalar_view valZ("valZ",c,p);
+    scalar_view alpha("alpha",c,p);
+    fePrs_->evaluateGradient(gradU, u_coeff);
+    feCtrl_->evaluateValue(valZ, z_coeff);
+    perm_->compute(alpha, valZ, fePrs_->cubPts(), 0);
+
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k) {
+          awvel(i,j,k) = weight_(i,j,k)*alpha(i,j)*(alpha(i,j)*gradU(i,j,k)+yvel*target_(i,j,k));
+        }
+      }
+    }
+
+    fst::integrate(grad,awvel,fePrs_->gradNdetJ(),false);
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c  = fePrs_->gradN().extent_int(0);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int p  = fePrs_->gradN().extent_int(2);
+    const int d  = fePrs_->gradN().extent_int(3);
+    Real yvel = (z_param == ROL::nullPtr) ? yvel_ : (*z_param)[0];
+    // Initialize output grad
+    grad = scalar_view("grad", c, fc);
+    // Compute cost integral
+    scalar_view gradU("gradU",c,p,d);
+    scalar_view valZ("valZ",c,p);
+    scalar_view alpha("alpha",c,p);
+    scalar_view dalpha("dalpha",c,p);
+    scalar_view deriv("deriv",c,p);
+    fePrs_->evaluateGradient(gradU, u_coeff);
+    feCtrl_->evaluateValue(valZ, z_coeff);
+    perm_->compute(alpha, valZ, fePrs_->cubPts(), 0);
+    perm_->compute(dalpha, valZ, fePrs_->cubPts(), 1);
+
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k) {
+          deriv(i,j) += weight_(i,j,k)*(alpha(i,j)*gradU(i,j,k)+yvel*target_(i,j,k))
+                        *dalpha(i,j)*gradU(i,j,k);
+        }
+      }
+    }
+
+    fst::integrate(grad,deriv,feCtrl_->NdetJ(),false);
+  }
+
+  std::vector<Real> gradient_3(std::vector<scalar_view> & grad,
+                               const scalar_view u_coeff,
+                               const scalar_view z_coeff = scalar_view(),
+                               const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    if (z_param != ROL::nullPtr) {
+      // Get relevant dimensions
+      const int c = fePrs_->gradN().extent_int(0);
+      const int p = fePrs_->gradN().extent_int(2);
+      const int d = fePrs_->gradN().extent_int(3);
+      Real yvel = (z_param == ROL::nullPtr) ? yvel_ : (*z_param)[0];
+      // Initialize output val
+      std::vector<Real> g_param(d,static_cast<Real>(0));
+      grad.clear(); grad.resize(d);
+      for (int i = 0; i < d; ++i)
+        grad[i] = scalar_view("grad",c);
+      // Compute cost integral
+      scalar_view gradU("gradU",c,p,d);
+      scalar_view wvel("wvel",c,p,d);
+      scalar_view valZ("valZ",c,p);
+      scalar_view alpha("alpha",c,p);
+      fePrs_->evaluateGradient(gradU, u_coeff);
+      feCtrl_->evaluateValue(valZ, z_coeff);
+      perm_->compute(alpha, valZ, fePrs_->cubPts(), 0);
+
+      for (int i = 0; i < c; ++i) {
+        for (int j = 0; j < p; ++j) {
+          for (int k = 0; k < d; ++k) {
+            wvel(i,j,k) = weight_(i,j,k)*(alpha(i,j)*gradU(i,j,k)+yvel*target_(i,j,k));
+          }
+        }
+      }
+
+      fePrs_->computeIntegral(grad[0],wvel,target_);
+      
+      return g_param;
+    }
+    else {
+      throw Exception::Zero(">>> QoI_Velocity_Darcy::gradient_3 is zero.");
+    }
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c = fePrs_->gradN().extent_int(0);
+    const int f = fePrs_->gradN().extent_int(1);
+    const int p = fePrs_->gradN().extent_int(2);
+    const int d = fePrs_->gradN().extent_int(3);
+    // Initialize output hess
+    hess = scalar_view("hess", c, f);
+    // Compute cost integral
+    scalar_view gradV("gradV",c,p,d);
+    scalar_view awvel("awvel",c,p,d);
+    scalar_view valZ("valZ",c,p);
+    scalar_view alpha("alpha",c,p);
+    fePrs_->evaluateGradient(gradV, v_coeff);
+    feCtrl_->evaluateValue(valZ, z_coeff);
+    perm_->compute(alpha, valZ, fePrs_->cubPts(), 0);
+
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k) {
+          awvel(i,j,k) = alpha(i,j)*alpha(i,j)*weight_(i,j,k)*gradV(i,j,k);
+        }
+      }
+    }
+
+    fst::integrate(hess,awvel,fePrs_->gradNdetJ(),false);
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c = fePrs_->gradN().extent_int(0);
+    const int f = fePrs_->gradN().extent_int(1);
+    const int p = fePrs_->gradN().extent_int(2);
+    const int d = fePrs_->gradN().extent_int(3);
+    Real yvel = (z_param == ROL::nullPtr) ? yvel_ : (*z_param)[0];
+    // Initialize output grad
+    hess = scalar_view("hess", c, f);
+    // Compute cost integral
+    scalar_view gradU("gradU",c,p,d);
+    scalar_view awvel("awvel",c,p,d);
+    scalar_view valZ("valZ",c,p);
+    scalar_view valV("valV",c,p);
+    scalar_view alpha("alpha",c,p);
+    scalar_view dalpha("dalpha",c,p);
+    fePrs_->evaluateGradient(gradU, u_coeff);
+    feCtrl_->evaluateValue(valZ, z_coeff);
+    feCtrl_->evaluateValue(valV, v_coeff);
+    perm_->compute( alpha, valZ, fePrs_->cubPts(), 0);
+    perm_->compute(dalpha, valZ, fePrs_->cubPts(), 1);
+
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k) {
+          awvel(i,j,k)  = static_cast<Real>(2)*alpha(i,j)*gradU(i,j,k)+yvel*target_(i,j,k);
+          awvel(i,j,k) *= weight_(i,j,k)*dalpha(i,j)*valV(i,j);
+        }
+      }
+    }
+
+    fst::integrate(hess,awvel,fePrs_->gradNdetJ(),false);
+  }
+
+  void HessVec_13(scalar_view & hess,
+                  const ROL::Ptr<const std::vector<Real>> & v_param,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    if (z_param != ROL::nullPtr) {
+      // Get relevant dimensions
+      const int c = fePrs_->gradN().extent_int(0);
+      const int f = fePrs_->gradN().extent_int(1);
+      const int p = fePrs_->gradN().extent_int(2);
+      const int d = fePrs_->gradN().extent_int(3);
+      // Initialize output val
+      hess = scalar_view("hess", c, f);
+      // Compute cost integral
+      scalar_view wvel("wvel",c,p,d);
+      scalar_view valZ("valZ",c,p);
+      scalar_view alpha("alpha",c,p);
+      feCtrl_->evaluateValue(valZ, z_coeff);
+      perm_->compute(alpha, valZ, fePrs_->cubPts(), 0);
+
+      for (int i = 0; i < c; ++i) {
+        for (int j = 0; j < p; ++j) {
+          for (int k = 0; k < d; ++k) {
+            wvel(i,j,k) = weight_(i,j,k)*alpha(i,j)*target_(i,j,k)*(*v_param)[0];
+          }
+        }
+      }
+
+      fst::integrate(hess,wvel,fePrs_->gradNdetJ(),false);
+    }
+    else {
+      throw Exception::NotImplemented(">>> HessVec_13 not implemented.");
+    }
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c  = fePrs_->gradN().extent_int(0);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int p  = fePrs_->gradN().extent_int(2);
+    const int d  = fePrs_->gradN().extent_int(3);
+    Real yvel = (z_param == ROL::nullPtr) ? yvel_ : (*z_param)[0];
+    // Initialize output grad
+    hess = scalar_view("hess", c, fc);
+    // Compute cost integral
+    scalar_view gradU("gradU",c,p,d);
+    scalar_view gradV("gradV",c,p,d);
+    scalar_view valZ("valZ",c,p);
+    scalar_view alpha("alpha",c,p);
+    scalar_view dalpha("dalpha",c,p);
+    scalar_view deriv("deriv",c,p);
+    fePrs_->evaluateGradient(gradU, u_coeff);
+    fePrs_->evaluateGradient(gradV, v_coeff);
+    feCtrl_->evaluateValue(valZ, z_coeff);
+    perm_->compute(alpha, valZ, fePrs_->cubPts(), 0);
+    perm_->compute(dalpha, valZ, fePrs_->cubPts(), 1);
+
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k) {
+          deriv(i,j) += weight_(i,j,k)*(alpha(i,j)*gradU(i,j,k)+yvel*target_(i,j,k))
+                        *dalpha(i,j)*gradV(i,j,k);
+          deriv(i,j) += weight_(i,j,k)*dalpha(i,j)*gradU(i,j,k)*alpha(i,j)*gradV(i,j,k);
+        }
+      }
+    }
+
+    fst::integrate(hess,deriv,feCtrl_->NdetJ(),false);
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c  = fePrs_->gradN().extent_int(0);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int p  = fePrs_->gradN().extent_int(2);
+    const int d  = fePrs_->gradN().extent_int(3);
+    Real yvel = (z_param == ROL::nullPtr) ? yvel_ : (*z_param)[0];
+    // Initialize output hess
+    hess = scalar_view("hess", c, fc);
+    // Compute cost integral
+    scalar_view gradU("gradU",c,p,d);
+    scalar_view valZ("valZ",c,p);
+    scalar_view valV("valV",c,p);
+    scalar_view alpha("alpha",c,p);
+    scalar_view dalpha("dalpha",c,p);
+    scalar_view ddalpha("ddalpha",c,p);
+    scalar_view deriv("deriv",c,p);
+    fePrs_->evaluateGradient(gradU, u_coeff);
+    feCtrl_->evaluateValue(valZ, z_coeff);
+    feCtrl_->evaluateValue(valV, v_coeff);
+    perm_->compute(  alpha, valZ, fePrs_->cubPts(), 0);
+    perm_->compute( dalpha, valZ, fePrs_->cubPts(), 1);
+    perm_->compute(ddalpha, valZ, fePrs_->cubPts(), 2);
+
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k) {
+          deriv(i,j) += weight_(i,j,k)*(alpha(i,j)*gradU(i,j,k)+yvel*target_(i,j,k))
+                        *ddalpha(i,j)*valV(i,j)*gradU(i,j,k);
+          deriv(i,j) += weight_(i,j,k)*dalpha(i,j)*gradU(i,j,k)
+                        *dalpha(i,j)*valV(i,j)*gradU(i,j,k);
+        }
+      }
+    }
+
+    fst::integrate(hess,deriv,feCtrl_->NdetJ(),false);
+  }
+
+  void HessVec_23(scalar_view & hess,
+                  const ROL::Ptr<const std::vector<Real>> & v_param,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    if (z_param != ROL::nullPtr) {
+      // Get relevant dimensions
+      const int c  = fePrs_->gradN().extent_int(0);
+      const int fc = feCtrl_->N().extent_int(1);
+      const int p  = fePrs_->gradN().extent_int(2);
+      const int d  = fePrs_->gradN().extent_int(3);
+      // Initialize output val
+      hess = scalar_view("hess", c, fc);
+      // Compute cost integral
+      scalar_view gradU("gradU",c,p,d);
+      scalar_view wvel("wvel",c,p);
+      scalar_view valZ("valZ",c,p);
+      scalar_view dalpha("dalpha",c,p);
+      fePrs_->evaluateGradient(gradU, u_coeff);
+      feCtrl_->evaluateValue(valZ, z_coeff);
+      perm_->compute(dalpha, valZ, fePrs_->cubPts(), 1);
+
+      for (int i = 0; i < c; ++i) {
+        for (int j = 0; j < p; ++j) {
+          for (int k = 0; k < d; ++k) {
+            wvel(i,j) += weight_(i,j,k)*dalpha(i,j)*gradU(i,j,k)*target_(i,j,k)*(*v_param)[0];
+          }
+        }
+      }
+
+      fst::integrate(hess,wvel,feCtrl_->NdetJ(),false);
+    }
+    else {
+      throw Exception::Zero(">>> QoI_Velocity_Darcy::HessVec_23 is zero.");
+    }
+  }
+
+  std::vector<Real> HessVec_31(std::vector<scalar_view> & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    if (z_param != ROL::nullPtr) {
+      // Get relevant dimensions
+      const int c = fePrs_->gradN().extent_int(0);
+      const int p = fePrs_->gradN().extent_int(2);
+      const int d = fePrs_->gradN().extent_int(3);
+      // Initialize output val
+      std::vector<Real> h_param(d,static_cast<Real>(0));
+      hess.clear(); hess.resize(d);
+      for (int k = 0; k < d; ++k)
+        hess[k] = scalar_view("hess",c);
+      // Compute cost integral
+      scalar_view gradV("gradV",c,p,d);
+      scalar_view wvel("wvel",c,p,d);
+      scalar_view valZ("valZ",c,p);
+      scalar_view alpha("alpha",c,p);
+      fePrs_->evaluateGradient(gradV, v_coeff);
+      feCtrl_->evaluateValue(valZ, z_coeff);
+      perm_->compute(alpha, valZ, fePrs_->cubPts(), 0);
+
+      for (int i = 0; i < c; ++i) {
+        for (int j = 0; j < p; ++j) {
+          for (int k = 0; k < d; ++k) {
+            wvel(i,j,k) = weight_(i,j,k)*alpha(i,j)*gradV(i,j,k);
+          }
+        }
+      }
+      fePrs_->computeIntegral(hess[0],wvel,target_);
+
+      return h_param;
+    }
+    else {
+      throw Exception::Zero(">>> QoI_Velocity_Darcy::HessVec_31 is zero.");
+    }
+  }
+
+  std::vector<Real> HessVec_32(std::vector<scalar_view> & hess,
+                               const scalar_view v_coeff,
+                               const scalar_view u_coeff,
+                               const scalar_view z_coeff = scalar_view(),
+                               const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    if (z_param != ROL::nullPtr) {
+      // Get relevant dimensions
+      const int c = fePrs_->gradN().extent_int(0);
+      const int p = fePrs_->gradN().extent_int(2);
+      const int d = fePrs_->gradN().extent_int(3);
+      // Initialize output val
+      std::vector<Real> h_param(d,static_cast<Real>(0));
+      hess.clear(); hess.resize(d);
+      for (int k = 0; k < d; ++k)
+        hess[k] = scalar_view("hess",c);
+      // Compute cost integral
+      scalar_view gradU("gradU",c,p,d);
+      scalar_view wvel("wvel",c,p,d);
+      scalar_view valZ("valZ",c,p);
+      scalar_view valV("valV",c,p);
+      scalar_view dalpha("dalpha",c,p);
+      fePrs_->evaluateGradient(gradU, u_coeff);
+      feCtrl_->evaluateValue(valZ, z_coeff);
+      feCtrl_->evaluateValue(valV, v_coeff);
+      perm_->compute(dalpha, valZ, fePrs_->cubPts(), 1);
+
+      for (int i = 0; i < c; ++i) {
+        for (int j = 0; j < p; ++j) {
+          for (int k = 0; k < d; ++k) {
+            wvel(i,j,k) = weight_(i,j,k)*dalpha(i,j)*gradU(i,j,k)*valV(i,j);
+          }
+        }
+      }
+      fePrs_->computeIntegral(hess[0],wvel,target_);
+
+      return h_param;
+    }
+    else {
+      throw Exception::Zero(">>> HessVec_32 is zero.");
+    }
+  }
+
+  std::vector<Real> HessVec_33(std::vector<scalar_view> & hess,
+                               const ROL::Ptr<const std::vector<Real>> & v_param,
+                               const scalar_view u_coeff,
+                               const scalar_view z_coeff = scalar_view(),
+                               const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    if (z_param != ROL::nullPtr) {
+      // Get relevant dimensions
+      const int c = fePrs_->gradN().extent_int(0);
+      const int p = fePrs_->gradN().extent_int(2);
+      const int d = fePrs_->gradN().extent_int(3);
+      // Initialize output val
+      std::vector<Real> h_param(d,static_cast<Real>(0));
+      hess.clear(); hess.resize(d);
+      for (int k = 0; k < d; ++k)
+        hess[k] = scalar_view("hess", c);
+      // Compute cost integral
+      scalar_view wtarget("wtarget",c,p,d);
+      for (int i = 0; i < c; ++i) {
+        for (int j = 0; j < p; ++j) {
+          for (int k = 0; k < d; ++k) {
+            wtarget(i,j,k) = weight_(i,j,k)*target_(i,j,k);
+          }
+        }
+      }
+      fePrs_->computeIntegral(hess[0],wtarget,target_);
+      rst::scale(hess[0],(*v_param)[0]);
+      
+      return h_param;
+    }
+    else {
+      throw Exception::Zero(">>> QoI_Velocity_Darcy::HessVec_33 is zero.");
+    }
+  }
+
+}; // QoI_Velocity_Darcy
+#endif
diff --git a/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/darcy/pde_darcy.hpp b/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/darcy/pde_darcy.hpp
index 62ab96a74935..6d80456911e2 100644
--- a/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/darcy/pde_darcy.hpp
+++ b/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/darcy/pde_darcy.hpp
@@ -370,7 +370,7 @@ class PDE_Darcy : public PDE<Real> {
     hess = ROL::makePtr<Intrepid::FieldContainer<Real>>(c, fc, f);
     // Apply Dirichlet conditions
     ROL::Ptr<Intrepid::FieldContainer<Real>> l0_coeff;
-    l0_coeff = ROL::makePtr<Intrepid::FieldContainer<Real>>(c, p);
+    l0_coeff = ROL::makePtr<Intrepid::FieldContainer<Real>>(c, f);
     *l0_coeff = *l_coeff;
     int numSideSets = bdryCellLocIds_.size();
     if (numSideSets > 0) {
@@ -444,7 +444,7 @@ class PDE_Darcy : public PDE<Real> {
     hess = ROL::makePtr<Intrepid::FieldContainer<Real>>(c, f, fc);
     // Apply Dirichlet conditions
     ROL::Ptr<Intrepid::FieldContainer<Real>> l0_coeff;
-    l0_coeff = ROL::makePtr<Intrepid::FieldContainer<Real>>(c, p);
+    l0_coeff = ROL::makePtr<Intrepid::FieldContainer<Real>>(c, f);
     *l0_coeff = *l_coeff;
     int numSideSets = bdryCellLocIds_.size();
     if (numSideSets > 0) {
@@ -501,6 +501,7 @@ class PDE_Darcy : public PDE<Real> {
                   const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
     // Retrieve dimensions.
     const int c  = fePrs_->gradN()->dimension(0);
+    const int f  = fePrs_->gradN()->dimension(1);
     const int fc = feCtrl_->gradN()->dimension(1);
     const int p  = fePrs_->gradN()->dimension(2);
     const int d  = fePrs_->gradN()->dimension(3);
@@ -508,7 +509,7 @@ class PDE_Darcy : public PDE<Real> {
     hess = ROL::makePtr<Intrepid::FieldContainer<Real>>(c, fc, fc);
     // Apply Dirichlet conditions
     ROL::Ptr<Intrepid::FieldContainer<Real>> l0_coeff;
-    l0_coeff = ROL::makePtr<Intrepid::FieldContainer<Real>>(c, p);
+    l0_coeff = ROL::makePtr<Intrepid::FieldContainer<Real>>(c, f);
     *l0_coeff = *l_coeff;
     int numSideSets = bdryCellLocIds_.size();
     if (numSideSets > 0) {
diff --git a/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/darcy/pde_darcyK.hpp b/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/darcy/pde_darcyK.hpp
new file mode 100644
index 000000000000..2a5540cb702c
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/darcy/pde_darcyK.hpp
@@ -0,0 +1,735 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  pde_darcy.hpp
+    \brief Implements the local PDE interface for the Darcy porosity optimization problem.
+*/
+
+#ifndef PDE_DARCYK_HPP
+#define PDE_DARCYK_HPP
+
+#include "../../../../TOOLS/pdeK.hpp"
+#include "../../../../TOOLS/feK.hpp"
+#include "../../../../TOOLS/fieldhelperK.hpp"
+#include "permeabilityK.hpp"
+
+#include "Intrepid2_HVOL_C0_FEM.hpp"
+#include "Intrepid2_HGRAD_QUAD_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_QUAD_C2_FEM.hpp"
+#include "Intrepid2_HGRAD_TRI_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_TRI_C2_FEM.hpp"
+#include "Intrepid2_DefaultCubatureFactory.hpp"
+#include "Intrepid2_FunctionSpaceTools.hpp"
+#include "Intrepid2_CellTools.hpp"
+
+#include "ROL_Ptr.hpp"
+
+template <class Real, class DeviceType>
+class PDE_Darcy : public PDE<Real, DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  // Finite element basis information
+  basis_ptr basisPtrPrs_, basisPtrCtrl_;
+  std::vector<basis_ptr> basisPtrs_, basisPtrsCtrl_;
+  // Cell cubature information
+  ROL::Ptr<Intrepid2::Cubature<DeviceType,Real,Real>> cellCub_, bdryCub_;
+  // Cell node information
+  scalar_view volCellNodes_;
+  std::vector<std::vector<scalar_view>> bdryCellNodes_;
+  std::vector<std::vector<std::vector<int>>> bdryCellLocIds_;
+  // Finite element definition
+  ROL::Ptr<fe_type> fePrs_, feCtrl_;
+  std::vector<std::vector<ROL::Ptr<fe_type>>> fePrsBdry_, feCtrlBdry_;
+  // Local degrees of freedom on boundary, for each side of the reference cell (first index).
+  std::vector<std::vector<int>> fpidx_;
+  // Field pattern, offsets, etc.
+  std::vector<std::vector<int>> fieldPattern_;  // local Field/DOF pattern; set from DOF manager 
+  int numFields_;                               // number of fields (equations in the PDE)
+  int numDofs_;                                 // total number of degrees of freedom for all (local) fields
+  std::vector<int> offset_;                     // for each field, a counting offset
+  std::vector<int> numFieldDofs_;               // for each field, number of degrees of freedom
+  std::vector<std::vector<int>> fieldPatternCtrl_;  // local Field/DOF pattern; set from DOF manager 
+  int numFieldsCtrl_;                               // number of fields (equations in the PDE)
+  int numDofsCtrl_;                                 // total number of degrees of freedom for all (local) fields
+  std::vector<int> offsetCtrl_;                     // for each field, a counting offset
+  std::vector<int> numFieldDofsCtrl_;               // for each field, number of degrees of freedom
+  
+  // Problem parameters.
+  Real Patm_, inVelocity_, dynvisco_;
+  bool useNoSlip_;
+
+  ROL::Ptr<Permeability<Real,DeviceType>> perm_;
+  ROL::Ptr<FieldUtils::FieldInfo> fieldInfo_, fieldInfoCtrl_;
+
+  void multiplyByRadius(scalar_view &input, const scalar_view cubPts,
+                        bool isField, bool useReciprocal = false) const {
+    int c = cubPts.extent_int(0);
+    int p = cubPts.extent_int(1);
+    scalar_view r("r",c,p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        if (!useReciprocal) r(i,j) = cubPts(i,j,0);
+        else                r(i,j) = static_cast<Real>(1)/cubPts(i,j,0);
+      }
+    }
+    scalar_view in(input);
+    if (!isField)
+      fst::scalarMultiplyDataData(input,r,in);
+    else
+      fst::scalarMultiplyDataField(input,r,in);
+  }
+
+public:
+  PDE_Darcy(ROL::ParameterList &parlist) : Patm_(101.325 /* kg/mm-s^2 */) {
+    // Finite element fields.
+    int cubDegree     = parlist.sublist("Problem").get("Cubature Degree",4);
+    int bdryCubDegree = parlist.sublist("Problem").get("Boundary Cubature Degree",2);
+    int basisDegPres  = parlist.sublist("Problem").get("Pressure Basis Degree",1);
+    int basisDegCtrl  = parlist.sublist("Problem").get("Density Basis Degree",1);
+    std::string elemType = parlist.sublist("Problem").get("Element Type","QUAD");
+    if (elemType == "TRI") {
+      if (basisDegPres == 2)
+        basisPtrPrs_ = ROL::makePtr<Intrepid2::Basis_HGRAD_TRI_C2_FEM<DeviceType,Real,Real>>();
+      else 
+        basisPtrPrs_ = ROL::makePtr<Intrepid2::Basis_HGRAD_TRI_C1_FEM<DeviceType,Real,Real>>();
+    }
+    else {
+      if (basisDegPres == 2)
+        basisPtrPrs_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C2_FEM<DeviceType,Real,Real>>();
+      else 
+        basisPtrPrs_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C1_FEM<DeviceType,Real,Real>>();
+    }
+    basisPtrs_.push_back(basisPtrPrs_);       // Pressure component
+    shards::CellTopology cellType = basisPtrs_[0]->getBaseCellTopology();        // get the cell type from any basis
+    if (basisDegCtrl == 1) {
+      if (elemType == "TRI")
+        basisPtrCtrl_ = ROL::makePtr<Intrepid2::Basis_HGRAD_TRI_C1_FEM<DeviceType,Real,Real>>();
+      else
+        basisPtrCtrl_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C1_FEM<DeviceType,Real,Real>>();
+    }
+    else {
+      basisPtrCtrl_ = ROL::makePtr<Intrepid2::Basis_HVOL_C0_FEM<DeviceType,Real,Real>>(cellType);
+    }
+    basisPtrsCtrl_.push_back(basisPtrCtrl_);  // Control component
+    // Quadrature rules.
+    Intrepid2::DefaultCubatureFactory cubFactory;                            // create cubature factory
+    cellCub_ = cubFactory.create<DeviceType,Real,Real>(cellType, cubDegree); // create default cubature
+
+    shards::CellTopology bdryCellType = cellType.getCellTopologyData(1, 0);
+    bdryCub_ = cubFactory.create<DeviceType,Real,Real>(bdryCellType, bdryCubDegree);
+
+    // Other problem parameters.
+    dynvisco_     = parlist.sublist("Problem").get("Dynamic Viscosity", 0.84e-8); // kg/mm-s
+    Real density  = parlist.sublist("Problem").get("Fluid Density", 8.988e-11);   // kg/mm^3
+    Real inRadius = parlist.sublist("Problem").get("Inlet Radius", 0.6875);       // mm
+    Real Reynolds = parlist.sublist("Problem").get("Reynolds Number", 50.0);      // dimensionless
+    inVelocity_ = Reynolds * dynvisco_ / (density*static_cast<Real>(2)*inRadius); // mm/s
+    perm_       = ROL::makePtr<Permeability<Real,DeviceType>>(parlist);
+    useNoSlip_  = parlist.sublist("Problem").get("Use No Slip", false);
+
+    numDofs_ = 0;
+    numFields_ = basisPtrs_.size();
+    offset_.resize(numFields_);
+    numFieldDofs_.resize(numFields_);
+    for (int i=0; i<numFields_; ++i) {
+      offset_[i] = i==0 ? 0 : offset_[i-1] + basisPtrs_[i-1]->getCardinality();
+      numFieldDofs_[i] = basisPtrs_[i]->getCardinality();
+      numDofs_ += numFieldDofs_[i];
+    }
+    numDofsCtrl_ = 0;
+    numFieldsCtrl_ = basisPtrsCtrl_.size();
+    offsetCtrl_.resize(numFieldsCtrl_);
+    numFieldDofsCtrl_.resize(numFieldsCtrl_);
+    for (int i=0; i<numFieldsCtrl_; ++i) {
+      offsetCtrl_[i] = i==0 ? 0 : offsetCtrl_[i-1] + basisPtrsCtrl_[i-1]->getCardinality();
+      numFieldDofsCtrl_[i] = basisPtrsCtrl_[i]->getCardinality();
+      numDofsCtrl_ += numFieldDofsCtrl_[i];
+    }
+  }
+
+  void residual(scalar_view & res,
+                const scalar_view u_coeff,
+                const scalar_view z_coeff = scalar_view(),
+                const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c = fePrs_->gradN().extent_int(0);
+    const int f = fePrs_->gradN().extent_int(1);
+    const int p = fePrs_->gradN().extent_int(2);
+    const int d = fePrs_->gradN().extent_int(3);
+    // Initialize residuals.
+    res = scalar_view("res", c, f);
+    // Evaluate/interpolate finite element fields on cells.
+    scalar_view gradP("gradP", c, p, d);
+    scalar_view AgradP("AgradP", c, p, d);
+    scalar_view valCtrl("valCtrl", c, p);
+    scalar_view alpha("alpha", c, p);
+    fePrs_->evaluateGradient(gradP, u_coeff);
+    feCtrl_->evaluateValue(valCtrl, z_coeff);
+    perm_->compute(alpha, valCtrl, fePrs_->cubPts(), 0);
+    fst::scalarMultiplyDataData(AgradP,alpha,gradP);
+    /*** Evaluate weak form of the residual. ***/
+    multiplyByRadius(AgradP,fePrs_->cubPts(),false);
+    fst::integrate(res,AgradP,fePrs_->gradNdetJ(),false);
+    // Boundary conditions.
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      // APPLY DIRICHLET CONDITIONS
+      for (int i = 0; i < numSideSets; ++i) {
+        if (i==0) /* in flow */ {
+          const int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            const int numCellsSide = bdryCellLocIds_[i][j].size();
+            if (numCellsSide > 0) {
+              const int numCubPerSide = fePrsBdry_[i][j]->cubPts().extent_int(1);
+              scalar_view nRes("nRes", numCellsSide, f);
+              scalar_view nVal("nVal", numCellsSide, numCubPerSide);
+	      Kokkos::deep_copy(nVal, -inVelocity_);
+              multiplyByRadius(nVal,fePrsBdry_[i][j]->cubPts(),false);
+              fst::integrate(nRes,nVal,fePrsBdry_[i][j]->NdetJ(),false);
+              for (int k = 0; k < numCellsSide; ++k) {
+                const int cidx = bdryCellLocIds_[i][j][k];
+                for (int l = 0; l < f; ++l) {
+                  res(cidx,l) += nRes(k,l);
+                }
+              }
+            }
+          }
+        }
+        else if (useNoSlip_ && i==1) /* no slip */ {}
+        else if (i==2) /* out flow */ {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numBdryDofs = fpidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                res(cidx,fpidx_[j][l]) = u_coeff(cidx,fpidx_[j][l]) - Patm_;
+              }
+            }
+          }
+        }
+        else if (i==3) /* no normal */ {} 
+      }
+    }
+  }
+
+  void Jacobian_1(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c = fePrs_->gradN().extent_int(0);
+    const int f = fePrs_->gradN().extent_int(1);
+    const int p = fePrs_->gradN().extent_int(2);
+    const int d = fePrs_->gradN().extent_int(3);
+    // Initialize Jacobians.
+    jac = scalar_view("jac", c, f, f);
+    // Evaluate/interpolate finite element fields on cells.
+    scalar_view valCtrl("valCtrl", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view alphaPhi("alphaPhi", c, f, p, d);
+    feCtrl_->evaluateValue(valCtrl, z_coeff);
+    perm_->compute(alpha, valCtrl, fePrs_->cubPts(), 0);
+    // Multiply velocity with alpha
+    fst::scalarMultiplyDataField(alphaPhi, alpha, fePrs_->gradN());
+    /*** Evaluate weak form of the Jacobian. ***/
+    multiplyByRadius(alphaPhi,fePrs_->cubPts(),true);
+    fst::integrate(jac,alphaPhi,fePrs_->gradNdetJ(),false);
+    // APPLY DIRICHLET CONDITIONS
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        if (i==0) /* in flow */ {}
+        else if (useNoSlip_ && i==1) /* no slip */ {}
+        else if (i==2) /* out flow */ {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numBdryDofs = fpidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                for (int m = 0; m < f; ++m) {
+                  jac(cidx,fpidx_[j][l],m) = static_cast<Real>(0);
+                }
+                jac(cidx,fpidx_[j][l],fpidx_[j][l]) = static_cast<Real>(1);
+              }
+            }
+          }
+        }
+        else if (i==3) /* no normal */ {}
+      }
+    }
+  }
+
+  void Jacobian_2(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c  = fePrs_->gradN().extent_int(0);
+    const int f  = fePrs_->gradN().extent_int(1);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int p  = fePrs_->gradN().extent_int(2);
+    const int d  = fePrs_->gradN().extent_int(3);
+    // Initialize Jacobians.
+    jac = scalar_view("jac", c, f, fc);
+    // Evaluate on FE basis.
+    scalar_view valZ("valZ", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view alphaN("alphaN", c, fc, p);
+    scalar_view gradP("gradP", c, p, d);
+    scalar_view gradPgradN("gradPgradN", c, f, p);
+    fePrs_->evaluateGradient(gradP, u_coeff);
+    feCtrl_->evaluateValue(valZ, z_coeff);
+    perm_->compute(alpha, valZ, fePrs_->cubPts(), 1);
+    fst::scalarMultiplyDataField(alphaN,alpha,feCtrl_->N());
+    fst::dotMultiplyDataField(gradPgradN,gradP,fePrs_->gradNdetJ());
+    /*** Evaluate weak form of the residual. ***/
+    multiplyByRadius(gradPgradN,fePrs_->cubPts(),true);
+    fst::integrate(jac,gradPgradN,alphaN,false);
+    // APPLY DIRICHLET CONDITIONS
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        if (i==0) /* in flow */ {}
+        else if (useNoSlip_ && i==1) /* no slip */ {}
+        else if (i==2) /* out flow */ {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numBdryDofs = fpidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                for (int m=0; m < fc; ++m) {
+                  jac(cidx,fpidx_[j][l],m) = static_cast<Real>(0);
+                }
+              }
+            }
+          }
+        }
+        else if (i==3) /* no normal */ {}
+      }
+    }
+  }
+
+  void Jacobian_3(std::vector<scalar_view> & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> Jacobian_3 is zero!");
+  }
+
+  void Hessian_11(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> Hessian_11 is zero!");
+  }
+
+  void Hessian_12(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c  = fePrs_->gradN().extent_int(0);
+    const int f  = fePrs_->gradN().extent_int(1);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int p  = fePrs_->gradN().extent_int(2);
+    const int d  = fePrs_->gradN().extent_int(3);
+    // Initialize hessians.
+    hess = scalar_view("hess", c, fc, f);
+    // Apply Dirichlet conditions
+    scalar_view l0_coeff("l0_coeff", c, f);
+    Kokkos::deep_copy(l0_coeff,l_coeff);
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        if (i==0) /* in flow */ {}
+        else if (useNoSlip_ && i==1) /* no slip */ {}
+        else if (i==2) /* out flow */ {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numBdryDofs = fpidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                l0_coeff(cidx,fpidx_[j][l]) = static_cast<Real>(0);
+              }
+            }
+          }
+        }
+        else if (i==3) /* no normal */ {}
+      }
+    }
+    // Evaluate/interpolate finite element fields on cells.
+    scalar_view valZ("valZ", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view gradL("gradL", c, p, d);
+    scalar_view alphaL("alphaL", c, fc, p, d);
+    fePrs_->evaluateGradient(gradL, l0_coeff);
+    feCtrl_->evaluateValue(valZ, z_coeff);
+    perm_->compute(alpha, valZ, fePrs_->cubPts(), 1);
+    /*** Evaluate weak form of the Hessian. ***/
+    // Multiply velocity with alpha
+    for (int j = 0; j < c; ++j) {
+      for (int k = 0; k < fc; ++k) {
+        for (int l = 0; l < p; ++l) {
+          for (int m = 0; m < d; ++m) {
+            alphaL(j,k,l,m) = alpha(j,l) * (feCtrl_->N())(j,k,l) * gradL(j,l,m);
+          }
+        }
+      }
+    }
+    multiplyByRadius(alphaL,fePrs_->cubPts(),true);
+    fst::integrate(hess,alphaL,fePrs_->gradNdetJ(),false);
+  }
+
+  void Hessian_13(std::vector<scalar_view> & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> Hessian_13 is zero!");
+  }
+
+  void Hessian_21(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c  = fePrs_->gradN().extent_int(0);
+    const int f  = fePrs_->gradN().extent_int(1);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int p  = fePrs_->gradN().extent_int(2);
+    const int d  = fePrs_->gradN().extent_int(3);
+    // Initialize hessians.
+    hess = scalar_view("hess", c, f, fc);
+    // Apply Dirichlet conditions
+    scalar_view l0_coeff("l0_coeff", c, f);
+    Kokkos::deep_copy(l0_coeff,l_coeff);
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        if (i==0) /* in flow */ {}
+        else if (useNoSlip_ && i==1) /* no slip */ {}
+        else if (i==2) /* out flow */ {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numBdryDofs = fpidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                l0_coeff(cidx,fpidx_[j][l]) = static_cast<Real>(0);
+              }
+            }
+          }
+        }
+        else if (i==3) /* no normal */ {}
+      }
+    }
+    // Evaluate/interpolate finite element fields on cells.
+    scalar_view valZ("valZ", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view gradL("gradL", c, p, d);
+    scalar_view alphaL("alphaL", c, fc, p, d);
+    fePrs_->evaluateGradient(gradL, l0_coeff);
+    feCtrl_->evaluateValue(valZ, z_coeff);
+    perm_->compute(alpha, valZ, fePrs_->cubPts(), 1);
+    /*** Evaluate weak form of the Hessian. ***/
+    for (int j = 0; j < c; ++j) {
+      for (int k = 0; k < fc; ++k) {
+        for (int l = 0; l < p; ++l) {
+          for (int m = 0; m < d; ++m) {
+            alphaL(j,k,l,m) = alpha(j,l) * (feCtrl_->N())(j,k,l) * gradL(j,l,m);
+          }
+        }
+      }
+    }
+    multiplyByRadius(alphaL,fePrs_->cubPts(),true);
+    fst::integrate(hess,fePrs_->gradNdetJ(),alphaL,false);
+  }
+
+  void Hessian_22(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c  = fePrs_->gradN().extent_int(0);
+    const int f  = fePrs_->gradN().extent_int(1);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int p  = fePrs_->gradN().extent_int(2);
+    const int d  = fePrs_->gradN().extent_int(3);
+    // Initialize hessians.
+    hess = scalar_view("hess", c, fc, fc);
+    // Apply Dirichlet conditions
+    scalar_view l0_coeff("l0_coeff", c, f);
+    Kokkos::deep_copy(l0_coeff,l_coeff);
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        if (i==0) /* in flow */ {}
+        else if (useNoSlip_ && i==1) /* no slip */ {}
+        else if (i==2) /* out flow */ {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numBdryDofs = fpidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                l0_coeff(cidx,fpidx_[j][l]) = static_cast<Real>(0);
+              }
+            }
+          }
+        }
+        else if (i==3) /* no normal */ {}
+      }
+    }
+    // Evaluate/interpolate finite element fields on cells.
+    scalar_view valZ("valZ", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view alphaUL("alphaUL", c, fc, p);
+    scalar_view gradL("gradL", c, p, d);
+    scalar_view gradU("gradU", c, p, d);
+    fePrs_->evaluateGradient(gradL, l0_coeff);
+    fePrs_->evaluateGradient(gradU, u_coeff);
+    feCtrl_->evaluateValue(valZ, z_coeff);
+    perm_->compute(alpha, valZ, fePrs_->cubPts(), 2);
+    // Multiply velocity with alpha
+    Real dot(0);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < fc; ++j) {
+        for (int k = 0; k < p; ++k) {
+          dot = static_cast<Real>(0);
+          for (int l = 0; l < d; ++l) {
+            dot += gradU(i,k,l) * gradL(i,k,l);
+          }
+          alphaUL(i,j,k) = alpha(i,k) * (feCtrl_->N())(i,j,k) * dot;
+        }
+      }
+    }
+    /*** Evaluate weak form of the Hessian. ***/
+    // Velocity equation.
+    multiplyByRadius(alphaUL,fePrs_->cubPts(),true);
+    fst::integrate(hess,alphaUL,feCtrl_->NdetJ(),false);
+  }
+
+  void Hessian_23(std::vector<scalar_view> & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> Hessian_23 is zero!");
+  }
+
+  void Hessian_31(std::vector<scalar_view> & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> Hessian_31 is zero!");
+  }
+
+  void Hessian_32(std::vector<scalar_view> & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> Hessian_32 is zero!");
+  }
+
+  void Hessian_33(std::vector<std::vector<scalar_view>> & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> Hessian_33 is zero!");
+  }
+
+  void RieszMap_1(scalar_view & riesz) override {
+    // Retrieve dimensions.
+    const int c = fePrs_->gradN().extent_int(0);
+    const int f = fePrs_->gradN().extent_int(1);
+    const int p = fePrs_->gradN().extent_int(2);
+    const int d = fePrs_->gradN().extent_int(3);
+    // Initialize Jacobians.
+    riesz = scalar_view("riesz", c, f, f);
+    // Evaluate/interpolate finite element fields on cells.
+    scalar_view valPhi("valPhi", c, f, p);
+    scalar_view gradPhi("gradPhi", c, f, p, d);
+    Kokkos::deep_copy(valPhi, fePrs_->N());
+    Kokkos::deep_copy(gradPhi, fePrs_->gradN());
+    /*** Evaluate weak form of the Jacobian. ***/
+    multiplyByRadius(gradPhi,fePrs_->cubPts(),true);
+    multiplyByRadius(valPhi,fePrs_->cubPts(),true);
+    fst::integrate(riesz,gradPhi,fePrs_->gradNdetJ(),false);
+    fst::integrate(riesz,valPhi,fePrs_->NdetJ(),true);
+  }
+
+  void RieszMap_2(scalar_view & riesz) override {
+    // Retrieve dimensions.
+    const int c = feCtrl_->gradN().extent_int(0);
+    const int f = feCtrl_->gradN().extent_int(1);
+    const int p = feCtrl_->gradN().extent_int(2);
+    // Initialize Jacobians.
+    riesz = scalar_view("riesz", c, f, f);
+    // Evaluate on FE basis.
+    scalar_view valPsi("valPsi", c, f, p);
+    Kokkos::deep_copy(valPsi, feCtrl_->N());
+    /*** Evaluate weak form of the residual. ***/
+    multiplyByRadius(valPsi,feCtrl_->cubPts(),true);
+    fst::integrate(riesz,valPsi,feCtrl_->NdetJ(),false);
+  }
+
+  std::vector<basis_ptr> getFields() override {
+    return basisPtrs_;
+  }
+
+  std::vector<basis_ptr> getFields2() override {
+    return basisPtrsCtrl_;
+  }
+
+  void setCellNodes(const scalar_view &volCellNodes,
+                    const std::vector<std::vector<scalar_view>> &bdryCellNodes,
+                    const std::vector<std::vector<std::vector<int>>> &bdryCellLocIds) override {
+    volCellNodes_ = volCellNodes;
+    bdryCellNodes_ = bdryCellNodes;
+    bdryCellLocIds_ = bdryCellLocIds;
+    // Finite element definition.
+    fePrs_  = ROL::makePtr<fe_type>(volCellNodes_,basisPtrPrs_,cellCub_);
+    feCtrl_ = ROL::makePtr<fe_type>(volCellNodes_,basisPtrCtrl_,cellCub_,false);
+    fpidx_  = fePrs_->getBoundaryDofs();
+    // Construct control boundary FE
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      fePrsBdry_.resize(numSideSets);
+      //feCtrlBdry_.resize(numSideSets);
+      for (int i = 0; i < numSideSets; ++i) {
+        int numLocSides = bdryCellNodes[i].size();
+        fePrsBdry_[i].resize(numLocSides);
+        //feCtrlBdry_[i].resize(numLocSides);
+        for (int j = 0; j < numLocSides; ++j) {
+          if (bdryCellNodes[i][j] != scalar_view()) {
+            fePrsBdry_[i][j] = ROL::makePtr<fe_type>(bdryCellNodes[i][j],basisPtrPrs_,bdryCub_,j);
+            //feCtrlBdry_[i][j] = ROL::makePtr<fe_type>(bdryCellNodes[i][j],basisPtrCtrl_,bdryCub_,j);
+          }
+        }
+      }
+    }
+  }
+
+  void setFieldPattern(const std::vector<std::vector<int>> &fieldPattern,
+                       const std::vector<std::vector<int>> &fieldPattern2) override {
+    fieldPattern_ = fieldPattern;
+    fieldInfo_    = ROL::makePtr<FieldUtils::FieldInfo>(numFields_,numDofs_,numFieldDofs_,fieldPattern_);
+    fieldPatternCtrl_ = fieldPattern2;
+    fieldInfoCtrl_    = ROL::makePtr<FieldUtils::FieldInfo>(numFieldsCtrl_,numDofsCtrl_,numFieldDofsCtrl_,fieldPatternCtrl_);
+  }
+
+  void printData(std::string tag,
+                 const scalar_view u_coeff,
+                 const scalar_view z_coeff = scalar_view(),
+                 const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c = fePrs_->gradN().extent_int(0);
+    const int p = fePrs_->gradN().extent_int(2);
+    const int d = fePrs_->gradN().extent_int(3);
+    // Evaluate/interpolate finite element fields on cells.
+    scalar_view gradP("gradP", c, p, d);
+    scalar_view AgradP("AgradP", c, p, d);
+    scalar_view valCtrl("valCtrl", c, p);
+    scalar_view alpha("alpha", c, p);
+    fePrs_->evaluateGradient(gradP, u_coeff);
+    feCtrl_->evaluateValue(valCtrl, z_coeff);
+    perm_->compute(alpha, valCtrl, fePrs_->cubPts(), 0);
+    fst::scalarMultiplyDataData(AgradP,alpha,gradP);
+    // Print to velocity file
+    std::stringstream nameVel;
+    nameVel << "velocity" << tag << ".txt";
+    std::ofstream fileVel;
+    fileVel.open(nameVel.str());
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        fileVel << std::scientific << std::setprecision(15);
+        for (int k = 0; k < d; ++k) fileVel << std::setw(25) << (fePrs_->cubPts())(i,j,k);
+        for (int k = 0; k < d; ++k) fileVel << std::setw(25) << -AgradP(i,j,k);
+        fileVel << std::endl;
+      }
+    }
+    fileVel.close();
+    // Print to permeability file
+    std::stringstream namePer;
+    namePer << "permeability" << tag << ".txt";
+    std::ofstream filePer;
+    filePer.open(namePer.str());
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        filePer << std::scientific << std::setprecision(15);
+        for (int k = 0; k < d; ++k) filePer << std::setw(25) << (fePrs_->cubPts())(i,j,k);
+        filePer << std::setw(25) << dynvisco_*alpha(i,j);
+        filePer << std::endl;
+      }
+    }
+    filePer.close();
+  }
+
+  const ROL::Ptr<fe_type> getPressureFE(void) const {
+    return fePrs_;
+  }
+
+  const ROL::Ptr<fe_type> getControlFE(void) const {
+    return feCtrl_;
+  }
+
+  const std::vector<ROL::Ptr<fe_type>> getPressureBdryFE(const int sideset = -1) const {
+    int side = (sideset < 0 || sideset > 4) ? 0 : sideset;
+    return fePrsBdry_[side];
+  }
+
+  const std::vector<ROL::Ptr<fe_type>> getControlBdryFE(const int sideset = -1) const {
+    int side = (sideset < 0 || sideset > 4) ? 0 : sideset;
+    return feCtrlBdry_[side];
+  }
+
+  const std::vector<std::vector<int>> getBdryCellLocIds(const int sideset = -1) const {
+    int side = (sideset < 0 || sideset > 4) ? 0 : sideset;
+    return bdryCellLocIds_[side];
+  }
+
+  const ROL::Ptr<Permeability<Real,DeviceType>> getPermeability(void) const {
+    return perm_;
+  }
+
+  const ROL::Ptr<const FieldUtils::FieldInfo> getStateFieldInfo(void) const {
+    return fieldInfo_;
+  }
+
+  const ROL::Ptr<const FieldUtils::FieldInfo> getControlFieldInfo(void) const {
+    return fieldInfoCtrl_;
+  }
+
+}; // PDE_Darcy
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/darcy/permeabilityK.hpp b/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/darcy/permeabilityK.hpp
new file mode 100644
index 000000000000..ccfc3566aebf
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/darcy/permeabilityK.hpp
@@ -0,0 +1,93 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+#ifndef PERMEABILITYK_HPP
+#define PERMEABILITYK_HPP
+
+#include <vector>
+#include "ROL_ParameterList.hpp"
+#include "ROL_Ptr.hpp"
+#include "../../../../TOOLS/feK.hpp"
+
+template<class Real, class DeviceType>
+class Permeability {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+
+private:
+  Real dRadius_, viscosity_, minPerm_, maxPerm_;
+  int type_;
+
+  Real value(Real z) const {
+    if (type_==1) {
+      const Real a(minPerm_/viscosity_), b(std::log(maxPerm_/minPerm_));
+      return a * std::exp(b * z);
+    }
+    else {
+      const Real a(minPerm_/viscosity_), b(maxPerm_/viscosity_);
+      return a + z * (b - a);
+    }
+  }
+
+  Real deriv1(Real z) const {
+    if (type_==1) {
+      const Real a(minPerm_/viscosity_), b(std::log(maxPerm_/minPerm_));
+      return a * b * std::exp(b * z);
+    }
+    else {
+      const Real a(minPerm_/viscosity_), b(maxPerm_/viscosity_);
+      return b - a;
+    }
+  }
+
+  Real deriv2(Real z) const {
+    if (type_==1) {
+      const Real a(minPerm_/viscosity_), b(std::log(maxPerm_/minPerm_));
+      return a * b * b * std::exp(b * z);
+    }
+    else {
+      return static_cast<Real>(0);
+    }
+  }
+
+public:
+  Permeability(ROL::ParameterList &list) {
+    bool useDarcy = list.sublist("Problem").get("Use Darcy Flow",true);
+    dRadius_   = list.sublist("Problem").get("Diffuser Radius",5.0);        // mm
+    viscosity_ = list.sublist("Problem").get("Dynamic Viscosity", 0.84e-8); // kg/mm-s
+    minPerm_   = list.sublist("Problem").get("Minimum Permeability", 3e-7); // mm^2
+    maxPerm_   = list.sublist("Problem").get("Maximum Permeability", 3e-6); // mm^2
+    if (useDarcy) dRadius_ += static_cast<Real>(10);
+    type_      = list.sublist("Problem").get("Parametrization Type", 0);
+    // type_ = 0 ... K = z kmin + (1-z) kmax
+    // type_ = 1 ... K = exp((1-z) log(kmin) + z log(kmax))
+  }
+
+  void compute(scalar_view &alpha, const scalar_view z, const scalar_view pts, int deriv=0) const {
+    const Real tol = std::sqrt(ROL::ROL_EPSILON<Real>());
+    const Real zero(0), one(1);
+    const int c = pts.extent_int(0);
+    const int p = pts.extent_int(1);
+    const int d = pts.extent_int(2);
+    Real weight(0), norm(0);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        norm = zero;
+        for (int k = 0; k < d; ++k) norm += pts(i,j,k)*pts(i,j,k);
+        weight = (std::sqrt(norm) <= dRadius_ + tol ? one : zero);
+        // Compute spatially dependent viscosity
+        if (deriv==1)      alpha(i,j) = weight * deriv1(z(i,j));
+        else if (deriv==2) alpha(i,j) = weight * deriv2(z(i,j));
+        else               alpha(i,j) = weight * value(z(i,j));
+      }
+    }
+  }
+};
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/filteredDarcy/CMakeLists.txt b/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/filteredDarcy/CMakeLists.txt
index 4336dbeb8a2d..34f44015576f 100644
--- a/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/filteredDarcy/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/filteredDarcy/CMakeLists.txt
@@ -11,13 +11,13 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_01
     SOURCES example_01.cpp
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     ProjectAxisymmetricFilteredDarcyDataCopy
     SOURCE_FILES
       input.xml
@@ -32,5 +32,39 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
   )
 
+ENDIF()
+
+IF(${PROJECT_NAME}_ENABLE_Intrepid2 AND
+   ${PROJECT_NAME}_ENABLE_Ifpack2   AND
+   ${PROJECT_NAME}_ENABLE_MueLu     AND
+   ${PROJECT_NAME}_ENABLE_Amesos2   AND
+   ${PROJECT_NAME}_ENABLE_Tpetra )
+
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/teuchos/src/sol)
+
+  # Need ROL_TpetraMultiVector.hpp
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
+
+  ROL_ADD_EXECUTABLE(
+    example_01_Kokkos
+    SOURCES example_01K.cpp
+    ADD_DIR_TO_NAME
+  )
+
+  ROL_COPY_FILES_TO_BINARY_DIR(
+    ProjectAxisymmetricFilteredDarcyDataCopyK
+    SOURCE_FILES
+      input.xml
+      input-nonconvex.xml
+      input-sphere.xml
+      plotDarcy.m
+      plotDarcyAxi.m
+      plotmesh.m
+      README.md
+      run.sh
+    SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}"
+    DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
+  )
 
 ENDIF()
diff --git a/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/filteredDarcy/example_01.cpp b/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/filteredDarcy/example_01.cpp
index bb152a7a640c..f1ebd9ad074a 100644
--- a/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/filteredDarcy/example_01.cpp
+++ b/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/filteredDarcy/example_01.cpp
@@ -13,8 +13,7 @@
 
 #include "Teuchos_Comm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -50,7 +49,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int>> comm
     = Tpetra::getDefaultComm();
   const int myRank = comm->getRank();
diff --git a/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/filteredDarcy/example_01K.cpp b/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/filteredDarcy/example_01K.cpp
new file mode 100644
index 000000000000..043e3b502e34
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/filteredDarcy/example_01K.cpp
@@ -0,0 +1,260 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  example_01.cpp
+    \brief Shows how to solve the filtered Darcy porosity optimization problem.
+*/
+
+#include "Teuchos_Comm.hpp"
+#include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
+
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Version.hpp"
+
+#include <iostream>
+#include <algorithm>
+
+#include "ROL_TpetraMultiVector.hpp"
+#include "ROL_Reduced_Objective_SimOpt.hpp"
+#include "ROL_Bounds.hpp"
+#include "ROL_Solver.hpp"
+#include "ROL_SingletonVector.hpp"
+#include "ROL_ConstraintFromObjective.hpp"
+
+#include "../../../../TOOLS/meshreaderK.hpp"
+#include "../../../../TOOLS/pdeconstraintK.hpp"
+#include "../../../../TOOLS/pdeobjectiveK.hpp"
+#include "../../../../TOOLS/pdevectorK.hpp"
+#include "../../../../TOOLS/integralconstraintK.hpp"
+
+#include "pde_darcyK.hpp"
+#include "obj_darcyK.hpp"
+#include "pde_filterK.hpp"
+#include "filtered_objK.hpp"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+
+int main(int argc, char *argv[]) {
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+
+  /*** Initialize communicator. ***/
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  Kokkos::ScopeGuard kokkosScope (argc, argv);
+  ROL::Ptr<const Teuchos::Comm<int>> comm
+    = Tpetra::getDefaultComm();
+  const int myRank = comm->getRank();
+  if ((iprint > 0) && (myRank == 0))
+    outStream = ROL::makePtrFromRef(std::cout);
+  else
+    outStream = ROL::makePtrFromRef(bhs);
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+
+    /*** Read in XML input ***/
+    std::string filename = "input.xml";
+    auto parlist = ROL::getParametersFromXmlFile(filename);
+
+    bool output = parlist->sublist("SimOpt").sublist("Solve").get("Output Iteration History",false);
+    output = (iprint > 0) && (myRank==0) && output;
+    parlist->sublist("SimOpt").sublist("Solve").set("Output Iteration History",output);
+
+    /*** Initialize main data structure. ***/
+    auto meshMgr = ROL::makePtr<MeshReader<RealT,DeviceT>>(*parlist);
+    // Initialize PDE describing filtered Darcy equations.
+    auto pde = ROL::makePtr<PDE_Darcy<RealT,DeviceT>>(*parlist);
+    auto con = ROL::makePtr<PDE_Constraint<RealT,DeviceT>>(pde,meshMgr,comm,*parlist,*outStream);
+    auto pdeFilter = ROL::makePtr<PDE_Filter<RealT,DeviceT>>(*parlist);
+    // Cast the constraint and get the assembler.
+    auto assembler = con->getAssembler();
+    con->setSolveParameters(*parlist);
+
+    // Create state vectors
+    auto u_ptr = assembler->createStateVector();    u_ptr->randomize();
+    auto up = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(u_ptr,pde,assembler,*parlist);
+    auto p_ptr = assembler->createStateVector();    p_ptr->randomize();
+    auto pp = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(p_ptr,pde,assembler,*parlist);
+    auto r_ptr = assembler->createResidualVector(); r_ptr->putScalar(0.0);
+    auto rp = ROL::makePtr<PDE_DualSimVector<RealT,DeviceT>>(r_ptr,pde,assembler,*parlist);
+
+    // Create control vectors
+    bool useParamVar = parlist->sublist("Problem").get("Use Optimal Constant Velocity",false);
+    int dim = 2;
+    auto f_ptr = assembler->createControlVector();  f_ptr->randomize();
+    ROL::Ptr<std::vector<RealT>>            z0_ptr;
+    ROL::Ptr<ROL::StdVector<RealT>>         z0p;
+    ROL::Ptr<ROL::TpetraMultiVector<RealT>> f1p;
+    ROL::Ptr<ROL::Vector<RealT>>            fp;
+    if (useParamVar) {
+      z0_ptr = ROL::makePtr<std::vector<RealT>>(dim);
+      z0p = ROL::makePtr<ROL::StdVector<RealT>>(z0_ptr);
+      f1p = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(f_ptr,pde,assembler,*parlist);
+      fp = ROL::makePtr<PDE_OptVector<RealT>>(f1p,z0p,myRank);
+    }
+    else {
+      fp = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(f_ptr,pde,assembler,*parlist);
+    }
+
+    // Create combined vectors
+    auto xp = ROL::makePtr<ROL::Vector_SimOpt<RealT>>(up,fp);
+
+    // Initialize quadratic objective function.
+    //auto qoi  = ROL::makePtr<QoI_Velocity_Darcy<RealT,DeviceT>>(*parlist,
+    //                                                            pde->getPressureFE(),
+    //                                                            pde->getControlFE(),
+    //                                                            pde->getPressureBdryFE(4),
+    //                                                            pde->getControlBdryFE(4),
+    //                                                            pde->getBdryCellLocIds(4),
+    //                                                            pde->getPermeability());
+    auto qoi  = ROL::makePtr<QoI_VelocityTracking_Darcy<RealT,DeviceT>>(*parlist,
+                                                                        pde->getPressureFE(),
+                                                                        pde->getControlFE(),
+                                                                        pde->getPermeability());
+    auto obj  = ROL::makePtr<PDE_Objective<RealT,DeviceT>>(qoi,assembler);
+    auto robj = ROL::makePtr<ROL::Reduced_Objective_SimOpt<RealT>>(obj,con,up,fp,pp,true,false);
+    auto fobj = ROL::makePtr<FilteredObjective<RealT,DeviceT>>(robj,pdeFilter,meshMgr,comm,*parlist,*outStream);
+
+    // Build density vector
+    ROL::Ptr<ROL::TpetraMultiVector<RealT>> z1p;
+    ROL::Ptr<ROL::Vector<RealT>> zp;
+    auto z_ptr = fobj->getAssembler()->createControlVector();  z_ptr->randomize();
+    if (useParamVar) {
+      z1p = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(z_ptr,pdeFilter,fobj->getAssembler(),*parlist);
+      zp = ROL::makePtr<PDE_OptVector<RealT>>(z1p,z0p,myRank);
+    }
+    else {
+      zp = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(z_ptr,pdeFilter,fobj->getAssembler(),*parlist);
+    }
+
+    // Build bound constraint
+    ROL::Ptr<ROL::Vector<RealT>>          lp, hp;
+    if (useParamVar) {
+      auto l0p = ROL::makePtr<ROL::StdVector<RealT>>(dim,ROL::ROL_NINF<RealT>());
+      auto h0p = ROL::makePtr<ROL::StdVector<RealT>>(dim,ROL::ROL_INF<RealT>());
+      auto l1_ptr = fobj->getAssembler()->createControlVector();
+      auto h1_ptr = fobj->getAssembler()->createControlVector();
+      auto l1p = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(l1_ptr,pdeFilter,fobj->getAssembler(),*parlist);
+      auto h1p = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(h1_ptr,pdeFilter,fobj->getAssembler(),*parlist);
+      l1p->setScalar(0.0);
+      h1p->setScalar(1.0);
+      lp = ROL::makePtr<PDE_OptVector<RealT>>(l1p,l0p,myRank);
+      hp = ROL::makePtr<PDE_OptVector<RealT>>(h1p,h0p,myRank);
+    }
+    else {
+      lp = zp->clone(); lp->setScalar(0.0);
+      hp = zp->clone(); hp->setScalar(1.0);
+    }
+    auto bnd = ROL::makePtr<ROL::Bounds<RealT>>(lp, hp);
+    // Build optimization problem
+    auto optProb = ROL::makePtr<ROL::Problem<RealT>>(fobj, zp);
+    optProb->addBoundConstraint(bnd);
+    optProb->finalize(false,true,*outStream);
+
+    // Run derivative checks
+    bool checkDeriv = parlist->sublist("Problem").get("Check derivatives",false);
+    if ( checkDeriv ) {
+      //ROL::Ptr<ROL::Vector<RealT>> rup = up->clone(); rup->randomize(-1.0,1.0);
+      //ROL::Ptr<ROL::Vector<RealT>> rpp = pp->clone(); rpp->randomize(-1.0,1.0);
+      //ROL::Ptr<ROL::Vector<RealT>> dup = up->clone(); dup->randomize(-1.0,1.0);
+      ROL::Ptr<ROL::Vector<RealT>> rzp = zp->clone(); rzp->randomize( 0.0,1.0);
+      ROL::Ptr<ROL::Vector<RealT>> dzp = zp->clone(); dzp->randomize( 0.0,1.0);
+      //con->checkApplyJacobian_1(*rup,*rzp,*dup,*rup,true,*outStream);
+      //con->checkApplyJacobian_2(*rup,*rzp,*dzp,*rup,true,*outStream);
+      //con->checkInverseJacobian_1(*rup,*rup,*rup,*rzp,true,*outStream);
+      //con->checkInverseAdjointJacobian_1(*rup,*rup,*rup,*rzp,true,*outStream);
+      //con->checkApplyAdjointHessian_11(*rup,*rzp,*rpp,*dup,*rup,true,*outStream);
+      //con->checkApplyAdjointHessian_21(*rup,*rzp,*rpp,*dzp,*rup,true,*outStream);
+      //con->checkApplyAdjointHessian_12(*rup,*rzp,*rpp,*dup,*rzp,true,*outStream);
+      //con->checkApplyAdjointHessian_22(*rup,*rzp,*rpp,*dzp,*rzp,true,*outStream);
+      //obj->checkGradient_1(*rup,*rzp,*dup,true,*outStream);
+      //obj->checkGradient_2(*rup,*rzp,*dzp,true,*outStream);
+      //obj->checkHessVec_11(*rup,*rzp,*dup,true,*outStream);
+      //obj->checkHessVec_12(*rup,*rzp,*dzp,true,*outStream);
+      //obj->checkHessVec_21(*rup,*rzp,*dup,true,*outStream);
+      //obj->checkHessVec_22(*rup,*rzp,*dzp,true,*outStream);
+      fobj->checkGradient(*rzp,*dzp,true,*outStream);
+      fobj->checkHessVec(*rzp,*dzp,true,*outStream);
+      //optProb->check(*outStream);
+    }
+
+    // Solve optimization problem
+    zp->setScalar(0.5);
+    up->zero(); pp->zero();
+    bool opt = parlist->sublist("Problem").get("Solve Optimization Problem",true);
+    if (opt) {
+      std::ifstream infile("control.txt");
+      if (infile.good())
+        fobj->getAssembler()->inputTpetraVector(z_ptr,"control.txt");
+      if (useParamVar) {
+        std::ifstream infile0; infile0.open("target.txt");
+        if (infile0.good()) {
+          for (int i = 0; i < dim; ++i) infile0 >> (*z0_ptr)[i];
+          infile0.close();
+        }
+      }
+      ROL::Solver<RealT> optSolver(optProb, *parlist);
+      optSolver.solve(*outStream);
+      con->outputTpetraVector(z_ptr,"control.txt");
+      if (useParamVar) {
+        std::ofstream outfile; outfile.open("target.txt");
+        for (const auto x : *z0_ptr) {
+          outfile << std::scientific << std::setprecision(16);
+          outfile << x << std::endl;
+        }
+        outfile.close();
+      }
+    }
+
+    // Output
+    assembler->printMeshData(*outStream);
+    RealT tol(1.e-8);
+    Teuchos::Array<RealT> res(1,0);
+    fobj->applyFilter(*fp,*zp,false);
+    con->solve(*rp,*up,*fp,tol);
+    con->outputTpetraVector(u_ptr,"state.txt");
+    con->outputTpetraVector(f_ptr,"filtered_control.txt");
+    con->value(*rp,*up,*fp,tol);
+    r_ptr->norm2(res.view(0,1));
+    *outStream << "Residual Norm: " << res[0] << std::endl;
+    assembler->printDataPDE(pde,u_ptr,f_ptr);
+    errorFlag += (res[0] > 1.e-6 ? 1 : 0);
+    //con->outputTpetraData();
+
+    auto conFilter = ROL::makePtr<PDE_Constraint<RealT,DeviceT>>(pdeFilter,meshMgr,comm,*parlist,*outStream);
+    auto assemblerFilter = conFilter->getAssembler();
+    auto filteru_ptr = assemblerFilter->createStateVector();
+    auto filterz_ptr = assemblerFilter->createControlVector();
+    auto filteru = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(filteru_ptr,pdeFilter,assemblerFilter,*parlist);
+    auto filterz = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(filterz_ptr,pdeFilter,assemblerFilter,*parlist);
+    filteru->setScalar(0.0);
+    filterz->setScalar(0.0);
+    conFilter->solve(*filteru, *filteru, *filterz, tol);
+    conFilter->outputTpetraData();
+
+    // Get a summary from the time monitor.
+    Teuchos::TimeMonitor::summarize();
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/filteredDarcy/filtered_objK.hpp b/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/filteredDarcy/filtered_objK.hpp
new file mode 100644
index 000000000000..ccd46a6aa199
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/filteredDarcy/filtered_objK.hpp
@@ -0,0 +1,317 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  filtered_obj.hpp
+    \brief Implements the filtered velocity matching objective.
+*/
+
+#ifndef FILTERED_OBJK_H
+#define FILTERED_OBJK_H
+
+#include "ROL_Reduced_Objective_SimOpt.hpp"
+#include "../../../../TOOLS/pdeK.hpp"
+#include "../../../../TOOLS/assemblerK.hpp"
+#include "../../../../TOOLS/solver.hpp"
+#include "../../../../TOOLS/pdevectorK.hpp"
+
+template <class Real, class DeviceType>
+class FilteredObjective : public ROL::Objective<Real> {
+private:
+  // Compliance objective function
+  ROL::Ptr<ROL::Reduced_Objective_SimOpt<Real>> obj_;
+
+  // FIlter PDE
+  const ROL::Ptr<PDE<Real,DeviceType>> pde_filter_;
+  ROL::Ptr<Assembler<Real,DeviceType>> assembler_filter_;
+  ROL::Ptr<Solver<Real>> solver_filter_;
+  ROL::Ptr<Tpetra::CrsMatrix<>> matF1_, matF2_;
+
+  // Vector Storage
+  ROL::Ptr<Tpetra::MultiVector<>> f_state_, f_res_;
+  ROL::Ptr<ROL::Vector<Real>> Fz_, Fzcache_, Fv_, dctrl_;
+
+  bool nuke_;
+  int printFreq_;
+
+  unsigned nupda_, nfval_, ngrad_, nhess_, nprec_;
+  unsigned nasFi_, napFi_;
+
+public:
+  FilteredObjective(
+      const ROL::Ptr<ROL::Reduced_Objective_SimOpt<Real>> &obj,
+      const ROL::Ptr<PDE<Real,DeviceType>>                &filter,
+      const ROL::Ptr<MeshManager<Real,DeviceType>>        &mesh,
+      const ROL::Ptr<const Teuchos::Comm<int>>            &comm,
+      ROL::ParameterList                                  &list,
+      std::ostream                                        &stream = std::cout)
+    : obj_(obj), pde_filter_(filter),
+      nupda_(0), nfval_(0), ngrad_(0), nhess_(0), nprec_(0),
+      nasFi_(0), napFi_(0) {
+    // Filter PDE
+    assembler_filter_ = ROL::makePtr<Assembler<Real,DeviceType>>(pde_filter_->getFields(),
+                                                                 pde_filter_->getFields2(),
+                                                                 mesh,comm,list,stream);
+    assembler_filter_->setCellNodes(*pde_filter_);
+    solver_filter_    = ROL::makePtr<Solver<Real>>(list.sublist("Solver"));
+
+    // Need to add parametric capability
+    bool useParamVar = list.sublist("Problem").get("Use Optimal Constant Velocity",false);
+    int dim = 2;
+
+    // Vector storage
+    f_state_ = assembler_filter_->createStateVector();
+    f_res_   = assembler_filter_->createResidualVector();
+    if (useParamVar) {
+      Fz_      = ROL::makePtr<PDE_OptVector<Real>>(
+                   ROL::makePtr<PDE_PrimalSimVector<Real,DeviceType>>(assembler_filter_->createStateVector(),pde_filter_,assembler_filter_,list),
+                   ROL::makePtr<ROL::StdVector<Real>>(dim),
+                   comm->getRank());
+      Fzcache_ = ROL::makePtr<PDE_OptVector<Real>>(
+                   ROL::makePtr<PDE_PrimalSimVector<Real,DeviceType>>(assembler_filter_->createStateVector(),pde_filter_,assembler_filter_,list),
+                   ROL::makePtr<ROL::StdVector<Real>>(dim),
+                   comm->getRank());
+      Fv_      = ROL::makePtr<PDE_OptVector<Real>>(
+                   ROL::makePtr<PDE_PrimalSimVector<Real,DeviceType>>(assembler_filter_->createStateVector(),pde_filter_,assembler_filter_,list),
+                   ROL::makePtr<ROL::StdVector<Real>>(dim),
+                   comm->getRank());
+      dctrl_   = ROL::makePtr<PDE_OptVector<Real>>(
+                   ROL::makePtr<PDE_DualSimVector<Real,DeviceType>>(assembler_filter_->createStateVector(),pde_filter_,assembler_filter_,list),
+                   ROL::makePtr<ROL::StdVector<Real>>(dim),
+                   comm->getRank());
+    }
+    else {
+      Fz_      = ROL::makePtr<PDE_PrimalSimVector<Real,DeviceType>>(assembler_filter_->createStateVector(),pde_filter_,assembler_filter_,list);
+      Fzcache_ = ROL::makePtr<PDE_PrimalSimVector<Real,DeviceType>>(assembler_filter_->createStateVector(),pde_filter_,assembler_filter_,list);
+      Fv_      = ROL::makePtr<PDE_PrimalSimVector<Real,DeviceType>>(assembler_filter_->createStateVector(),pde_filter_,assembler_filter_,list);
+      dctrl_   = ROL::makePtr<PDE_DualSimVector<Real,DeviceType>>(assembler_filter_->createStateVector(),pde_filter_,assembler_filter_,list);
+    }
+
+    nuke_      = list.sublist("Problem").get("Use Basic Update",false);
+    printFreq_ = list.sublist("Problem").get("Output Frequency",0);
+
+    assembleFilter();
+  }
+
+  const ROL::Ptr<Assembler<Real,DeviceType>> getAssembler() const {
+    return assembler_filter_;
+  }
+
+  void update(const ROL::Vector<Real> &z, ROL::UpdateType type, int iter = -1) {
+    nupda_++;
+    if (nuke_) {
+      update_temp(z,iter);
+    }
+    else {
+      switch (type) {
+        case ROL::UpdateType::Initial: update_initial(z,iter); break;
+        case ROL::UpdateType::Accept:  update_accept(z,iter);  break;
+        case ROL::UpdateType::Revert:  update_revert(z,iter);  break;
+        case ROL::UpdateType::Trial:   update_trial(z,iter);   break;
+        case ROL::UpdateType::Temp:    update_temp(z,iter);    break;
+      }
+    }
+    obj_->update(*Fz_,type,iter);
+    // Print
+    if (printFreq_ > 0 && iter%printFreq_ == 0) {
+      std::stringstream dfile;
+      dfile  << "density_"          << iter << ".txt";
+      ROL::Ptr<const Tpetra::MultiVector<>> d_data = getConstField(z);
+      assembler_filter_->outputTpetraVector(d_data, dfile.str());
+    }
+  }
+
+  void update( const ROL::Vector<Real> &z, bool flag = true, int iter = -1 ) {
+    nupda_++;
+    if (nuke_) {
+      update_temp(z,iter);
+    }
+    else {
+      // Trial:    flag = false, iter = -1
+      // Check:    flag = true,  iter = -1
+      // Reject:   flag = false, iter > -1
+      // Accept:   flag = true,  iter > -1
+      if (flag) {
+        if (iter > -1) {
+          update_accept(z,iter);
+        }
+        else {
+          update_temp(z,iter);
+        }
+      }
+      else {
+        if (iter > -1) {
+          update_revert(z,iter);
+        }
+        else {
+          update_trial(z,iter);
+        }
+      }
+    }
+    obj_->update(*Fz_,flag,iter);
+  }
+
+  void printToFile(const ROL::Vector<Real> &z, std::ostream &stream = std::cout,
+                   std::string ufile = "state.txt", std::string dfile = "density.txt",
+                   std::string ffile = "filtered_density.txt") {
+    update(z,ROL::UpdateType::Temp);
+    ROL::Ptr<const Tpetra::MultiVector<>> dens_data = getConstField(z);
+    assembler_filter_->outputTpetraVector(dens_data,  dfile);
+    obj_->printToFile(*Fz_, stream, ufile, ffile);
+  }
+
+  void applyFilter(ROL::Vector<Real> &Fx, const ROL::Vector<Real> &x, bool transpose) {
+    napFi_++;
+    ROL::Ptr<Tpetra::MultiVector<>>      Fx_data = getField(Fx);
+    ROL::Ptr<const Tpetra::MultiVector<>> x_data = getConstField(x);
+    if (transpose) {
+      solver_filter_->solve(f_state_,x_data,false);
+      matF2_->apply(*f_state_,*Fx_data,Teuchos::TRANS);
+    }
+    else {
+      matF2_->apply(*x_data,*f_res_);
+      solver_filter_->solve(Fx_data,f_res_,false);
+    }
+    Fx.scale(static_cast<Real>(-1));
+    copyParameter(Fx,x);
+  }
+
+  Real value( const ROL::Vector<Real> &z, Real &tol ) {
+    nfval_++;
+    return obj_->value(*Fz_,tol);
+  }
+
+  void gradient( ROL::Vector<Real> &g, const ROL::Vector<Real> &z, Real &tol ) {
+    ngrad_++;
+    obj_->gradient(*dctrl_, *Fz_, tol);
+    applyFilter(g, *dctrl_, true);
+  }
+
+  void hessVec( ROL::Vector<Real> &hv, const ROL::Vector<Real> &v, const ROL::Vector<Real> &z, Real &tol ) {
+    nhess_++;
+    applyFilter(*Fv_, v, false);
+    obj_->hessVec(*dctrl_, *Fv_, *Fz_, tol);
+    applyFilter(hv, *dctrl_, true);
+  }
+
+  /** \brief Apply a reduced Hessian preconditioner.
+  */
+  virtual void precond( ROL::Vector<Real> &Pv, const ROL::Vector<Real> &v, const ROL::Vector<Real> &z, Real &tol ) {
+    nprec_++;
+    Pv.set(v.dual());
+  }
+
+  void summarize(std::ostream &stream) const {
+    stream << std::endl;
+    stream << std::string(114,'=') << std::endl;
+    stream << "  Filtered_Objective_Compliance::summarize" << std::endl;
+    stream << "    Number of calls to update:          " << nupda_ << std::endl;
+    stream << "    Number of calls to value:           " << nfval_ << std::endl;
+    stream << "    Number of calls to gradient:        " << ngrad_ << std::endl;
+    stream << "    Number of calls to hessVec:         " << nhess_ << std::endl;
+    stream << "    Number of calls to precond:         " << nprec_ << std::endl;
+    stream << "    Number of filter assemblies:        " << nasFi_ << std::endl;
+    stream << "    Number of filter applies:           " << napFi_ << std::endl;
+    stream << std::string(114,'=') << std::endl;
+    stream << std::endl;
+    obj_->summarize(stream);
+  }
+
+// For parametrized (stochastic) objective functions and constraints
+public:
+  void setParameter(const std::vector<Real> &param) {
+    ROL::Objective<Real>::setParameter(param);
+    obj_->setParameter(param);
+  }
+
+private:
+
+  ROL::Ptr<const Tpetra::MultiVector<>> getConstField(const ROL::Vector<Real> &x) const {
+    ROL::Ptr<const Tpetra::MultiVector<>> xp;
+    try {
+      xp = dynamic_cast<const ROL::TpetraMultiVector<Real>&>(x).getVector();
+    }
+    catch (std::exception &e) {
+      try {
+        ROL::Ptr<const ROL::TpetraMultiVector<Real>> xvec
+          = dynamic_cast<const PDE_OptVector<Real>&>(x).getField();
+        if (xvec == ROL::nullPtr) {
+          xp = ROL::nullPtr;
+        }
+        else {
+          xp = xvec->getVector();
+        }
+      }
+      catch (std::exception &ee) {
+        xp = ROL::nullPtr;
+      }
+    }
+    return xp;
+  }
+
+  ROL::Ptr<Tpetra::MultiVector<>> getField(ROL::Vector<Real> &x) const {
+    ROL::Ptr<Tpetra::MultiVector<>> xp;
+    try {
+      xp = dynamic_cast<ROL::TpetraMultiVector<Real>&>(x).getVector();
+    }
+    catch (std::exception &e) {
+      try {
+        ROL::Ptr<ROL::TpetraMultiVector<Real>> xvec
+          = dynamic_cast<PDE_OptVector<Real>&>(x).getField();
+        if ( xvec == ROL::nullPtr ) {
+          xp = ROL::nullPtr;
+        }
+        else {
+          xp = xvec->getVector();
+        }
+      }
+      catch (std::exception &ee) {
+        xp = ROL::nullPtr;
+      }
+    }
+    return xp;
+  }
+
+  void copyParameter(ROL::Vector<Real> &x, const ROL::Vector<Real> &y) const {
+    try {
+      dynamic_cast<PDE_OptVector<Real>&>(x).getParameter()->set(*dynamic_cast<const PDE_OptVector<Real>&>(y).getParameter());
+    }
+    catch (std::exception &e) {}
+  }
+
+  void assembleFilter(void) {
+    nasFi_++;
+    ROL::Ptr<Tpetra::MultiVector<>> f_ctrl = assembler_filter_->createControlVector();
+    assembler_filter_->assemblePDEJacobian1(matF1_,pde_filter_,f_state_,f_ctrl);
+    assembler_filter_->assemblePDEJacobian2(matF2_,pde_filter_,f_state_,f_ctrl);
+    solver_filter_->setA(matF1_);
+  }
+
+  void update_initial( const ROL::Vector<Real> &z, int iter ) {
+    applyFilter(*Fz_,z,false);
+    Fzcache_->set(*Fz_);
+  }
+
+  void update_accept( const ROL::Vector<Real> &z, int iter ) {
+    Fzcache_->set(*Fz_);
+  }
+
+  void update_temp( const ROL::Vector<Real> &z, int iter ) {
+    applyFilter(*Fz_,z,false);
+  }
+
+  void update_trial( const ROL::Vector<Real> &z, int iter ) {
+    applyFilter(*Fz_,z,false);
+  }
+
+  void update_revert( const ROL::Vector<Real> &z, int iter ) {
+    Fz_->set(*Fzcache_);
+  }
+}; // class FilteredObjective
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/filteredDarcy/obj_darcyK.hpp b/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/filteredDarcy/obj_darcyK.hpp
new file mode 100644
index 000000000000..c85b9a4ae8db
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/filteredDarcy/obj_darcyK.hpp
@@ -0,0 +1,1343 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  obj.hpp
+    \brief Provides the interface for local (cell-based) objective function computations.
+*/
+
+#ifndef OBJ_DARCYK_HPP
+#define OBJ_DARCYK_HPP
+
+#include "../../../../TOOLS/qoiK.hpp"
+#include "pde_darcyK.hpp"
+#include "permeabilityK.hpp"
+
+template <class Real, class DeviceType>
+class QoI_Velocity_Darcy : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type> fePrs_, feCtrl_;
+  const std::vector<ROL::Ptr<fe_type>> fePrsBdry_, feCtrlBdry_;
+  const std::vector<std::vector<int>> bdryCellLocIds_;
+  const ROL::Ptr<Permeability<Real,DeviceType>> perm_;
+  std::vector<Real> target_;
+  bool onlyAxial_;
+
+  scalar_view getBoundaryCoeff(const scalar_view cell_coeff, int locSideId, const ROL::Ptr<fe_type> &fe) const {
+    std::vector<int> bdryCellLocId = bdryCellLocIds_[locSideId];
+    const int numCellsSide = bdryCellLocId.size();
+    const int f = fe->N().extent_int(1);
+    
+    scalar_view bdry_coeff("bdry_coeff", numCellsSide, f);
+    for (int i = 0; i < numCellsSide; ++i) {
+      for (int j = 0; j < f; ++j) {
+        bdry_coeff(i, j) = cell_coeff(bdryCellLocId[i], j);
+      }
+    }
+    return bdry_coeff;
+  }
+
+public:
+  QoI_Velocity_Darcy(ROL::ParameterList &list,
+                      const ROL::Ptr<fe_type> &fePrs,
+                      const ROL::Ptr<fe_type> &feCtrl,
+                      const std::vector<ROL::Ptr<fe_type>> &fePrsBdry,
+                      const std::vector<ROL::Ptr<fe_type>> &feCtrlBdry,
+                      const std::vector<std::vector<int>> &bdryCellLocIds,
+                      const ROL::Ptr<Permeability<Real,DeviceType>> &perm)
+    : fePrs_(fePrs), feCtrl_(feCtrl),
+      fePrsBdry_(fePrsBdry), feCtrlBdry_(feCtrlBdry),
+      bdryCellLocIds_(bdryCellLocIds),
+      perm_(perm) {
+    target_.clear(); target_.resize(2);
+    target_[0] = list.sublist("Problem").get("Target Radial Velocity",0.0);
+    target_[1] = list.sublist("Problem").get("Target Axial Velocity",-15.0);
+    onlyAxial_ = list.sublist("Problem").get("Only Use Axial Velocity",false);
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = ROL::nullPtr,
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c = fePrs_->gradN().extent_int(0);
+    const int d = fePrs_->gradN().extent_int(3);
+    // Initialize output val
+    val = scalar_view("val", c);
+    // Compute cost integral
+    std::vector<Real> target = (z_param == ROL::nullPtr) ? target_ : *z_param;
+    const int numLocSides = bdryCellLocIds_.size();
+    for (int l = 0; l < numLocSides; ++l) {
+      const int numCellsSide = bdryCellLocIds_[l].size();
+      if ( numCellsSide ) {
+        const int numCubPerSide = fePrsBdry_[l]->cubPts().extent_int(1);
+        scalar_view valU_eval("valU_eval", numCellsSide, numCubPerSide, d);
+        scalar_view valZ_eval("valZ_eval", numCellsSide, numCubPerSide);
+        scalar_view alpha("alpha", numCellsSide, numCubPerSide);
+        scalar_view intVal("intVal", numCellsSide);
+        scalar_view u_coeff_bdry = getBoundaryCoeff(u_coeff, l, fePrs_);
+        scalar_view z_coeff_bdry = getBoundaryCoeff(z_coeff, l, feCtrl_);
+        fePrsBdry_[l]->evaluateGradient(valU_eval, u_coeff_bdry);
+        feCtrlBdry_[l]->evaluateValue(valZ_eval, z_coeff_bdry);
+        perm_->compute(alpha, valZ_eval, fePrsBdry_[l]->cubPts(), 0);
+        for (int i = 0; i < numCellsSide; ++i) {
+          for (int j = 0; j < numCubPerSide; ++j) {
+            for (int k = 0; k < d; ++k) {
+              valU_eval(i,j,k) *= -alpha(i,j);
+              if (k==0 && onlyAxial_) valU_eval(i,j,k) = static_cast<Real>(0);
+              else                    valU_eval(i,j,k) -= target[k];
+            }
+          }
+        }
+        fePrsBdry_[l]->computeIntegral(intVal,valU_eval,valU_eval,true);
+        // Add to integral value
+        for (int i = 0; i < numCellsSide; ++i) {
+          int cidx = bdryCellLocIds_[l][i];
+          val(cidx) += static_cast<Real>(0.5)*intVal(i);
+        }
+      }
+    }
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = ROL::nullPtr,
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c = fePrs_->gradN().extent_int(0);
+    const int f = fePrs_->gradN().extent_int(1);
+    const int d = fePrs_->gradN().extent_int(3);
+    // Initialize output grad
+    grad = scalar_view("grad", c, f);
+    // Compute cost integral
+    std::vector<Real> target = (z_param == ROL::nullPtr) ? target_ : *z_param;
+    const int numLocSides = bdryCellLocIds_.size();
+    for (int l = 0; l < numLocSides; ++l) {
+      const int numCellsSide  = bdryCellLocIds_[l].size();
+      if ( numCellsSide ) {
+        const int numCubPerSide = fePrsBdry_[l]->cubPts().extent_int(1);
+        scalar_view valU_eval("valU_eval", numCellsSide, numCubPerSide, d);
+        scalar_view valZ_eval("valZ_eval", numCellsSide, numCubPerSide);
+        scalar_view alpha("alpha", numCellsSide, numCubPerSide);
+        scalar_view intGrad("intGrad", numCellsSide, f);
+        scalar_view u_coeff_bdry = getBoundaryCoeff(u_coeff, l, fePrs_);
+        scalar_view z_coeff_bdry = getBoundaryCoeff(z_coeff, l, feCtrl_);
+        fePrsBdry_[l]->evaluateGradient(valU_eval, u_coeff_bdry);
+        feCtrlBdry_[l]->evaluateValue(valZ_eval, z_coeff_bdry);
+        perm_->compute(alpha, valZ_eval, fePrsBdry_[l]->cubPts(), 0);
+        for (int i = 0; i < numCellsSide; ++i) {
+          for (int j = 0; j < numCubPerSide; ++j) {
+            for (int k = 0; k < d; ++k) {
+              valU_eval(i,j,k) *= -alpha(i,j);
+              if (k==0 && onlyAxial_) valU_eval(i,j,k) = static_cast<Real>(0);
+              else                    valU_eval(i,j,k) -= target[k];
+              valU_eval(i,j,k) *= -alpha(i,j);
+            }
+          }
+        }
+        // Compute gradient of squared L2-norm of diff
+        fst::integrate(intGrad,valU_eval,fePrsBdry_[l]->gradNdetJ(),false);
+        // Add to integral value
+        for (int i = 0; i < numCellsSide; ++i) {
+          int cidx = bdryCellLocIds_[l][i];
+          for (int j = 0; j < f; ++j) {
+            grad(cidx,j) += intGrad(i,j);
+          }
+        }
+      }
+    }
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = ROL::nullPtr,
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c  = fePrs_->gradN().extent_int(0);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int d  = fePrs_->gradN().extent_int(3);
+    // Initialize output grad
+    grad = scalar_view("grad", c, fc);
+    // Compute cost integral
+    std::vector<Real> target = (z_param == ROL::nullPtr) ? target_ : *z_param;
+    const int numLocSides = bdryCellLocIds_.size();
+    for (int l = 0; l < numLocSides; ++l) {
+      const int numCellsSide = bdryCellLocIds_[l].size();
+      if ( numCellsSide ) {
+        const int numCubPerSide = fePrsBdry_[l]->cubPts().extent_int(1);
+        scalar_view valU_eval("valU_eval", numCellsSide, numCubPerSide, d);
+        scalar_view valZ_eval("valZ_eval", numCellsSide, numCubPerSide);
+        scalar_view alpha("alpha", numCellsSide, numCubPerSide);
+        scalar_view dalpha("dalpha", numCellsSide, numCubPerSide);
+        scalar_view integrand("integrad", numCellsSide, numCubPerSide);
+        scalar_view intGrad("intGrad", numCellsSide, fc);
+        scalar_view u_coeff_bdry = getBoundaryCoeff(u_coeff, l, fePrs_);
+        scalar_view z_coeff_bdry = getBoundaryCoeff(z_coeff, l, feCtrl_);
+        fePrsBdry_[l]->evaluateGradient(valU_eval, u_coeff_bdry);
+        feCtrlBdry_[l]->evaluateValue(valZ_eval, z_coeff_bdry);
+        perm_->compute(alpha, valZ_eval, fePrsBdry_[l]->cubPts(), 0);
+        perm_->compute(dalpha, valZ_eval, fePrsBdry_[l]->cubPts(), 1);
+        Real dalphaU(0), misfit(0);
+        for (int i = 0; i < numCellsSide; ++i) {
+          for (int j = 0; j < numCubPerSide; ++j) {
+            for (int k = 0; k < d; ++k) {
+              dalphaU = -dalpha(i,j) * valU_eval(i,j,k);
+              if (k==0 && onlyAxial_) misfit = static_cast<Real>(0);
+              else                    misfit = -alpha(i,j) * valU_eval(i,j,k) - target[k];
+              integrand(i,j) += dalphaU * misfit;
+            }
+          }
+        }
+        // Compute gradient of squared L2-norm of diff
+        fst::integrate(intGrad,integrand,feCtrlBdry_[l]->NdetJ(),false);
+        // Add to integral value
+        for (int i = 0; i < numCellsSide; ++i) {
+          int cidx = bdryCellLocIds_[l][i];
+          for (int j = 0; j < fc; ++j) {
+            grad(cidx,j) += intGrad(i,j);
+          }
+        }
+      }
+    }
+  }
+
+  std::vector<Real> gradient_3(std::vector<scalar_view> & grad,
+                               const scalar_view u_coeff,
+                               const scalar_view z_coeff = ROL::nullPtr,
+                               const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    if (z_param != ROL::nullPtr) {
+      // Get relevant dimensions
+      const int c = fePrs_->gradN().extent_int(0);
+      const int d = fePrs_->gradN().extent_int(3);
+      // Initialize output val
+      std::vector<Real> g_param(d,static_cast<Real>(0));
+      grad.clear(); grad.resize(d);
+      for (int k = 0; k < d; ++k)
+        grad[k] = scalar_view("grad", c);
+      // Compute cost integral
+      const int numLocSides = bdryCellLocIds_.size();
+      for (int l = 0; l < numLocSides; ++l) {
+        const int numCellsSide  = bdryCellLocIds_[l].size();
+        if ( numCellsSide ) {
+          const int numCubPerSide = fePrsBdry_[l]->cubPts().extent_int(1);
+          std::vector<scalar_view> intVal(d);
+          scalar_view valU_eval("valU_eval", numCellsSide, numCubPerSide, d);
+          scalar_view valZ_eval("valZ_eval", numCellsSide, numCubPerSide);
+          scalar_view integrand("integrand", numCellsSide, numCubPerSide);
+          scalar_view weight("weight", numCellsSide, numCubPerSide);
+          scalar_view alpha("alpha", numCellsSide, numCubPerSide);
+          scalar_view u_coeff_bdry = getBoundaryCoeff(u_coeff, l, fePrs_);
+          scalar_view z_coeff_bdry = getBoundaryCoeff(z_coeff, l, feCtrl_);
+          fePrsBdry_[l]->evaluateGradient(valU_eval, u_coeff_bdry);
+          feCtrlBdry_[l]->evaluateValue(valZ_eval, z_coeff_bdry);
+          perm_->compute(alpha, valZ_eval, fePrsBdry_[l]->cubPts(), 0);
+          for (int k = 0; k < d; ++k) {
+            intVal[k] = scalar_view(numCellsSide);
+            if ((k==0 && !onlyAxial_) || k==1) {
+              integrand->initialize();
+              for (int i = 0; i < numCellsSide; ++i) {
+                for (int j = 0; j < numCubPerSide; ++j) {
+                  weight(i,j) = static_cast<Real>(1);
+                  integrand(i,j) = (*z_param)[k] + alpha(i,j) * valU_eval(i,j,k);
+                }
+              }
+              fePrsBdry_[l]->computeIntegral(intVal[k],weight,integrand,true);
+            }
+          }
+          // Add to integral value
+          for (int k = 0; k < d; ++k) {
+            for (int i = 0; i < numCellsSide; ++i) {
+              int cidx = bdryCellLocIds_[l][i];
+              (grad[k])(cidx) += (intVal[k])(i);
+            }
+          }
+        }
+      }
+      return g_param;
+    }
+    else {
+      throw Exception::Zero(">>> QoI_Velocity_Darcy::gradient_3 is zero.");
+    }
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = ROL::nullPtr,
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c = fePrs_->gradN().extent_int(0);
+    const int f = fePrs_->gradN().extent_int(1);
+    const int d = fePrs_->gradN().extent_int(3);
+    // Initialize output grad
+    hess = scalar_view("hess", c, f);
+    // Compute cost integral
+    const int numLocSides = bdryCellLocIds_.size();
+    for (int l = 0; l < numLocSides; ++l) {
+      const int numCellsSide  = bdryCellLocIds_[l].size();
+      if ( numCellsSide ) {
+        const int numCubPerSide = fePrsBdry_[l]->cubPts().extent_int(1);
+        scalar_view valV_eval("valV_eval", numCellsSide, numCubPerSide, d);
+        scalar_view valZ_eval("valZ_eval", numCellsSide, numCubPerSide);
+        scalar_view alpha("alpha", numCellsSide, numCubPerSide);
+        scalar_view intHess("intHess", numCellsSide, f);
+        scalar_view v_coeff_bdry = getBoundaryCoeff(v_coeff, l, fePrs_);
+        scalar_view z_coeff_bdry = getBoundaryCoeff(z_coeff, l, feCtrl_);
+        fePrsBdry_[l]->evaluateGradient(valV_eval, v_coeff_bdry);
+        feCtrlBdry_[l]->evaluateValue(valZ_eval, z_coeff_bdry);
+        perm_->compute(alpha, valZ_eval, fePrsBdry_[l]->cubPts(), 0);
+        for (int i = 0; i < numCellsSide; ++i) {
+          for (int j = 0; j < numCubPerSide; ++j) {
+            for (int k = 0; k < d; ++k) {
+              valV_eval(i,j,k) *= (-alpha(i,j)) * (-alpha(i,j));
+                if (k==0 && onlyAxial_) valV_eval(i,j,k) = static_cast<Real>(0);
+            }
+          }
+        }
+        // Compute hessian of squared L2-norm of diff
+        fst::integrate(intHess,valV_eval,fePrsBdry_[l]->gradNdetJ(),false);
+        // Add to integral value
+        for (int i = 0; i < numCellsSide; ++i) {
+          int cidx = bdryCellLocIds_[l][i];
+          for (int j = 0; j < f; ++j) {
+            hess(cidx,j) += intHess(i,j);
+          }
+        }
+      }
+    }
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = ROL::nullPtr,
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c  = fePrs_->gradN().extent_int(0);
+    const int f  = fePrs_->gradN().extent_int(1);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int d  = fePrs_->gradN().extent_int(3);
+    // Initialize output grad
+    hess = scalar_view("hess", c, f);
+    // Compute cost integral
+    std::vector<Real> target = (z_param == ROL::nullPtr) ? target_ : *z_param;
+    const int numLocSides = bdryCellLocIds_.size();
+    for (int l = 0; l < numLocSides; ++l) {
+      const int numCellsSide  = bdryCellLocIds_[l].size();
+      if ( numCellsSide ) {
+        const int numCubPerSide = fePrsBdry_[l]->cubPts().extent_int(1);
+        scalar_view valU_eval("valU_eval", numCellsSide, numCubPerSide, d);
+        scalar_view valZ_eval("valZ_eval", numCellsSide, numCubPerSide);
+        scalar_view valV_eval("valV_eval", numCellsSide, numCubPerSide);
+        scalar_view alpha("alpha", numCellsSide, numCubPerSide);
+        scalar_view dalpha("dalpha", numCellsSide, numCubPerSide);
+        scalar_view integrand("integrand", numCellsSide, numCubPerSide, d);
+        scalar_view intHess("intHess", numCellsSide, f);
+        scalar_view u_coeff_bdry = getBoundaryCoeff(u_coeff, l, fePrs_);
+        scalar_view z_coeff_bdry = getBoundaryCoeff(z_coeff, l, feCtrl_);
+        scalar_view v_coeff_bdry = getBoundaryCoeff(v_coeff, l, feCtrl_);
+        fePrsBdry_[l]->evaluateGradient(valU_eval, u_coeff_bdry);
+        feCtrlBdry_[l]->evaluateValue(valZ_eval, z_coeff_bdry);
+        feCtrlBdry_[l]->evaluateValue(valV_eval, v_coeff_bdry);
+        perm_->compute(alpha, valZ_eval, fePrsBdry_[l]->cubPts(), 0);
+        perm_->compute(dalpha, valZ_eval, fePrsBdry_[l]->cubPts(), 1);
+        Real dalphaV(0), misfit(0), dmisfit(0);
+        for (int i = 0; i < numCellsSide; ++i) {
+          for (int j = 0; j < numCubPerSide; ++j) {
+            for (int k = 0; k < d; ++k) {
+              dalphaV = -dalpha(i,j) * valV_eval(i,j);
+              if (k==0 && onlyAxial_) {
+                misfit  = static_cast<Real>(0);
+                dmisfit = static_cast<Real>(0);
+              }
+              else {
+                misfit  = -alpha(i,j)*valU_eval(i,j,k) - target[k];
+                dmisfit = -alpha(i,j)*valU_eval(i,j,k);
+              }
+              integrand(i,j,k) = dalphaV * (misfit + dmisfit);
+            }
+          }
+        }
+        // Compute gradient of squared L2-norm of diff
+        fst::integrate(intHess,integrand,fePrsBdry_[l]->gradNdetJ(),false);
+        // Add to integral value
+        for (int i = 0; i < numCellsSide; ++i) {
+          int cidx = bdryCellLocIds_[l][i];
+          for (int j = 0; j < f; ++j) {
+            hess(cidx,j) += intHess(i,j);
+          }
+        }
+      }
+    }
+  }
+
+  void HessVec_13(scalar_view & hess,
+                  const ROL::Ptr<const std::vector<Real>> & v_param,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = ROL::nullPtr,
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    if (z_param != ROL::nullPtr) {
+      // Get relevant dimensions
+      const int c = fePrs_->gradN().extent_int(0);
+      const int f = fePrs_->gradN().extent_int(1);
+      const int d = fePrs_->gradN().extent_int(3);
+      // Initialize output grad
+      hess = scalar_view("hess", c, f);
+      // Compute cost integral
+      const int numLocSides = bdryCellLocIds_.size();
+      for (int l = 0; l < numLocSides; ++l) {
+        const int numCellsSide  = bdryCellLocIds_[l].size();
+        if ( numCellsSide ) {
+          const int numCubPerSide = fePrsBdry_[l]->cubPts().extent_int(1);
+          scalar_view weight("weight", numCellsSide, numCubPerSide, d);
+          scalar_view valZ_eval("valZ_eval", numCellsSide, numCubPerSide);
+          scalar_view alpha("alpha", numCellsSide, numCubPerSide);
+          scalar_view intHess("intHess", numCellsSide, f);
+          scalar_view z_coeff_bdry = getBoundaryCoeff(z_coeff, l, feCtrl_);
+          feCtrlBdry_[l]->evaluateValue(valZ_eval, z_coeff_bdry);
+          perm_->compute(alpha, valZ_eval, fePrsBdry_[l]->cubPts(), 0);
+          for (int k = 0; k < d; ++k) {
+            if ((k==0 && !onlyAxial_) || k==1) {
+              for (int i = 0; i < numCellsSide; ++i) {
+                for (int j = 0; j < numCubPerSide; ++j) {
+                  weight(i,j,k) = (*v_param)[k] * (-alpha(i,j));
+                }
+              }
+              // Compute gradient of squared L2-norm of diff
+              fst::integrate(intHess,weight,fePrsBdry_[l]->gradNdetJ(),false);
+              // Add to integral value
+              for (int i = 0; i < numCellsSide; ++i) {
+                int cidx = bdryCellLocIds_[l][i];
+                for (int j = 0; j < f; ++j) {
+                  hess(cidx,j) += intHess(i,j);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    else {
+      throw Exception::NotImplemented(">>> HessVec_13 not implemented.");
+    }
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = ROL::nullPtr,
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c  = fePrs_->gradN().extent_int(0);
+    const int f  = fePrs_->gradN().extent_int(1);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int d  = fePrs_->gradN().extent_int(3);
+    // Initialize output grad
+    hess = scalar_view("hess", c, fc);
+    // Compute cost integral
+    std::vector<Real> target = (z_param == ROL::nullPtr) ? target_ : *z_param;
+    const int numLocSides = bdryCellLocIds_.size();
+    for (int l = 0; l < numLocSides; ++l) {
+      const int numCellsSide  = bdryCellLocIds_[l].size();
+      if ( numCellsSide ) {
+        const int numCubPerSide = fePrsBdry_[l]->cubPts().extent_int(1);
+        scalar_view valU_eval("valU_eval", numCellsSide, numCubPerSide, d);
+        scalar_view valZ_eval("valZ_eval", numCellsSide, numCubPerSide);
+        scalar_view valV_eval("valV_eval", numCellsSide, numCubPerSide, d);
+        scalar_view alpha("alpha", numCellsSide, numCubPerSide);
+        scalar_view dalpha("dalpha", numCellsSide, numCubPerSide);
+        scalar_view integrand("integrand", numCellsSide, numCubPerSide);
+        scalar_view intHess("intHess", numCellsSide, fc);
+        scalar_view u_coeff_bdry = getBoundaryCoeff(u_coeff, l, fePrs_);
+        scalar_view z_coeff_bdry = getBoundaryCoeff(z_coeff, l, feCtrl_);
+        scalar_view v_coeff_bdry = getBoundaryCoeff(v_coeff, l, fePrs_);
+        fePrsBdry_[l]->evaluateGradient(valU_eval, u_coeff_bdry);
+        feCtrlBdry_[l]->evaluateValue(valZ_eval, z_coeff_bdry);
+        fePrsBdry_[l]->evaluateGradient(valV_eval, v_coeff_bdry);
+        perm_->compute(alpha, valZ_eval, fePrsBdry_[l]->cubPts(), 0);
+        perm_->compute(dalpha, valZ_eval, fePrsBdry_[l]->cubPts(), 1);
+        Real dalphaV(0), misfit(0), dmisfit(0);
+        for (int i = 0; i < numCellsSide; ++i) {
+          for (int j = 0; j < numCubPerSide; ++j) {
+            for (int k = 0; k < d; ++k) {
+              dalphaV = -dalpha(i,j) * valV_eval(i,j,k);
+              if (k==0 && onlyAxial_) {
+                misfit  = static_cast<Real>(0);
+                dmisfit = static_cast<Real>(0);
+              }
+              else {
+                misfit  = -alpha(i,j)*valU_eval(i,j,k) - target[k];
+                dmisfit = -alpha(i,j)*valU_eval(i,j,k);
+              }
+              integrand(i,j) += dalphaV * (misfit + dmisfit);
+            }
+          }
+        }
+        // Compute gradient of squared L2-norm of diff
+        fst::integrate(intHess,integrand,feCtrlBdry_[l]->NdetJ(),false);
+        // Add to integral value
+        for (int i = 0; i < numCellsSide; ++i) {
+          int cidx = bdryCellLocIds_[l][i];
+          for (int j = 0; j < fc; ++j) {
+            hess(cidx,j) += intHess(i,j);
+          }
+        }
+      }
+    }
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = ROL::nullPtr,
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c  = fePrs_->gradN().extent_int(0);
+    const int f  = fePrs_->gradN().extent_int(1);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int d  = fePrs_->gradN().extent_int(3);
+    // Initialize output grad
+    hess = scalar_view("hess", c, fc);
+    // Compute cost integral
+    std::vector<Real> target = (z_param == ROL::nullPtr) ? target_ : *z_param;
+    const int numLocSides = bdryCellLocIds_.size();
+    for (int l = 0; l < numLocSides; ++l) {
+      const int numCellsSide  = bdryCellLocIds_[l].size();
+      if ( numCellsSide ) {
+        const int numCubPerSide = fePrsBdry_[l]->cubPts().extent_int(1);
+        scalar_view valU_eval("valU_eval", numCellsSide, numCubPerSide, d);
+        scalar_view valZ_eval("valZ_eval", numCellsSide, numCubPerSide);
+        scalar_view valV_eval("valV_eval", numCellsSide, numCubPerSide);
+        scalar_view alpha("alpha", numCellsSide, numCubPerSide);
+        scalar_view dalpha("dalpha", numCellsSide, numCubPerSide);
+        scalar_view ddalpha("ddalpha", numCellsSide, numCubPerSide);
+        scalar_view integrand("integrand", numCellsSide, numCubPerSide);
+        scalar_view intHess("intHess", numCellsSide, fc);
+        scalar_view u_coeff_bdry = getBoundaryCoeff(u_coeff, l, fePrs_);
+        scalar_view z_coeff_bdry = getBoundaryCoeff(z_coeff, l, feCtrl_);
+        scalar_view v_coeff_bdry = getBoundaryCoeff(v_coeff, l, feCtrl_);
+        fePrsBdry_[l]->evaluateGradient(valU_eval, u_coeff_bdry);
+        feCtrlBdry_[l]->evaluateValue(valZ_eval, z_coeff_bdry);
+        feCtrlBdry_[l]->evaluateValue(valV_eval, v_coeff_bdry);
+        perm_->compute(alpha, valZ_eval, fePrsBdry_[l]->cubPts(), 0);
+        perm_->compute(dalpha, valZ_eval, fePrsBdry_[l]->cubPts(), 1);
+        perm_->compute(ddalpha, valZ_eval, fePrsBdry_[l]->cubPts(), 2);
+        Real dalphaV(0), misfit(0), dmisfit(0), ddalphaV(0);
+        for (int i = 0; i < numCellsSide; ++i) {
+          for (int j = 0; j < numCubPerSide; ++j) {
+            for (int k = 0; k < d; ++k) {
+              dalphaV  = -dalpha(i,j) * valV_eval(i,j) * valU_eval(i,j,k);
+              ddalphaV = -ddalpha(i,j) * valV_eval(i,j) * valU_eval(i,j,k);
+              if (k==0 && onlyAxial_) {
+                misfit  = static_cast<Real>(0);
+                dmisfit = static_cast<Real>(0);
+              }
+              else {
+                misfit  = -alpha(i,j)*valU_eval(i,j,k) - target[k];
+                dmisfit = -dalpha(i,j)*valU_eval(i,j,k);
+              }
+              integrand(i,j) += dalphaV * dmisfit + ddalphaV * misfit;
+            }
+          }
+        }
+        // Compute gradient of squared L2-norm of diff
+        fst::integrate(intHess,integrand,feCtrlBdry_[l]->NdetJ(),false);
+        // Add to integral value
+        for (int i = 0; i < numCellsSide; ++i) {
+          int cidx = bdryCellLocIds_[l][i];
+          for (int j = 0; j < fc; ++j) {
+            hess(cidx,j) += intHess(i,j);
+          }
+        }
+      }
+    }
+  }
+
+  void HessVec_23(scalar_view & hess,
+                  const ROL::Ptr<const std::vector<Real>> & v_param,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = ROL::nullPtr,
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Velocity_Darcy::HessVec_23 is zero.");
+  }
+
+  std::vector<Real> HessVec_31(std::vector<scalar_view> & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = ROL::nullPtr,
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    if (z_param != ROL::nullPtr) {
+      // Get relevant dimensions
+      const int c = fePrs_->gradN().extent_int(0);
+      const int d = fePrs_->gradN().extent_int(3);
+      // Initialize output val
+      std::vector<Real> h_param(d,static_cast<Real>(0));
+      hess.clear(); hess.resize(d);
+      for (int k = 0; k < d; ++k)
+        hess[k] = scalar_view("hess", c);
+      // Compute cost integral
+      const int numLocSides = bdryCellLocIds_.size();
+      for (int l = 0; l < numLocSides; ++l) {
+        const int numCellsSide  = bdryCellLocIds_[l].size();
+        if ( numCellsSide ) {
+          const int numCubPerSide = fePrsBdry_[l]->cubPts().extent_int(1);
+          std::vector<scalar_view> intVal(d);
+          scalar_view valV_eval("valV_eval", numCellsSide, numCubPerSide);
+          scalar_view weight("weight", numCellsSide, numCubPerSide);
+          for (int k = 0; k < d; ++k) {
+            intVal[k] = scalar_view("intVal", numCellsSide);
+            if ((k==0 && !onlyAxial_) || k==1) {
+              scalar_view v_coeff_bdry = getBoundaryCoeff(v_coeff, l, fePrs_);
+              valV_eval->initialize();
+              fePrsBdry_[l]->evaluateValue(valV_eval, v_coeff_bdry);
+              for (int i = 0; i < numCellsSide; ++i) {
+                for (int j = 0; j < numCubPerSide; ++j) {
+                  weight(i,j) = static_cast<Real>(1);
+                  valV_eval(i,j) *= static_cast<Real>(-1);
+                }
+              }
+              fePrsBdry_[l]->computeIntegral(intVal[k],weight,valV_eval,true);
+            }
+          }
+          // Add to integral value
+          for (int k = 0; k < d; ++k) {
+            for (int i = 0; i < numCellsSide; ++i) {
+              int cidx = bdryCellLocIds_[l][i];
+              (hess[k])(cidx) += (intVal[k])(i);
+            }
+          }
+        }
+      }
+      return h_param;
+    }
+    else {
+      throw Exception::Zero(">>> QoI_Velocity_Darcy::HessVec_31 is zero.");
+    }
+  }
+
+  std::vector<Real> HessVec_32(std::vector<scalar_view> & hess,
+                               const scalar_view v_coeff,
+                               const scalar_view u_coeff,
+                               const scalar_view z_coeff = ROL::nullPtr,
+                               const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> HessVec_32 is zero.");
+  }
+
+  std::vector<Real> HessVec_33(std::vector<scalar_view> & hess,
+                               const ROL::Ptr<const std::vector<Real>> & v_param,
+                               const scalar_view u_coeff,
+                               const scalar_view z_coeff = ROL::nullPtr,
+                               const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    if (z_param != ROL::nullPtr) {
+      // Get relevant dimensions
+      const int c = fePrs_->gradN().extent_int(0);
+      const int d = fePrs_->gradN().extent_int(3);
+      // Initialize output val
+      std::vector<Real> h_param(d,static_cast<Real>(0));
+      hess.clear(); hess.resize(d);
+      for (int k = 0; k < d; ++k)
+        hess[k] = scalar_view("hess", c);
+      // Compute cost integral
+      const int numLocSides = bdryCellLocIds_.size();
+      for (int l = 0; l < numLocSides; ++l) {
+        const int numCellsSide  = bdryCellLocIds_[l].size();
+        if ( numCellsSide ) {
+          const int numCubPerSide = fePrsBdry_[l]->cubPts().extent_int(1);
+          std::vector<scalar_view> intVal(d);
+          scalar_view valU_eval("valU_eval", numCellsSide, numCubPerSide);
+          scalar_view weight("weight", numCellsSide, numCubPerSide);
+          for (int k = 0; k < d; ++k) {
+            intVal[k] = scalar_view("intVal", numCellsSide);
+            if ((k==0 && !onlyAxial_) || k==1) {
+              valU_eval->initialize();
+              for (int i = 0; i < numCellsSide; ++i) {
+                for (int j = 0; j < numCubPerSide; ++j) {
+                  weight(i,j) = static_cast<Real>(1);
+                  valU_eval(i,j) = (*v_param)[k];
+                }
+              }
+              fePrsBdry_[l]->computeIntegral(intVal[k],weight,valU_eval,true);
+            }
+          }
+          // Add to integral value
+          for (int k = 0; k < d; ++k) {
+            for (int i = 0; i < numCellsSide; ++i) {
+              int cidx = bdryCellLocIds_[l][i];
+              (hess[k])(cidx) += (intVal[k])(i);
+            }
+          }
+        }
+      }
+      return h_param;
+    }
+    else {
+      throw Exception::Zero(">>> QoI_Velocity_Darcy::HessVec_33 is zero.");
+    }
+  }
+
+}; // QoI_Velocity_Darcy
+
+
+template <class Real, class DeviceType>
+class QoI_VelocityTracking_Darcy : public QoI<Real, DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type> fePrs_, feCtrl_;
+  const ROL::Ptr<Permeability<Real,DeviceType>> perm_;
+  scalar_view target_, weight_;
+  Real rad_, yvel_, frac_, twpow_, top_;
+  std::vector<Real> rvec_, zvec_;
+  int targetType_;
+  bool onlyAxial_, optimizeFrac_, invertFrac_, polyWeight_, veloWeight_;
+
+  Real xTarget(const std::vector<Real> &x) const {
+    const Real X = x[0], Y = x[1];
+    //return xWeight(x) ? -X*Y/(rad_*rad_-Y*Y) : zero;
+    //return xWeight(x) ? -X*Y/std::sqrt(rad_*rad_-Y*Y) : zero;
+    //return polyWeight(x) * (-X*Y/std::sqrt(rad_*rad_-Y*Y));
+    //return -X*Y/std::sqrt(rad_*rad_-Y*Y);
+    if (targetType_ == 1)
+      return -X*Y/((rad_*rad_-Y*Y)*(rad_*rad_-Y*Y));
+    else if (targetType_ == 2) {
+      Real slope = 0;
+      Real xVal = 0;
+      for (int i=0; i<5; ++i) {
+        if ((Y >= zvec_[i]) && (Y < zvec_[i+1])) {
+          slope = (rvec_[i+1] - rvec_[i]) / (zvec_[i+1] - zvec_[i]);
+          xVal  = X*slope/std::pow(rvec_[i] + slope*(Y-zvec_[i]), 3);
+        }
+      }
+      return xVal;
+    }
+    else
+      throw Exception::NotImplemented(">>> Desired target type (not 1 or 2) not implemented.");
+  }
+
+  Real yTarget(const std::vector<Real> &x) const {
+    const Real one(1), Y = x[1];
+    //return yWeight(x) ? one : zero;
+    //return yWeight(x) ? std::sqrt(rad_*rad_-Y*Y) : zero;
+    //return polyWeight(x) * std::sqrt(rad_*rad_-Y*Y);
+    //return std::sqrt(rad_*rad_-Y*Y);
+    if (targetType_ == 1)
+      return one/(rad_*rad_-Y*Y);
+    else if (targetType_ == 2) {
+      Real slope = 0;
+      Real yVal = 0;
+      for (int i=0; i<5; ++i) {
+        if ((Y >= zvec_[i]) && (Y < zvec_[i+1])) {
+          slope = (rvec_[i+1] - rvec_[i]) / (zvec_[i+1] - zvec_[i]);
+          yVal  = 1.0/std::pow(rvec_[i] + slope*(Y-zvec_[i]), 2);
+        }
+      }
+      return yVal;
+    }
+    else
+      throw Exception::NotImplemented(">>> Desired target type (not 1 or 2) not implemented.");
+  }
+
+  Real xWeight(const std::vector<Real> &x) const {
+    return yWeight(x);
+  }
+
+  Real yWeight(const std::vector<Real> &x) const {
+    if (optimizeFrac_) {
+      const Real zero(0), one(1), Y = x[1];
+      if (invertFrac_)
+        return (std::abs(Y)  > frac_*top_ ? one : zero);
+      else
+        return (std::abs(Y) <= frac_*top_ ? one : zero);
+    }
+    else if (polyWeight_)
+      return polyWeight(x);
+    else if (veloWeight_)
+      return 1.0/(std::pow(xTarget(x), 2) + std::pow(yTarget(x), 2));
+    else
+      return 1.0;
+  }
+
+  Real polyWeight(const std::vector<Real> &x) const {
+    const Real zero(0), one(1), Y = x[1], p = twpow_;
+    Real yTOP(0), yBOT(0);
+    yTOP = top_;
+    yBOT = -yTOP;
+    Real val = 0, at = 0, bt = 0;
+    at = one / std::pow(-yTOP,p);
+    bt = one / std::pow(-yBOT,p);
+    if (Y > zero) {
+      val = at*std::pow(Y-yTOP,p);
+    } else {
+      val = bt*std::pow(Y-yBOT,p);
+    }
+    //std::cout << Y << "  " << val << std::endl;
+    return val;
+  }
+
+public:
+  QoI_VelocityTracking_Darcy(ROL::ParameterList &list,
+                             const ROL::Ptr<fe_type> &fePrs,
+                             const ROL::Ptr<fe_type> &feCtrl,
+                             const ROL::Ptr<Permeability<Real,DeviceType>> &perm)
+    : fePrs_(fePrs), feCtrl_(feCtrl), perm_(perm) {
+
+    rvec_.resize(6);
+    zvec_.resize(6);
+
+    rad_          = list.sublist("Problem").get("Diffuser Radius",10.0);
+    top_          = list.sublist("Problem").get("Diffuser Top",9.9763392);
+    rvec_[0]      = list.sublist("Problem").get("r0",0.6875);
+    zvec_[0]      = list.sublist("Problem").get("z0",-14.001);
+    rvec_[1]      = list.sublist("Problem").get("r1",3.0);
+    zvec_[1]      = list.sublist("Problem").get("z1",-9.0);
+    rvec_[2]      = list.sublist("Problem").get("r2",10.0);
+    zvec_[2]      = list.sublist("Problem").get("z2",-3.0);
+    rvec_[3]      = list.sublist("Problem").get("r3",10.0);
+    zvec_[3]      = list.sublist("Problem").get("z3",3.0);
+    rvec_[4]      = list.sublist("Problem").get("r4",3.0);
+    zvec_[4]      = list.sublist("Problem").get("z4",9.0);
+    rvec_[5]      = list.sublist("Problem").get("r5",0.6875);
+    zvec_[5]      = list.sublist("Problem").get("z5",14.001);
+    yvel_         = list.sublist("Problem").get("Target Axial Velocity",15.0);
+    optimizeFrac_ = list.sublist("Problem").get("Optimize Domain Fraction",false);
+    invertFrac_   = list.sublist("Problem").get("Invert Domain Fraction",false);
+    frac_         = list.sublist("Problem").get("Integration Domain Fraction",1.00);
+    polyWeight_   = list.sublist("Problem").get("Use Polynomial Weight",false);
+    veloWeight_   = list.sublist("Problem").get("Use Target Velocity Weight",false);
+    onlyAxial_    = list.sublist("Problem").get("Only Use Axial Velocity",false);
+    targetType_   = list.sublist("Problem").get("Target Type", 1);
+    twpow_        = list.sublist("Problem").get("Target Weighting Power",0.0);
+    Real xWScal   = list.sublist("Problem").get("Radial Tracking Scale",1.0);
+    Real yWScal   = list.sublist("Problem").get("Axial Tracking Scale",1.0);
+    bool useNorm  = list.sublist("Problem").get("Use Normalized Misfit",false);
+    useNorm = onlyAxial_ ? false : useNorm;
+    xWScal  = onlyAxial_ ? static_cast<Real>(0) : xWScal;
+    const int c = fePrs_->gradN().extent_int(0);
+    const int p = fePrs_->gradN().extent_int(2);
+    target_ = scalar_view("target",c,p,2);
+    weight_ = scalar_view("weight",c,p,2);
+    std::vector<Real> x(2);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        x[0] = (fePrs_->cubPts())(i,j,0);
+        x[1] = (fePrs_->cubPts())(i,j,1);
+        target_(i,j,0) = xTarget(x);
+        target_(i,j,1) = yTarget(x);
+        if (useNorm && yWeight(x)) {
+          xWScal = static_cast<Real>(1)
+                  /(std::pow(target_(i,j,0),2) + std::pow(target_(i,j,1),2));
+          yWScal = xWScal;
+        }
+        weight_(i,j,0) = x[0] * xWScal * xWeight(x);
+        weight_(i,j,1) = x[0] * yWScal * yWeight(x);
+      }
+    }
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = ROL::nullPtr,
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c = fePrs_->gradN().extent_int(0);
+    const int p = fePrs_->gradN().extent_int(2);
+    const int d = fePrs_->gradN().extent_int(3);
+    Real yvel = (z_param == ROL::nullPtr) ? yvel_ : (*z_param)[0];
+    // Initialize output val
+    val = scalar_view("val",c);
+    // Compute cost integral
+    scalar_view gradU("gradU",c,p,d);
+    scalar_view vel("vel",c,p,d);
+    scalar_view wvel("wvel",c,p,d);
+    scalar_view valZ("valZ",c,p);
+    scalar_view alpha("alpha",c,p);
+    fePrs_->evaluateGradient(gradU, u_coeff);
+    feCtrl_->evaluateValue(valZ, z_coeff);
+    perm_->compute(alpha, valZ, fePrs_->cubPts(), 0);
+
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k) {
+          vel(i,j,k)  = alpha(i,j)*gradU(i,j,k)+yvel*target_(i,j,k);
+          wvel(i,j,k) = weight_(i,j,k)*vel(i,j,k);
+        }
+      }
+    }
+
+    fePrs_->computeIntegral(val,vel,wvel);
+    // Scale by one half
+    rst::scale(val,static_cast<Real>(0.5));
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = ROL::nullPtr,
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c = fePrs_->gradN().extent_int(0);
+    const int f = fePrs_->gradN().extent_int(1);
+    const int p = fePrs_->gradN().extent_int(2);
+    const int d = fePrs_->gradN().extent_int(3);
+    Real yvel = (z_param == ROL::nullPtr) ? yvel_ : (*z_param)[0];
+    // Initialize output grad
+    grad = scalar_view("grad", c, f);
+    // Compute cost integral
+    scalar_view gradU("gradU",c,p,d);
+    scalar_view awvel("awvel",c,p,d);
+    scalar_view valZ("valZ",c,p);
+    scalar_view alpha("alpha",c,p);
+    fePrs_->evaluateGradient(gradU, u_coeff);
+    feCtrl_->evaluateValue(valZ, z_coeff);
+    perm_->compute(alpha, valZ, fePrs_->cubPts(), 0);
+
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k) {
+          awvel(i,j,k) = weight_(i,j,k)*alpha(i,j)*(alpha(i,j)*gradU(i,j,k)+yvel*target_(i,j,k));
+        }
+      }
+    }
+
+    fst::integrate(grad,awvel,fePrs_->gradNdetJ(),false);
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = ROL::nullPtr,
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c  = fePrs_->gradN().extent_int(0);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int p  = fePrs_->gradN().extent_int(2);
+    const int d  = fePrs_->gradN().extent_int(3);
+    Real yvel = (z_param == ROL::nullPtr) ? yvel_ : (*z_param)[0];
+    // Initialize output grad
+    grad = scalar_view("grad", c, fc);
+    // Compute cost integral
+    scalar_view gradU("gradU",c,p,d);
+    scalar_view valZ("valZ",c,p);
+    scalar_view alpha("alpha",c,p);
+    scalar_view dalpha("dalpha",c,p);
+    scalar_view deriv("deriv",c,p);
+    fePrs_->evaluateGradient(gradU, u_coeff);
+    feCtrl_->evaluateValue(valZ, z_coeff);
+    perm_->compute(alpha, valZ, fePrs_->cubPts(), 0);
+    perm_->compute(dalpha, valZ, fePrs_->cubPts(), 1);
+
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k) {
+          deriv(i,j) += weight_(i,j,k)*(alpha(i,j)*gradU(i,j,k)+yvel*target_(i,j,k))
+                        *dalpha(i,j)*gradU(i,j,k);
+        }
+      }
+    }
+
+    fst::integrate(grad,deriv,feCtrl_->NdetJ(),false);
+  }
+
+  std::vector<Real> gradient_3(std::vector<scalar_view> & grad,
+                               const scalar_view u_coeff,
+                               const scalar_view z_coeff = ROL::nullPtr,
+                               const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    if (z_param != ROL::nullPtr) {
+      // Get relevant dimensions
+      const int c = fePrs_->gradN().extent_int(0);
+      const int p = fePrs_->gradN().extent_int(2);
+      const int d = fePrs_->gradN().extent_int(3);
+      Real yvel = (z_param == ROL::nullPtr) ? yvel_ : (*z_param)[0];
+      // Initialize output val
+      std::vector<Real> g_param(d,static_cast<Real>(0));
+      grad.clear(); grad.resize(d);
+      for (int i = 0; i < d; ++i)
+        grad[i] = scalar_view("grad", c);
+      // Compute cost integral
+      scalar_view gradU("gradU",c,p,d);
+      scalar_view wvel("wvel",c,p,d);
+      scalar_view valZ("valZ",c,p);
+      scalar_view alpha("alpha",c,p);
+      fePrs_->evaluateGradient(gradU, u_coeff);
+      feCtrl_->evaluateValue(valZ, z_coeff);
+      perm_->compute(alpha, valZ, fePrs_->cubPts(), 0);
+
+      for (int i = 0; i < c; ++i) {
+        for (int j = 0; j < p; ++j) {
+          for (int k = 0; k < d; ++k) {
+            wvel(i,j,k) = weight_(i,j,k)*(alpha(i,j)*gradU(i,j,k)+yvel*target_(i,j,k));
+          }
+        }
+      }
+
+      fePrs_->computeIntegral(grad[0],wvel,target_);
+      
+      return g_param;
+    }
+    else {
+      throw Exception::Zero(">>> QoI_Velocity_Darcy::gradient_3 is zero.");
+    }
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = ROL::nullPtr,
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c = fePrs_->gradN().extent_int(0);
+    const int f = fePrs_->gradN().extent_int(1);
+    const int p = fePrs_->gradN().extent_int(2);
+    const int d = fePrs_->gradN().extent_int(3);
+    // Initialize output hess
+    hess = scalar_view("hess", c, f);
+    // Compute cost integral
+    scalar_view gradV("gradV",c,p,d);
+    scalar_view awvel("awvel",c,p,d);
+    scalar_view valZ("valZ",c,p);
+    scalar_view alpha("alpha",c,p);
+    fePrs_->evaluateGradient(gradV, v_coeff);
+    feCtrl_->evaluateValue(valZ, z_coeff);
+    perm_->compute(alpha, valZ, fePrs_->cubPts(), 0);
+
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k) {
+          awvel(i,j,k) = alpha(i,j)*alpha(i,j)*weight_(i,j,k)*gradV(i,j,k);
+        }
+      }
+    }
+
+    fst::integrate(hess,awvel,fePrs_->gradNdetJ(),false);
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = ROL::nullPtr,
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c = fePrs_->gradN().extent_int(0);
+    const int f = fePrs_->gradN().extent_int(1);
+    const int p = fePrs_->gradN().extent_int(2);
+    const int d = fePrs_->gradN().extent_int(3);
+    Real yvel = (z_param == ROL::nullPtr) ? yvel_ : (*z_param)[0];
+    // Initialize output grad
+    hess = scalar_view("hess", c, f);
+    // Compute cost integral
+    scalar_view gradU("gradU",c,p,d);
+    scalar_view awvel("awvel",c,p,d);
+    scalar_view valZ("valZ",c,p);
+    scalar_view valV("valV",c,p);
+    scalar_view alpha("alpha",c,p);
+    scalar_view dalpha("dalpha",c,p);
+    fePrs_->evaluateGradient(gradU, u_coeff);
+    feCtrl_->evaluateValue(valZ, z_coeff);
+    feCtrl_->evaluateValue(valV, v_coeff);
+    perm_->compute( alpha, valZ, fePrs_->cubPts(), 0);
+    perm_->compute(dalpha, valZ, fePrs_->cubPts(), 1);
+
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k) {
+          awvel(i,j,k)  = static_cast<Real>(2)*alpha(i,j)*gradU(i,j,k)+yvel*target_(i,j,k);
+          awvel(i,j,k) *= weight_(i,j,k)*dalpha(i,j)*valV(i,j);
+        }
+      }
+    }
+
+    fst::integrate(hess,awvel,fePrs_->gradNdetJ(),false);
+  }
+
+  void HessVec_13(scalar_view & hess,
+                  const ROL::Ptr<const std::vector<Real>> & v_param,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = ROL::nullPtr,
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    if (z_param != ROL::nullPtr) {
+      // Get relevant dimensions
+      const int c = fePrs_->gradN().extent_int(0);
+      const int f = fePrs_->gradN().extent_int(1);
+      const int p = fePrs_->gradN().extent_int(2);
+      const int d = fePrs_->gradN().extent_int(3);
+      // Initialize output val
+      hess = scalar_view("hess", c, f);
+      // Compute cost integral
+      scalar_view wvel("wvel",c,p,d);
+      scalar_view valZ("valZ",c,p);
+      scalar_view alpha("alpha",c,p);
+      feCtrl_->evaluateValue(valZ, z_coeff);
+      perm_->compute(alpha, valZ, fePrs_->cubPts(), 0);
+
+      for (int i = 0; i < c; ++i) {
+        for (int j = 0; j < p; ++j) {
+          for (int k = 0; k < d; ++k) {
+            wvel(i,j,k) = weight_(i,j,k)*alpha(i,j)*target_(i,j,k)*(*v_param)[0];
+          }
+        }
+      }
+
+      fst::integrate(hess,wvel,fePrs_->gradNdetJ(),false);
+    }
+    else {
+      throw Exception::NotImplemented(">>> HessVec_13 not implemented.");
+    }
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = ROL::nullPtr,
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c  = fePrs_->gradN().extent_int(0);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int p  = fePrs_->gradN().extent_int(2);
+    const int d  = fePrs_->gradN().extent_int(3);
+    Real yvel = (z_param == ROL::nullPtr) ? yvel_ : (*z_param)[0];
+    // Initialize output grad
+    hess = scalar_view("hess", c, fc);
+    // Compute cost integral
+    scalar_view gradU("gradU",c,p,d);
+    scalar_view gradV("gradV",c,p,d);
+    scalar_view valZ("valZ",c,p);
+    scalar_view alpha("alpha",c,p);
+    scalar_view dalpha("dalpha",c,p);
+    scalar_view deriv("deriv",c,p);
+    fePrs_->evaluateGradient(gradU, u_coeff);
+    fePrs_->evaluateGradient(gradV, v_coeff);
+    feCtrl_->evaluateValue(valZ, z_coeff);
+    perm_->compute(alpha, valZ, fePrs_->cubPts(), 0);
+    perm_->compute(dalpha, valZ, fePrs_->cubPts(), 1);
+
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k) {
+          deriv(i,j) += weight_(i,j,k)*(alpha(i,j)*gradU(i,j,k)+yvel*target_(i,j,k))
+                        *dalpha(i,j)*gradV(i,j,k);
+          deriv(i,j) += weight_(i,j,k)*dalpha(i,j)*gradU(i,j,k)*alpha(i,j)*gradV(i,j,k);
+        }
+      }
+    }
+
+    fst::integrate(hess,deriv,feCtrl_->NdetJ(),false);
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = ROL::nullPtr,
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c  = fePrs_->gradN().extent_int(0);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int p  = fePrs_->gradN().extent_int(2);
+    const int d  = fePrs_->gradN().extent_int(3);
+    Real yvel = (z_param == ROL::nullPtr) ? yvel_ : (*z_param)[0];
+    // Initialize output hess
+    hess = scalar_view("hess", c, fc);
+    // Compute cost integral
+    scalar_view gradU("gradU",c,p,d);
+    scalar_view valZ("valZ",c,p);
+    scalar_view valV("valV",c,p);
+    scalar_view alpha("alpha",c,p);
+    scalar_view dalpha("dalpha",c,p);
+    scalar_view ddalpha("ddalpha",c,p);
+    scalar_view deriv("deriv",c,p);
+    fePrs_->evaluateGradient(gradU, u_coeff);
+    feCtrl_->evaluateValue(valZ, z_coeff);
+    feCtrl_->evaluateValue(valV, v_coeff);
+    perm_->compute(  alpha, valZ, fePrs_->cubPts(), 0);
+    perm_->compute( dalpha, valZ, fePrs_->cubPts(), 1);
+    perm_->compute(ddalpha, valZ, fePrs_->cubPts(), 2);
+
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k) {
+          deriv(i,j) += weight_(i,j,k)*(alpha(i,j)*gradU(i,j,k)+yvel*target_(i,j,k))
+                        *ddalpha(i,j)*valV(i,j)*gradU(i,j,k);
+          deriv(i,j) += weight_(i,j,k)*dalpha(i,j)*gradU(i,j,k)
+                        *dalpha(i,j)*valV(i,j)*gradU(i,j,k);
+        }
+      }
+    }
+
+    fst::integrate(hess,deriv,feCtrl_->NdetJ(),false);
+  }
+
+  void HessVec_23(scalar_view & hess,
+                  const ROL::Ptr<const std::vector<Real>> & v_param,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = ROL::nullPtr,
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    if (z_param != ROL::nullPtr) {
+      // Get relevant dimensions
+      const int c  = fePrs_->gradN().extent_int(0);
+      const int fc = feCtrl_->N().extent_int(1);
+      const int p  = fePrs_->gradN().extent_int(2);
+      const int d  = fePrs_->gradN().extent_int(3);
+      // Initialize output val
+      hess = scalar_view("hess", c, fc);
+      // Compute cost integral
+      scalar_view gradU("gradU",c,p,d);
+      scalar_view wvel("wvel",c,p);
+      scalar_view valZ("valZ",c,p);
+      scalar_view dalpha("dalpha",c,p);
+      fePrs_->evaluateGradient(gradU, u_coeff);
+      feCtrl_->evaluateValue(valZ, z_coeff);
+      perm_->compute(dalpha, valZ, fePrs_->cubPts(), 1);
+
+      for (int i = 0; i < c; ++i) {
+        for (int j = 0; j < p; ++j) {
+          for (int k = 0; k < d; ++k) {
+            wvel(i,j) += weight_(i,j,k)*dalpha(i,j)*gradU(i,j,k)*target_(i,j,k)*(*v_param)[0];
+          }
+        }
+      }
+
+      fst::integrate(hess,wvel,feCtrl_->NdetJ(),false);
+    }
+    else {
+      throw Exception::Zero(">>> QoI_Velocity_Darcy::HessVec_23 is zero.");
+    }
+  }
+
+  std::vector<Real> HessVec_31(std::vector<scalar_view> & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = ROL::nullPtr,
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    if (z_param != ROL::nullPtr) {
+      // Get relevant dimensions
+      const int c = fePrs_->gradN().extent_int(0);
+      const int p = fePrs_->gradN().extent_int(2);
+      const int d = fePrs_->gradN().extent_int(3);
+      // Initialize output val
+      std::vector<Real> h_param(d,static_cast<Real>(0));
+      hess.clear(); hess.resize(d);
+      for (int k = 0; k < d; ++k)
+        hess[k] = scalar_view("hess",c);
+      // Compute cost integral
+      scalar_view gradV("gradV",c,p,d);
+      scalar_view wvel("wvel",c,p,d);
+      scalar_view valZ("valZ",c,p);
+      scalar_view alpha("alpha",c,p);
+      fePrs_->evaluateGradient(gradV, v_coeff);
+      feCtrl_->evaluateValue(valZ, z_coeff);
+      perm_->compute(alpha, valZ, fePrs_->cubPts(), 0);
+
+      for (int i = 0; i < c; ++i) {
+        for (int j = 0; j < p; ++j) {
+          for (int k = 0; k < d; ++k) {
+            wvel(i,j,k) = weight_(i,j,k)*alpha(i,j)*gradV(i,j,k);
+          }
+        }
+      }
+
+      fePrs_->computeIntegral(hess[0],wvel,target_);
+
+      return h_param;
+    }
+    else {
+      throw Exception::Zero(">>> QoI_Velocity_Darcy::HessVec_31 is zero.");
+    }
+  }
+
+  std::vector<Real> HessVec_32(std::vector<scalar_view> & hess,
+                               const scalar_view v_coeff,
+                               const scalar_view u_coeff,
+                               const scalar_view z_coeff = ROL::nullPtr,
+                               const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    if (z_param != ROL::nullPtr) {
+      // Get relevant dimensions
+      const int c = fePrs_->gradN().extent_int(0);
+      const int p = fePrs_->gradN().extent_int(2);
+      const int d = fePrs_->gradN().extent_int(3);
+      // Initialize output val
+      std::vector<Real> h_param(d,static_cast<Real>(0));
+      hess.clear(); hess.resize(d);
+      for (int k = 0; k < d; ++k)
+        hess[k] = scalar_view("hess",c);
+      // Compute cost integral
+      scalar_view gradU("gradU",c,p,d);
+      scalar_view wvel("wvel",c,p,d);
+      scalar_view valZ("valZ",c,p);
+      scalar_view valV("valV",c,p);
+      scalar_view dalpha("alpha",c,p);
+      fePrs_->evaluateGradient(gradU, u_coeff);
+      feCtrl_->evaluateValue(valZ, z_coeff);
+      feCtrl_->evaluateValue(valV, v_coeff);
+      perm_->compute(dalpha, valZ, fePrs_->cubPts(), 1);
+
+      for (int i = 0; i < c; ++i) {
+        for (int j = 0; j < p; ++j) {
+          for (int k = 0; k < d; ++k) {
+            wvel(i,j,k) = weight_(i,j,k)*dalpha(i,j)*gradU(i,j,k)*valV(i,j);
+          }
+        }
+      }
+
+      fePrs_->computeIntegral(hess[0],wvel,target_);
+
+      return h_param;
+    }
+    else {
+      throw Exception::Zero(">>> HessVec_32 is zero.");
+    }
+  }
+
+  std::vector<Real> HessVec_33(std::vector<scalar_view> & hess,
+                               const ROL::Ptr<const std::vector<Real>> & v_param,
+                               const scalar_view u_coeff,
+                               const scalar_view z_coeff = ROL::nullPtr,
+                               const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    if (z_param != ROL::nullPtr) {
+      // Get relevant dimensions
+      const int c = fePrs_->gradN().extent_int(0);
+      const int p = fePrs_->gradN().extent_int(2);
+      const int d = fePrs_->gradN().extent_int(3);
+      // Initialize output val
+      std::vector<Real> h_param(d,static_cast<Real>(0));
+      hess.clear(); hess.resize(d);
+      for (int k = 0; k < d; ++k)
+        hess[k] = scalar_view("hess",c);
+      // Compute cost integral
+      scalar_view wtarget("wtarget",c,p,d);
+      for (int i = 0; i < c; ++i) {
+        for (int j = 0; j < p; ++j) {
+          for (int k = 0; k < d; ++k) {
+            wtarget(i,j,k) = weight_(i,j,k)*target_(i,j,k);
+          }
+        }
+      }
+      fePrs_->computeIntegral(hess[0],wtarget,target_);
+      rst::scale(hess[0],(*v_param)[0]);
+      
+      return h_param;
+    }
+    else {
+      throw Exception::Zero(">>> QoI_Velocity_Darcy::HessVec_33 is zero.");
+    }
+  }
+
+}; // QoI_Velocity_Darcy
+#endif
diff --git a/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/filteredDarcy/pde_darcyK.hpp b/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/filteredDarcy/pde_darcyK.hpp
new file mode 100644
index 000000000000..9057b16067a4
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/filteredDarcy/pde_darcyK.hpp
@@ -0,0 +1,745 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  pde_darcy.hpp
+    \brief Implements the local PDE interface for the Darcy porosity optimization problem.
+*/
+
+#ifndef PDE_DARCYK_HPP
+#define PDE_DARCYK_HPP
+
+#include "../../../../TOOLS/pdeK.hpp"
+#include "../../../../TOOLS/feK.hpp"
+#include "../../../../TOOLS/fieldhelperK.hpp"
+#include "permeabilityK.hpp"
+#include "../../../../TOOLS/Intrepid2_CubatureNodal.hpp"
+
+#include "Intrepid2_HVOL_C0_FEM.hpp"
+#include "Intrepid2_HGRAD_QUAD_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_QUAD_C2_FEM.hpp"
+#include "Intrepid2_HGRAD_TRI_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_TRI_C2_FEM.hpp"
+#include "Intrepid2_DefaultCubatureFactory.hpp"
+#include "Intrepid2_FunctionSpaceTools.hpp"
+#include "Intrepid2_CellTools.hpp"
+
+#include "ROL_Ptr.hpp"
+
+template <class Real, class DeviceType>
+class PDE_Darcy : public PDE<Real, DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  // Finite element basis information
+  basis_ptr basisPtrPrs_, basisPtrCtrl_;
+  std::vector<basis_ptr> basisPtrs_, basisPtrsCtrl_;
+  // Cell cubature information
+  ROL::Ptr<Intrepid2::Cubature<DeviceType,Real,Real>> cellCub_, bdryCub_;
+  // Cell node information
+  scalar_view volCellNodes_;
+  std::vector<std::vector<scalar_view>> bdryCellNodes_;
+  std::vector<std::vector<std::vector<int>>> bdryCellLocIds_;
+  // Finite element definition
+  ROL::Ptr<fe_type> fePrs_, feCtrl_;
+  std::vector<std::vector<ROL::Ptr<fe_type>>> fePrsBdry_, feCtrlBdry_;
+  // Local degrees of freedom on boundary, for each side of the reference cell (first index).
+  std::vector<std::vector<int>> fpidx_;
+  // Field pattern, offsets, etc.
+  std::vector<std::vector<int>> fieldPattern_;  // local Field/DOF pattern; set from DOF manager 
+  int numFields_;                               // number of fields (equations in the PDE)
+  int numDofs_;                                 // total number of degrees of freedom for all (local) fields
+  std::vector<int> offset_;                     // for each field, a counting offset
+  std::vector<int> numFieldDofs_;               // for each field, number of degrees of freedom
+  std::vector<std::vector<int>> fieldPatternCtrl_;  // local Field/DOF pattern; set from DOF manager 
+  int numFieldsCtrl_;                               // number of fields (equations in the PDE)
+  int numDofsCtrl_;                                 // total number of degrees of freedom for all (local) fields
+  std::vector<int> offsetCtrl_;                     // for each field, a counting offset
+  std::vector<int> numFieldDofsCtrl_;               // for each field, number of degrees of freedom
+  
+  // Problem parameters.
+  Real Patm_, inVelocity_, dynvisco_;
+  bool useNoSlip_;
+
+  ROL::Ptr<Permeability<Real,DeviceType>> perm_;
+  ROL::Ptr<FieldUtils::FieldInfo> fieldInfo_, fieldInfoCtrl_;
+
+  void multiplyByRadius(scalar_view &input, const scalar_view cubPts,
+                        bool isField, bool useReciprocal = false) const {
+    int c = cubPts.extent_int(0);
+    int p = cubPts.extent_int(1);
+    scalar_view r("r",c,p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        if (!useReciprocal) r(i,j) = cubPts(i,j,0);
+        else                r(i,j) = static_cast<Real>(1)/cubPts(i,j,0);
+      }
+    }
+    scalar_view in(input);
+    if (!isField)
+      fst::scalarMultiplyDataData(input,r,in);
+    else
+      fst::scalarMultiplyDataField(input,r,in);
+  }
+
+public:
+  PDE_Darcy(ROL::ParameterList &parlist) : Patm_(101.325 /* kg/mm-s^2 */) {
+    // Finite element fields.
+    int cubDegree     = parlist.sublist("Problem").get("Cubature Degree",4);
+    int bdryCubDegree = parlist.sublist("Problem").get("Boundary Cubature Degree",4);
+    int basisDegPres  = parlist.sublist("Problem").get("Pressure Basis Degree",1);
+    int basisDegCtrl  = parlist.sublist("Problem").get("Filter Basis Degree",1);
+    std::string elemType = parlist.sublist("Problem").get("Element Type","QUAD");
+    if (elemType == "TRI") {
+      if (basisDegPres == 2)
+        basisPtrPrs_ = ROL::makePtr<Intrepid2::Basis_HGRAD_TRI_C2_FEM<DeviceType,Real,Real>>();
+      else 
+        basisPtrPrs_ = ROL::makePtr<Intrepid2::Basis_HGRAD_TRI_C1_FEM<DeviceType,Real,Real>>();
+    }
+    else {
+      if (basisDegPres == 2)
+        basisPtrPrs_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C2_FEM<DeviceType,Real,Real>>();
+      else 
+        basisPtrPrs_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C1_FEM<DeviceType,Real,Real>>();
+    }
+    basisPtrs_.push_back(basisPtrPrs_);       // Pressure component
+    shards::CellTopology cellType = basisPtrs_[0]->getBaseCellTopology();        // get the cell type from any basis
+    if (basisDegCtrl == 1) {
+      if (elemType == "TRI")
+        basisPtrCtrl_ = ROL::makePtr<Intrepid2::Basis_HGRAD_TRI_C1_FEM<DeviceType,Real,Real>>();
+      else
+        basisPtrCtrl_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C1_FEM<DeviceType,Real,Real>>();
+    }
+    else {
+      basisPtrCtrl_ = ROL::makePtr<Intrepid2::Basis_HVOL_C0_FEM<DeviceType,Real,Real>>(cellType);
+    }
+    basisPtrsCtrl_.push_back(basisPtrCtrl_);  // Control component
+    // Quadrature rules.
+    Intrepid2::DefaultCubatureFactory cubFactory;                            // create cubature factory
+    if (cubDegree == -1) {  // nodal cubature
+      cellCub_ = ROL::makePtr<Intrepid2::CubatureNodal<DeviceType,Real,Real>>(cellType);
+    }
+    else {                  // default cubature
+      cellCub_ = cubFactory.create<DeviceType,Real,Real>(cellType, cubDegree); // create default cubature
+    }
+    shards::CellTopology bdryCellType = cellType.getCellTopologyData(1, 0);
+    if (cubDegree == -1) {  // nodal cubature
+      bdryCub_ = ROL::makePtr<Intrepid2::CubatureNodal<DeviceType,Real,Real>>(bdryCellType);
+    }
+    else {                  // default cubature
+      bdryCub_ = cubFactory.create<DeviceType,Real,Real>(bdryCellType, bdryCubDegree); // create default cubature
+    }
+
+    // Other problem parameters.
+    dynvisco_     = parlist.sublist("Problem").get("Dynamic Viscosity", 0.84e-8); // kg/mm-s
+    Real density  = parlist.sublist("Problem").get("Fluid Density", 8.988e-11);   // kg/mm^3
+    Real inRadius = parlist.sublist("Problem").get("Inlet Radius", 0.6875);       // mm
+    Real Reynolds = parlist.sublist("Problem").get("Reynolds Number", 50.0);      // dimensionless
+    inVelocity_ = Reynolds * dynvisco_ / (density*static_cast<Real>(2)*inRadius); // mm/s
+    perm_       = ROL::makePtr<Permeability<Real,DeviceType>>(parlist);
+    useNoSlip_  = parlist.sublist("Problem").get("Use No Slip", false);
+
+    numDofs_ = 0;
+    numFields_ = basisPtrs_.size();
+    offset_.resize(numFields_);
+    numFieldDofs_.resize(numFields_);
+    for (int i=0; i<numFields_; ++i) {
+      offset_[i] = i==0 ? 0 : offset_[i-1] + basisPtrs_[i-1]->getCardinality();
+      numFieldDofs_[i] = basisPtrs_[i]->getCardinality();
+      numDofs_ += numFieldDofs_[i];
+    }
+    numDofsCtrl_ = 0;
+    numFieldsCtrl_ = basisPtrsCtrl_.size();
+    offsetCtrl_.resize(numFieldsCtrl_);
+    numFieldDofsCtrl_.resize(numFieldsCtrl_);
+    for (int i=0; i<numFieldsCtrl_; ++i) {
+      offsetCtrl_[i] = i==0 ? 0 : offsetCtrl_[i-1] + basisPtrsCtrl_[i-1]->getCardinality();
+      numFieldDofsCtrl_[i] = basisPtrsCtrl_[i]->getCardinality();
+      numDofsCtrl_ += numFieldDofsCtrl_[i];
+    }
+  }
+
+  void residual(scalar_view & res,
+                const scalar_view u_coeff,
+                const scalar_view z_coeff = scalar_view(),
+                const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c = fePrs_->gradN().extent_int(0);
+    const int f = fePrs_->gradN().extent_int(1);
+    const int p = fePrs_->gradN().extent_int(2);
+    const int d = fePrs_->gradN().extent_int(3);
+    // Initialize residuals.
+    res = scalar_view("res", c, f);
+    // Evaluate/interpolate finite element fields on cells.
+    scalar_view gradP("gradP", c, p, d);
+    scalar_view AgradP("AgradP", c, p, d);
+    scalar_view valCtrl("valCtrl", c, p);
+    scalar_view alpha("alpha", c, p);
+    fePrs_->evaluateGradient(gradP, u_coeff);
+    feCtrl_->evaluateValue(valCtrl, z_coeff);
+    perm_->compute(alpha, valCtrl, fePrs_->cubPts(), 0);
+    fst::scalarMultiplyDataData(AgradP,alpha,gradP);
+    /*** Evaluate weak form of the residual. ***/
+    multiplyByRadius(AgradP,fePrs_->cubPts(),false);
+    fst::integrate(res,AgradP,fePrs_->gradNdetJ(),false);
+    // Boundary conditions.
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      // APPLY DIRICHLET CONDITIONS
+      for (int i = 0; i < numSideSets; ++i) {
+        if (i==0) /* in flow */ {
+          const int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            const int numCellsSide = bdryCellLocIds_[i][j].size();
+            if (numCellsSide > 0) {
+              const int numCubPerSide = fePrsBdry_[i][j]->cubPts().extent_int(1);
+              scalar_view nRes("nRes", numCellsSide, f);
+              scalar_view nVal("nVal", numCellsSide, numCubPerSide);
+	      Kokkos::deep_copy(nVal, -inVelocity_);
+              multiplyByRadius(nVal,fePrsBdry_[i][j]->cubPts(),false);
+              fst::integrate(nRes,nVal,fePrsBdry_[i][j]->NdetJ(),false);
+              for (int k = 0; k < numCellsSide; ++k) {
+                const int cidx = bdryCellLocIds_[i][j][k];
+                for (int l = 0; l < f; ++l) {
+                  res(cidx,l) += nRes(k,l);
+                }
+              }
+            }
+          }
+        }
+        else if (useNoSlip_ && i==1) /* no slip */ {}
+        else if (i==2) /* out flow */ {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numBdryDofs = fpidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                res(cidx,fpidx_[j][l]) = u_coeff(cidx,fpidx_[j][l]) - Patm_;
+              }
+            }
+          }
+        }
+        else if (i==3) /* no normal */ {} 
+      }
+    }
+  }
+
+  void Jacobian_1(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c = fePrs_->gradN().extent_int(0);
+    const int f = fePrs_->gradN().extent_int(1);
+    const int p = fePrs_->gradN().extent_int(2);
+    const int d = fePrs_->gradN().extent_int(3);
+    // Initialize Jacobians.
+    jac = scalar_view("jac", c, f, f);
+    // Evaluate/interpolate finite element fields on cells.
+    scalar_view valCtrl("valCtrl", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view alphaPhi("alphaPhi", c, f, p, d);
+    feCtrl_->evaluateValue(valCtrl, z_coeff);
+    perm_->compute(alpha, valCtrl, fePrs_->cubPts(), 0);
+    // Multiply velocity with alpha
+    fst::scalarMultiplyDataField(alphaPhi, alpha, fePrs_->gradN());
+    /*** Evaluate weak form of the Jacobian. ***/
+    multiplyByRadius(alphaPhi,fePrs_->cubPts(),true);
+    fst::integrate(jac,alphaPhi,fePrs_->gradNdetJ(),false);
+    // APPLY DIRICHLET CONDITIONS
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        if (i==0) /* in flow */ {}
+        else if (useNoSlip_ && i==1) /* no slip */ {}
+        else if (i==2) /* out flow */ {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numBdryDofs = fpidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                for (int m = 0; m < f; ++m) {
+                  jac(cidx,fpidx_[j][l],m) = static_cast<Real>(0);
+                }
+                jac(cidx,fpidx_[j][l],fpidx_[j][l]) = static_cast<Real>(1);
+              }
+            }
+          }
+        }
+        else if (i==3) /* no normal */ {}
+      }
+    }
+  }
+
+  void Jacobian_2(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c  = fePrs_->gradN().extent_int(0);
+    const int f  = fePrs_->gradN().extent_int(1);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int p  = fePrs_->gradN().extent_int(2);
+    const int d  = fePrs_->gradN().extent_int(3);
+    // Initialize Jacobians.
+    jac = scalar_view("jac", c, f, fc);
+    // Evaluate on FE basis.
+    scalar_view valZ("valZ", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view alphaN("alphaN", c, fc, p);
+    scalar_view gradP("gradP", c, p, d);
+    scalar_view gradPgradN("gradPgradN", c, f, p);
+    fePrs_->evaluateGradient(gradP, u_coeff);
+    feCtrl_->evaluateValue(valZ, z_coeff);
+    perm_->compute(alpha, valZ, fePrs_->cubPts(), 1);
+    fst::scalarMultiplyDataField(alphaN,alpha,feCtrl_->N());
+    fst::dotMultiplyDataField(gradPgradN,gradP,fePrs_->gradNdetJ());
+    /*** Evaluate weak form of the residual. ***/
+    multiplyByRadius(gradPgradN,fePrs_->cubPts(),true);
+    fst::integrate(jac,gradPgradN,alphaN,false);
+    // APPLY DIRICHLET CONDITIONS
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        if (i==0) /* in flow */ {}
+        else if (useNoSlip_ && i==1) /* no slip */ {}
+        else if (i==2) /* out flow */ {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numBdryDofs = fpidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                for (int m=0; m < fc; ++m) {
+                  jac(cidx,fpidx_[j][l],m) = static_cast<Real>(0);
+                }
+              }
+            }
+          }
+        }
+        else if (i==3) /* no normal */ {}
+      }
+    }
+  }
+
+  void Jacobian_3(std::vector<scalar_view> & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> Jacobian_3 is zero!");
+  }
+
+  void Hessian_11(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> Hessian_11 is zero!");
+  }
+
+  void Hessian_12(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c  = fePrs_->gradN().extent_int(0);
+    const int f  = fePrs_->gradN().extent_int(1);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int p  = fePrs_->gradN().extent_int(2);
+    const int d  = fePrs_->gradN().extent_int(3);
+    // Initialize hessians.
+    hess = scalar_view("hess", c, fc, f);
+    // Apply Dirichlet conditions
+    scalar_view l0_coeff("l0_coeff", c, f);
+    Kokkos::deep_copy(l0_coeff, l_coeff);
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        if (i==0) /* in flow */ {}
+        else if (useNoSlip_ && i==1) /* no slip */ {}
+        else if (i==2) /* out flow */ {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numBdryDofs = fpidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                l0_coeff(cidx,fpidx_[j][l]) = static_cast<Real>(0);
+              }
+            }
+          }
+        }
+        else if (i==3) /* no normal */ {}
+      }
+    }
+    // Evaluate/interpolate finite element fields on cells.
+    scalar_view valZ("valZ", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view gradL("gradL", c, p, d);
+    scalar_view alphaL("alphaL", c, fc, p, d);
+    fePrs_->evaluateGradient(gradL, l0_coeff);
+    feCtrl_->evaluateValue(valZ, z_coeff);
+    perm_->compute(alpha, valZ, fePrs_->cubPts(), 1);
+    /*** Evaluate weak form of the Hessian. ***/
+    // Multiply velocity with alpha
+    for (int j = 0; j < c; ++j) {
+      for (int k = 0; k < fc; ++k) {
+        for (int l = 0; l < p; ++l) {
+          for (int m = 0; m < d; ++m) {
+            alphaL(j,k,l,m) = alpha(j,l) * (feCtrl_->N())(j,k,l) * gradL(j,l,m);
+          }
+        }
+      }
+    }
+    multiplyByRadius(alphaL,fePrs_->cubPts(),true);
+    fst::integrate(hess,alphaL,fePrs_->gradNdetJ(),false);
+  }
+
+  void Hessian_13(std::vector<scalar_view> & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> Hessian_13 is zero!");
+  }
+
+  void Hessian_21(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c  = fePrs_->gradN().extent_int(0);
+    const int f  = fePrs_->gradN().extent_int(1);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int p  = fePrs_->gradN().extent_int(2);
+    const int d  = fePrs_->gradN().extent_int(3);
+    // Initialize hessians.
+    hess = scalar_view("hess", c, f, fc);
+    // Apply Dirichlet conditions
+    scalar_view l0_coeff("l0_coeff", c, f);
+    Kokkos::deep_copy(l0_coeff, l_coeff);
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        if (i==0) /* in flow */ {}
+        else if (useNoSlip_ && i==1) /* no slip */ {}
+        else if (i==2) /* out flow */ {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numBdryDofs = fpidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                l0_coeff(cidx,fpidx_[j][l]) = static_cast<Real>(0);
+              }
+            }
+          }
+        }
+        else if (i==3) /* no normal */ {}
+      }
+    }
+    // Evaluate/interpolate finite element fields on cells.
+    scalar_view valZ("valZ", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view gradL("gradL", c, p, d);
+    scalar_view alphaL("alphaL", c, fc, p, d);
+    fePrs_->evaluateGradient(gradL, l0_coeff);
+    feCtrl_->evaluateValue(valZ, z_coeff);
+    perm_->compute(alpha, valZ, fePrs_->cubPts(), 1);
+    /*** Evaluate weak form of the Hessian. ***/
+    for (int j = 0; j < c; ++j) {
+      for (int k = 0; k < fc; ++k) {
+        for (int l = 0; l < p; ++l) {
+          for (int m = 0; m < d; ++m) {
+            alphaL(j,k,l,m) = alpha(j,l) * (feCtrl_->N())(j,k,l) * gradL(j,l,m);
+          }
+        }
+      }
+    }
+    multiplyByRadius(alphaL,fePrs_->cubPts(),true);
+    fst::integrate(hess,fePrs_->gradNdetJ(),alphaL,false);
+  }
+
+  void Hessian_22(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c  = fePrs_->gradN().extent_int(0);
+    const int f  = fePrs_->gradN().extent_int(1);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int p  = fePrs_->gradN().extent_int(2);
+    const int d  = fePrs_->gradN().extent_int(3);
+    // Initialize hessians.
+    hess = scalar_view("hess", c, fc, fc);
+    // Apply Dirichlet conditions
+    scalar_view l0_coeff("l0_coeff", c, f);
+    Kokkos::deep_copy(l0_coeff, l_coeff);
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        if (i==0) /* in flow */ {}
+        else if (useNoSlip_ && i==1) /* no slip */ {}
+        else if (i==2) /* out flow */ {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numBdryDofs = fpidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                l0_coeff(cidx,fpidx_[j][l]) = static_cast<Real>(0);
+              }
+            }
+          }
+        }
+        else if (i==3) /* no normal */ {}
+      }
+    }
+    // Evaluate/interpolate finite element fields on cells.
+    scalar_view valZ("valZ", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view alphaUL("alphaUL", c, fc, p);
+    scalar_view gradL("gradL", c, p, d);
+    scalar_view gradU("gradU", c, p, d);
+    fePrs_->evaluateGradient(gradL, l0_coeff);
+    fePrs_->evaluateGradient(gradU, u_coeff);
+    feCtrl_->evaluateValue(valZ, z_coeff);
+    perm_->compute(alpha, valZ, fePrs_->cubPts(), 2);
+    // Multiply velocity with alpha
+    Real dot(0);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < fc; ++j) {
+        for (int k = 0; k < p; ++k) {
+          dot = static_cast<Real>(0);
+          for (int l = 0; l < d; ++l) {
+            dot += gradU(i,k,l) * gradL(i,k,l);
+          }
+          alphaUL(i,j,k) = alpha(i,k) * (feCtrl_->N())(i,j,k) * dot;
+        }
+      }
+    }
+    /*** Evaluate weak form of the Hessian. ***/
+    // Velocity equation.
+    multiplyByRadius(alphaUL,fePrs_->cubPts(),true);
+    fst::integrate(hess,alphaUL,feCtrl_->NdetJ(),false);
+  }
+
+  void Hessian_23(std::vector<scalar_view> & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> Hessian_23 is zero!");
+  }
+
+  void Hessian_31(std::vector<scalar_view> & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> Hessian_31 is zero!");
+  }
+
+  void Hessian_32(std::vector<scalar_view> & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> Hessian_32 is zero!");
+  }
+
+  void Hessian_33(std::vector<std::vector<scalar_view>> & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> Hessian_33 is zero!");
+  }
+
+  void RieszMap_1(scalar_view & riesz) override {
+    // Retrieve dimensions.
+    const int c = fePrs_->gradN().extent_int(0);
+    const int f = fePrs_->gradN().extent_int(1);
+    const int p = fePrs_->gradN().extent_int(2);
+    const int d = fePrs_->gradN().extent_int(3);
+    // Initialize Jacobians.
+    riesz = scalar_view("riesz1", c, f, f);
+    // Evaluate/interpolate finite element fields on cells.
+    scalar_view valPhi("valPhi", c, f, p);
+    scalar_view gradPhi("gradPhi", c, f, p, d);
+    Kokkos::deep_copy(valPhi, fePrs_->N());
+    Kokkos::deep_copy(gradPhi, fePrs_->gradN());
+    /*** Evaluate weak form of the Jacobian. ***/
+    multiplyByRadius(gradPhi,fePrs_->cubPts(),true);
+    multiplyByRadius(valPhi,fePrs_->cubPts(),true);
+    fst::integrate(riesz,gradPhi,fePrs_->gradNdetJ(),false);
+    fst::integrate(riesz,valPhi,fePrs_->NdetJ(),true);
+  }
+
+  void RieszMap_2(scalar_view & riesz) override {
+    // Retrieve dimensions.
+    const int c  = feCtrl_->gradN().extent_int(0);
+    const int f  = feCtrl_->gradN().extent_int(1);
+    const int p  = feCtrl_->gradN().extent_int(2);
+    // Initialize Jacobians.
+    riesz = scalar_view("riesz2", c, f, f);
+    // Evaluate on FE basis.
+    scalar_view valPsi("valPsi", c, f, p);
+    Kokkos::deep_copy(valPsi, feCtrl_->N());
+    /*** Evaluate weak form of the residual. ***/
+    multiplyByRadius(valPsi,feCtrl_->cubPts(),true);
+    fst::integrate(riesz,valPsi,feCtrl_->NdetJ(),false);
+  }
+
+  std::vector<basis_ptr> getFields() override {
+    return basisPtrs_;
+  }
+
+  std::vector<basis_ptr> getFields2() override {
+    return basisPtrsCtrl_;
+  }
+
+  void setCellNodes(const scalar_view &volCellNodes,
+                    const std::vector<std::vector<scalar_view>> &bdryCellNodes,
+                    const std::vector<std::vector<std::vector<int>>> &bdryCellLocIds) override {
+    volCellNodes_ = volCellNodes;
+    bdryCellNodes_ = bdryCellNodes;
+    bdryCellLocIds_ = bdryCellLocIds;
+    // Finite element definition.
+    fePrs_  = ROL::makePtr<fe_type>(volCellNodes_,basisPtrPrs_,cellCub_);
+    feCtrl_ = ROL::makePtr<fe_type>(volCellNodes_,basisPtrCtrl_,cellCub_,false);
+    fpidx_  = fePrs_->getBoundaryDofs();
+    // Construct control boundary FE
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      fePrsBdry_.resize(numSideSets);
+      feCtrlBdry_.resize(numSideSets);
+      for (int i = 0; i < numSideSets; ++i) {
+        int numLocSides = bdryCellNodes[i].size();
+        fePrsBdry_[i].resize(numLocSides);
+        feCtrlBdry_[i].resize(numLocSides);
+        for (int j = 0; j < numLocSides; ++j) {
+          if (bdryCellNodes[i][j] != scalar_view()) {
+            fePrsBdry_[i][j] = ROL::makePtr<fe_type>(bdryCellNodes[i][j],basisPtrPrs_,bdryCub_,j);
+            feCtrlBdry_[i][j] = ROL::makePtr<fe_type>(bdryCellNodes[i][j],basisPtrCtrl_,bdryCub_,j);
+          }
+        }
+      }
+    }
+  }
+
+  void setFieldPattern(const std::vector<std::vector<int>> &fieldPattern,
+                       const std::vector<std::vector<int>> &fieldPattern2) override {
+    fieldPattern_ = fieldPattern;
+    fieldInfo_    = ROL::makePtr<FieldUtils::FieldInfo>(numFields_,numDofs_,numFieldDofs_,fieldPattern_);
+    fieldPatternCtrl_ = fieldPattern2;
+    fieldInfoCtrl_    = ROL::makePtr<FieldUtils::FieldInfo>(numFieldsCtrl_,numDofsCtrl_,numFieldDofsCtrl_,fieldPatternCtrl_);
+  }
+
+  void printData(std::string tag,
+                 const scalar_view u_coeff,
+                 const scalar_view z_coeff = scalar_view(),
+                 const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c = fePrs_->gradN().extent_int(0);
+    const int p = fePrs_->gradN().extent_int(2);
+    const int d = fePrs_->gradN().extent_int(3);
+    // Evaluate/interpolate finite element fields on cells.
+    scalar_view gradP("gradP", c, p, d);
+    scalar_view AgradP("AgradP", c, p, d);
+    scalar_view valCtrl("valCtrl", c, p);
+    scalar_view alpha("alpha", c, p);
+    fePrs_->evaluateGradient(gradP, u_coeff);
+    feCtrl_->evaluateValue(valCtrl, z_coeff);
+    perm_->compute(alpha, valCtrl, fePrs_->cubPts(), 0);
+    fst::scalarMultiplyDataData(AgradP,alpha,gradP);
+    // Print to velocity file
+    std::stringstream nameVel;
+    nameVel << "velocity" << tag << ".txt";
+    std::ofstream fileVel;
+    fileVel.open(nameVel.str());
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        fileVel << std::scientific << std::setprecision(15);
+        for (int k = 0; k < d; ++k) fileVel << std::setw(25) << (fePrs_->cubPts())(i,j,k);
+        for (int k = 0; k < d; ++k) fileVel << std::setw(25) << -AgradP(i,j,k);
+        fileVel << std::endl;
+      }
+    }
+    fileVel.close();
+    // Print to permeability file
+    std::stringstream namePer;
+    namePer << "permeability" << tag << ".txt";
+    std::ofstream filePer;
+    filePer.open(namePer.str());
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        filePer << std::scientific << std::setprecision(15);
+        for (int k = 0; k < d; ++k) filePer << std::setw(25) << (fePrs_->cubPts())(i,j,k);
+        filePer << std::setw(25) << dynvisco_*alpha(i,j);
+        filePer << std::endl;
+      }
+    }
+    filePer.close();
+  }
+
+  const ROL::Ptr<fe_type> getPressureFE(void) const {
+    return fePrs_;
+  }
+
+  const ROL::Ptr<fe_type> getControlFE(void) const {
+    return feCtrl_;
+  }
+
+  const std::vector<ROL::Ptr<fe_type>> getPressureBdryFE(const int sideset = -1) const {
+    int side = (sideset < 0 || sideset > 4) ? 0 : sideset;
+    return fePrsBdry_[side];
+  }
+
+  const std::vector<ROL::Ptr<fe_type>> getControlBdryFE(const int sideset = -1) const {
+    int side = (sideset < 0 || sideset > 4) ? 0 : sideset;
+    return feCtrlBdry_[side];
+  }
+
+  const std::vector<std::vector<int>> getBdryCellLocIds(const int sideset = -1) const {
+    int side = (sideset < 0 || sideset > 4) ? 0 : sideset;
+    return bdryCellLocIds_[side];
+  }
+
+  const ROL::Ptr<Permeability<Real,DeviceType>> getPermeability(void) const {
+    return perm_;
+  }
+
+  const ROL::Ptr<const FieldUtils::FieldInfo> getStateFieldInfo(void) const {
+    return fieldInfo_;
+  }
+
+  const ROL::Ptr<const FieldUtils::FieldInfo> getControlFieldInfo(void) const {
+    return fieldInfoCtrl_;
+  }
+
+}; // PDE_Darcy
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/filteredDarcy/pde_filterK.hpp b/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/filteredDarcy/pde_filterK.hpp
new file mode 100644
index 000000000000..adff2d972dce
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/filteredDarcy/pde_filterK.hpp
@@ -0,0 +1,385 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  pde_darcy.hpp
+    \brief Implements the local PDE interface for the filtered Darcy porosity optimization problem.
+*/
+
+#ifndef PDE_FILTERK_HPP
+#define PDE_FILTERK_HPP
+
+#include "../../../../TOOLS/pdeK.hpp"
+#include "../../../../TOOLS/feK.hpp"
+#include "../../../../TOOLS/fieldhelperK.hpp"
+#include "../../../../TOOLS/Intrepid2_CubatureNodal.hpp"
+
+#include "Intrepid2_HVOL_C0_FEM.hpp"
+#include "Intrepid2_HGRAD_QUAD_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_QUAD_C2_FEM.hpp"
+#include "Intrepid2_HGRAD_TRI_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_TRI_C2_FEM.hpp"
+#include "Intrepid2_DefaultCubatureFactory.hpp"
+#include "Intrepid2_FunctionSpaceTools.hpp"
+#include "Intrepid2_CellTools.hpp"
+
+#include "ROL_Ptr.hpp"
+
+template <class Real, class DeviceType>
+class PDE_Filter : public PDE<Real, DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  // Finite element basis information
+  basis_ptr basisPtrPrs_, basisPtrCtrl_;
+  std::vector<basis_ptr> basisPtrs_, basisPtrsCtrl_;
+  // Cell cubature information
+  ROL::Ptr<Intrepid2::Cubature<DeviceType,Real,Real>> cellCub_;
+  // Cell node information
+  scalar_view volCellNodes_;
+  std::vector<std::vector<scalar_view>> bdryCellNodes_;
+  std::vector<std::vector<std::vector<int>>> bdryCellLocIds_;
+  // Finite element definition
+  ROL::Ptr<fe_type> feFilter_, feCtrl_;
+  // Field pattern, offsets, etc.
+  std::vector<std::vector<int>> fieldPattern_;  // local Field/DOF pattern; set from DOF manager 
+  int numFields_;                               // number of fields (equations in the PDE)
+  int numDofs_;                                 // total number of degrees of freedom for all (local) fields
+  std::vector<int> offset_;                     // for each field, a counting offset
+  std::vector<int> numFieldDofs_;               // for each field, number of degrees of freedom
+  std::vector<std::vector<int>> fieldPatternCtrl_;  // local Field/DOF pattern; set from DOF manager 
+  int numFieldsCtrl_;                               // number of fields (equations in the PDE)
+  int numDofsCtrl_;                                 // total number of degrees of freedom for all (local) fields
+  std::vector<int> offsetCtrl_;                     // for each field, a counting offset
+  std::vector<int> numFieldDofsCtrl_;               // for each field, number of degrees of freedom
+  
+  // Problem parameters.
+  Real filterRadius2_;
+
+  ROL::Ptr<FieldUtils::FieldInfo> fieldInfo_, fieldInfoCtrl_;
+
+  void multiplyByRadius(scalar_view &input, const scalar_view cubPts,
+                        bool isField, bool useReciprocal = false) const {
+    int c = cubPts.extent_int(0);
+    int p = cubPts.extent_int(1);
+    scalar_view r("r",c,p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        if (!useReciprocal) r(i,j) = cubPts(i,j,0);
+        else                r(i,j) = static_cast<Real>(1)/cubPts(i,j,0);
+      }
+    }
+    scalar_view in(input);
+    if (!isField)
+      fst::scalarMultiplyDataData(input,r,in);
+    else
+      fst::scalarMultiplyDataField(input,r,in);
+  }
+
+public:
+  PDE_Filter(ROL::ParameterList &parlist) {
+    // Finite element fields.
+    int cubDegree     = parlist.sublist("Problem").get("Filter Cubature Degree",4);
+    int basisDegPres  = parlist.sublist("Problem").get("Filter Basis Degree",1);
+    int basisDegCtrl  = parlist.sublist("Problem").get("Density Basis Degree",1);
+    std::string elemType = parlist.sublist("Problem").get("Element Type","QUAD");
+    if (elemType == "TRI") {
+      if (basisDegPres == 2)
+        basisPtrPrs_ = ROL::makePtr<Intrepid2::Basis_HGRAD_TRI_C2_FEM<DeviceType,Real,Real>>();
+      else 
+        basisPtrPrs_ = ROL::makePtr<Intrepid2::Basis_HGRAD_TRI_C1_FEM<DeviceType,Real,Real>>();
+    }
+    else {
+      if (basisDegPres == 2)
+        basisPtrPrs_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C2_FEM<DeviceType,Real,Real>>();
+      else 
+        basisPtrPrs_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C1_FEM<DeviceType,Real,Real>>();
+    }
+    basisPtrs_.push_back(basisPtrPrs_);       // Pressure component
+    shards::CellTopology cellType = basisPtrs_[0]->getBaseCellTopology();        // get the cell type from any basis
+    if (basisDegCtrl == 1) {
+      if (elemType == "TRI")
+        basisPtrCtrl_ = ROL::makePtr<Intrepid2::Basis_HGRAD_TRI_C1_FEM<DeviceType,Real,Real>>();
+      else
+        basisPtrCtrl_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C1_FEM<DeviceType,Real,Real>>();
+    }
+    else {
+      basisPtrCtrl_ = ROL::makePtr<Intrepid2::Basis_HVOL_C0_FEM<DeviceType,Real,Real>>(cellType);
+    }
+    basisPtrsCtrl_.push_back(basisPtrCtrl_);  // Control component
+    // Quadrature rules.
+    Intrepid2::DefaultCubatureFactory cubFactory;                            // create cubature factory
+    //cellCub_ = cubFactory.create<DeviceType,Real,Real>(cellType, cubDegree); // create default cubature
+    if (cubDegree == -1) {  // nodal cubature
+      cellCub_ = ROL::makePtr<Intrepid2::CubatureNodal<DeviceType,Real,Real>>(cellType);
+    }
+    else {                  // default cubature
+      cellCub_ = cubFactory.create<DeviceType,Real,Real>(cellType, cubDegree); // create default cubature
+    }
+
+    // Other problem parameters.
+    Real filterRadius = parlist.sublist("Problem").get("Filter Radius", 1e-2);
+    filterRadius2_ = filterRadius*filterRadius;
+
+    numDofs_ = 0;
+    numFields_ = basisPtrs_.size();
+    offset_.resize(numFields_);
+    numFieldDofs_.resize(numFields_);
+    for (int i=0; i<numFields_; ++i) {
+      offset_[i] = i==0 ? 0 : offset_[i-1] + basisPtrs_[i-1]->getCardinality();
+      numFieldDofs_[i] = basisPtrs_[i]->getCardinality();
+      numDofs_ += numFieldDofs_[i];
+    }
+    numDofsCtrl_ = 0;
+    numFieldsCtrl_ = basisPtrsCtrl_.size();
+    offsetCtrl_.resize(numFieldsCtrl_);
+    numFieldDofsCtrl_.resize(numFieldsCtrl_);
+    for (int i=0; i<numFieldsCtrl_; ++i) {
+      offsetCtrl_[i] = i==0 ? 0 : offsetCtrl_[i-1] + basisPtrsCtrl_[i-1]->getCardinality();
+      numFieldDofsCtrl_[i] = basisPtrsCtrl_[i]->getCardinality();
+      numDofsCtrl_ += numFieldDofsCtrl_[i];
+    }
+  }
+
+  void residual(scalar_view & res,
+                const scalar_view u_coeff,
+                const scalar_view z_coeff = scalar_view(),
+                const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c = feFilter_->gradN().extent_int(0);
+    const int f = feFilter_->gradN().extent_int(1);
+    const int p = feFilter_->gradN().extent_int(2);
+    const int d = feFilter_->gradN().extent_int(3);
+    // Initialize residuals.
+    res = scalar_view("res", c, f);
+    // Evaluate/interpolate finite element fields on cells.
+    scalar_view valP("valP", c, p);
+    scalar_view gradP("gradP", c, p, d);
+    scalar_view valCtrl("valCtrl", c, p);
+    feFilter_->evaluateValue(valP, u_coeff);
+    feFilter_->evaluateGradient(gradP, u_coeff);
+    feCtrl_->evaluateValue(valCtrl, z_coeff);
+    rst::scale(gradP,filterRadius2_);
+    rst::scale(valCtrl,static_cast<Real>(-1));
+    /*** Evaluate weak form of the residual. ***/
+    multiplyByRadius(valP,feFilter_->cubPts(),false);
+    multiplyByRadius(gradP,feFilter_->cubPts(),false);
+    multiplyByRadius(valCtrl,feFilter_->cubPts(),false);
+    fst::integrate(res,gradP,feFilter_->gradNdetJ(),false);
+    fst::integrate(res,valP,feFilter_->NdetJ(),true);
+    fst::integrate(res,valCtrl,feFilter_->NdetJ(),true);
+  }
+
+  void Jacobian_1(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c = feFilter_->gradN().extent_int(0);
+    const int f = feFilter_->gradN().extent_int(1);
+    const int p = feFilter_->gradN().extent_int(2);
+    const int d = feFilter_->gradN().extent_int(3);
+    // Initialize Jacobians.
+    jac = scalar_view("jac", c, f, f);
+    // Evaluate/interpolate finite element fields on cells.
+    scalar_view valPhi("valPhi", c, f, p);
+    scalar_view gradPhi("gradPhi", c, f, p, d);
+    Kokkos::deep_copy(valPhi,feFilter_->N());
+    rst::scale(gradPhi, feFilter_->gradN(), filterRadius2_);
+    /*** Evaluate weak form of the Jacobian. ***/
+    multiplyByRadius(gradPhi,feFilter_->cubPts(),true);
+    multiplyByRadius(valPhi,feFilter_->cubPts(),true);
+    fst::integrate(jac,gradPhi,feFilter_->gradNdetJ(),false);
+    fst::integrate(jac,valPhi,feFilter_->NdetJ(),true);
+  }
+
+  void Jacobian_2(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c  = feFilter_->gradN().extent_int(0);
+    const int f  = feFilter_->gradN().extent_int(1);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int p  = feFilter_->gradN().extent_int(2);
+    // Initialize Jacobians.
+    jac = scalar_view("jac", c, f, fc);
+    // Evaluate on FE basis.
+    scalar_view valPsi("valPsi", c, fc, p);
+    rst::scale(valPsi,feCtrl_->N(),static_cast<Real>(-1));
+    /*** Evaluate weak form of the residual. ***/
+    multiplyByRadius(valPsi,feCtrl_->cubPts(),true);
+    fst::integrate(jac,feFilter_->NdetJ(),valPsi,false);
+  }
+
+  void Jacobian_3(std::vector<scalar_view> & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> Jacobian_3 is zero!");
+  }
+
+  void Hessian_11(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> Hessian_11 is zero!");
+  }
+
+  void Hessian_12(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> Hessian_12 is zero!");
+  }
+
+  void Hessian_13(std::vector<scalar_view> & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> Hessian_13 is zero!");
+  }
+
+  void Hessian_21(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> Hessian_21 is zero!");
+  }
+
+  void Hessian_22(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> Hessian_22 is zero!");
+  }
+
+  void Hessian_23(std::vector<scalar_view> & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> Hessian_23 is zero!");
+  }
+
+  void Hessian_31(std::vector<scalar_view> & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> Hessian_31 is zero!");
+  }
+
+  void Hessian_32(std::vector<scalar_view> & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> Hessian_32 is zero!");
+  }
+
+  void Hessian_33(std::vector<std::vector<scalar_view>> & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> Hessian_33 is zero!");
+  }
+
+  void RieszMap_1(scalar_view & riesz) override {
+    // Retrieve dimensions.
+    const int c = feFilter_->gradN().extent_int(0);
+    const int f = feFilter_->gradN().extent_int(1);
+    const int p = feFilter_->gradN().extent_int(2);
+    const int d = feFilter_->gradN().extent_int(3);
+    // Initialize Jacobians.
+    riesz = scalar_view("riesz1", c, f, f);
+    // Evaluate/interpolate finite element fields on cells.
+    scalar_view valPhi("valPhi", c, f, p);
+    scalar_view gradPhi("gradPhi", c, f, p, d);
+    Kokkos::deep_copy(valPhi, feFilter_->N());
+    rst::scale(gradPhi, feFilter_->gradN(), filterRadius2_);
+    /*** Evaluate weak form of the Jacobian. ***/
+    multiplyByRadius(gradPhi,feFilter_->cubPts(),true);
+    multiplyByRadius(valPhi,feFilter_->cubPts(),true);
+    fst::integrate(riesz,gradPhi,feFilter_->gradNdetJ(),false);
+    fst::integrate(riesz,valPhi,feFilter_->NdetJ(),true);
+  }
+
+  void RieszMap_2(scalar_view & riesz) override {
+    // Retrieve dimensions.
+    const int c  = feCtrl_->gradN().extent_int(0);
+    const int f  = feCtrl_->gradN().extent_int(1);
+    const int p  = feCtrl_->gradN().extent_int(2);
+    // Initialize Jacobians.
+    riesz = scalar_view("riesz", c, f, f);
+    // Evaluate on FE basis.
+    scalar_view valPsi("valPsi", c, f, p);
+    Kokkos::deep_copy(valPsi, feCtrl_->N());
+    /*** Evaluate weak form of the residual. ***/
+    multiplyByRadius(valPsi,feCtrl_->cubPts(),true);
+    fst::integrate(riesz,valPsi,feCtrl_->NdetJ(),false);
+  }
+
+  std::vector<basis_ptr> getFields() override {
+    return basisPtrs_;
+  }
+
+  std::vector<basis_ptr> getFields2() override {
+    return basisPtrsCtrl_;
+  }
+
+  void setCellNodes(const scalar_view &volCellNodes,
+                    const std::vector<std::vector<scalar_view>> &bdryCellNodes,
+                    const std::vector<std::vector<std::vector<int>>> &bdryCellLocIds) override {
+    volCellNodes_ = volCellNodes;
+    bdryCellNodes_ = bdryCellNodes;
+    bdryCellLocIds_ = bdryCellLocIds;
+    // Finite element definition.
+    feFilter_ = ROL::makePtr<fe_type>(volCellNodes_,basisPtrPrs_,cellCub_);
+    feCtrl_   = ROL::makePtr<fe_type>(volCellNodes_,basisPtrCtrl_,cellCub_,false);
+  }
+
+  void setFieldPattern(const std::vector<std::vector<int>> &fieldPattern,
+                       const std::vector<std::vector<int>> &fieldPattern2) override {
+    fieldPattern_ = fieldPattern;
+    fieldInfo_    = ROL::makePtr<FieldUtils::FieldInfo>(numFields_,numDofs_,numFieldDofs_,fieldPattern_);
+    fieldPatternCtrl_ = fieldPattern2;
+    fieldInfoCtrl_    = ROL::makePtr<FieldUtils::FieldInfo>(numFieldsCtrl_,numDofsCtrl_,numFieldDofsCtrl_,fieldPatternCtrl_);
+  }
+
+  const ROL::Ptr<fe_type> getFilterFE() const {
+    return feFilter_;
+  }
+
+  const ROL::Ptr<fe_type> getControlFE() const {
+    return feCtrl_;
+  }
+
+  const ROL::Ptr<const FieldUtils::FieldInfo> getStateFieldInfo(void) const {
+    return fieldInfo_;
+  }
+
+  const ROL::Ptr<const FieldUtils::FieldInfo> getControlFieldInfo(void) const {
+    return fieldInfoCtrl_;
+  }
+
+}; // PDE_Filter
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/filteredDarcy/permeabilityK.hpp b/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/filteredDarcy/permeabilityK.hpp
new file mode 100644
index 000000000000..ccfc3566aebf
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/flow-opt/axisymmetric/models/filteredDarcy/permeabilityK.hpp
@@ -0,0 +1,93 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+#ifndef PERMEABILITYK_HPP
+#define PERMEABILITYK_HPP
+
+#include <vector>
+#include "ROL_ParameterList.hpp"
+#include "ROL_Ptr.hpp"
+#include "../../../../TOOLS/feK.hpp"
+
+template<class Real, class DeviceType>
+class Permeability {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+
+private:
+  Real dRadius_, viscosity_, minPerm_, maxPerm_;
+  int type_;
+
+  Real value(Real z) const {
+    if (type_==1) {
+      const Real a(minPerm_/viscosity_), b(std::log(maxPerm_/minPerm_));
+      return a * std::exp(b * z);
+    }
+    else {
+      const Real a(minPerm_/viscosity_), b(maxPerm_/viscosity_);
+      return a + z * (b - a);
+    }
+  }
+
+  Real deriv1(Real z) const {
+    if (type_==1) {
+      const Real a(minPerm_/viscosity_), b(std::log(maxPerm_/minPerm_));
+      return a * b * std::exp(b * z);
+    }
+    else {
+      const Real a(minPerm_/viscosity_), b(maxPerm_/viscosity_);
+      return b - a;
+    }
+  }
+
+  Real deriv2(Real z) const {
+    if (type_==1) {
+      const Real a(minPerm_/viscosity_), b(std::log(maxPerm_/minPerm_));
+      return a * b * b * std::exp(b * z);
+    }
+    else {
+      return static_cast<Real>(0);
+    }
+  }
+
+public:
+  Permeability(ROL::ParameterList &list) {
+    bool useDarcy = list.sublist("Problem").get("Use Darcy Flow",true);
+    dRadius_   = list.sublist("Problem").get("Diffuser Radius",5.0);        // mm
+    viscosity_ = list.sublist("Problem").get("Dynamic Viscosity", 0.84e-8); // kg/mm-s
+    minPerm_   = list.sublist("Problem").get("Minimum Permeability", 3e-7); // mm^2
+    maxPerm_   = list.sublist("Problem").get("Maximum Permeability", 3e-6); // mm^2
+    if (useDarcy) dRadius_ += static_cast<Real>(10);
+    type_      = list.sublist("Problem").get("Parametrization Type", 0);
+    // type_ = 0 ... K = z kmin + (1-z) kmax
+    // type_ = 1 ... K = exp((1-z) log(kmin) + z log(kmax))
+  }
+
+  void compute(scalar_view &alpha, const scalar_view z, const scalar_view pts, int deriv=0) const {
+    const Real tol = std::sqrt(ROL::ROL_EPSILON<Real>());
+    const Real zero(0), one(1);
+    const int c = pts.extent_int(0);
+    const int p = pts.extent_int(1);
+    const int d = pts.extent_int(2);
+    Real weight(0), norm(0);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        norm = zero;
+        for (int k = 0; k < d; ++k) norm += pts(i,j,k)*pts(i,j,k);
+        weight = (std::sqrt(norm) <= dRadius_ + tol ? one : zero);
+        // Compute spatially dependent viscosity
+        if (deriv==1)      alpha(i,j) = weight * deriv1(z(i,j));
+        else if (deriv==2) alpha(i,j) = weight * deriv2(z(i,j));
+        else               alpha(i,j) = weight * value(z(i,j));
+      }
+    }
+  }
+};
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/fractional/poisson/CMakeLists.txt b/packages/rol/example/PDE-OPT/fractional/poisson/CMakeLists.txt
index e9c783c398bb..2e6fa50bf63d 100644
--- a/packages/rol/example/PDE-OPT/fractional/poisson/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/fractional/poisson/CMakeLists.txt
@@ -7,19 +7,19 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   # Need ROL_TpetraMultiVector.hpp
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_01
     SOURCES example_01.cpp
     ADD_DIR_TO_NAME
   )
 
-#  TRIBITS_ADD_EXECUTABLE(
+#  ROL_ADD_EXECUTABLE(
 #    example_02
 #    SOURCES example_02.cpp
 #    ADD_DIR_TO_NAME
 #  )
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     FractionalPoissonDataCopy
     SOURCE_FILES
       input.xml plotresults.m
diff --git a/packages/rol/example/PDE-OPT/fractional/poisson/example_01.cpp b/packages/rol/example/PDE-OPT/fractional/poisson/example_01.cpp
index a34f84d3a928..78e1a9e82db6 100644
--- a/packages/rol/example/PDE-OPT/fractional/poisson/example_01.cpp
+++ b/packages/rol/example/PDE-OPT/fractional/poisson/example_01.cpp
@@ -13,8 +13,7 @@
 
 #include "Teuchos_Comm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -94,7 +93,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int> > comm
     = Tpetra::getDefaultComm();
   const int myRank = comm->getRank();
diff --git a/packages/rol/example/PDE-OPT/fractional/poisson/example_02.cpp b/packages/rol/example/PDE-OPT/fractional/poisson/example_02.cpp
index 829e20e0a0a1..3b3ba12ff4b3 100644
--- a/packages/rol/example/PDE-OPT/fractional/poisson/example_02.cpp
+++ b/packages/rol/example/PDE-OPT/fractional/poisson/example_02.cpp
@@ -13,8 +13,8 @@
 
 #include "Teuchos_Comm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
+#include "ROL_XMLReader.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -43,7 +43,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int> > comm
     = Tpetra::getDefaultComm();
   const int myRank = comm->getRank();
diff --git a/packages/rol/example/PDE-OPT/ginzburg-landau/CMakeLists.txt b/packages/rol/example/PDE-OPT/ginzburg-landau/CMakeLists.txt
index 50398a030963..d1ff6bed9c82 100644
--- a/packages/rol/example/PDE-OPT/ginzburg-landau/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/ginzburg-landau/CMakeLists.txt
@@ -9,19 +9,19 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_01
     SOURCES example_01.cpp
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_02
     SOURCES example_02.cpp
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_03
     SOURCES example_03.cpp
     ADD_DIR_TO_NAME
@@ -45,7 +45,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     GinzburgLandauDataCopy
     SOURCE_FILES
       input.xml input_ex03.xml plotresults.m
diff --git a/packages/rol/example/PDE-OPT/ginzburg-landau/example_01.cpp b/packages/rol/example/PDE-OPT/ginzburg-landau/example_01.cpp
index cdcd0104f7cd..4c21aa1b33d2 100644
--- a/packages/rol/example/PDE-OPT/ginzburg-landau/example_01.cpp
+++ b/packages/rol/example/PDE-OPT/ginzburg-landau/example_01.cpp
@@ -14,8 +14,7 @@
 #include "Teuchos_Comm.hpp"
 #include "Teuchos_Time.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -43,7 +42,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int>> comm
     = Tpetra::getDefaultComm();
   const int myRank = comm->getRank();
diff --git a/packages/rol/example/PDE-OPT/ginzburg-landau/example_02.cpp b/packages/rol/example/PDE-OPT/ginzburg-landau/example_02.cpp
index c0fe156bd820..8bf640b8ee76 100644
--- a/packages/rol/example/PDE-OPT/ginzburg-landau/example_02.cpp
+++ b/packages/rol/example/PDE-OPT/ginzburg-landau/example_02.cpp
@@ -14,8 +14,7 @@
 #include "Teuchos_Comm.hpp"
 #include "Teuchos_Time.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -51,7 +50,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int> > comm
     = Tpetra::getDefaultComm();
   const int myRank = comm->getRank();
diff --git a/packages/rol/example/PDE-OPT/ginzburg-landau/example_03.cpp b/packages/rol/example/PDE-OPT/ginzburg-landau/example_03.cpp
index 7883fdbeb68d..60c9550afe13 100644
--- a/packages/rol/example/PDE-OPT/ginzburg-landau/example_03.cpp
+++ b/packages/rol/example/PDE-OPT/ginzburg-landau/example_03.cpp
@@ -14,8 +14,7 @@
 #include "Teuchos_Comm.hpp"
 #include "Teuchos_Time.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -43,7 +42,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int> > comm
     = Tpetra::getDefaultComm();
   const int myRank = comm->getRank();
diff --git a/packages/rol/example/PDE-OPT/helmholtz/CMakeLists.txt b/packages/rol/example/PDE-OPT/helmholtz/CMakeLists.txt
index 4d474b848721..d0d94030c06c 100644
--- a/packages/rol/example/PDE-OPT/helmholtz/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/helmholtz/CMakeLists.txt
@@ -1,5 +1,4 @@
-
-IF(${PROJECT_NAME}_ENABLE_Intrepid AND 
+IF(${PROJECT_NAME}_ENABLE_Intrepid AND
    ${PROJECT_NAME}_ENABLE_Amesos2  AND
    ${PROJECT_NAME}_ENABLE_Ifpack2  AND
    ${PROJECT_NAME}_ENABLE_MueLu    AND
@@ -9,13 +8,13 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_01
     SOURCES example_01.cpp
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_02
     SOURCES example_02.cpp
     ADD_DIR_TO_NAME
@@ -39,7 +38,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     HelmholtzDataCopy
     SOURCE_FILES
       input.xml plotresults.m
@@ -49,3 +48,44 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
 
 
 ENDIF()
+
+IF(${PROJECT_NAME}_ENABLE_Intrepid2 AND
+   ${PROJECT_NAME}_ENABLE_Amesos2   AND
+   ${PROJECT_NAME}_ENABLE_Ifpack2   AND
+   ${PROJECT_NAME}_ENABLE_MueLu     AND
+   ${PROJECT_NAME}_ENABLE_Tpetra )
+
+  # Need ROL_TpetraMultiVector.hpp
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
+
+  ROL_ADD_EXECUTABLE_AND_TEST(
+    example_01_Kokkos
+    SOURCES example_01K.cpp
+    ARGS PrintItAll
+    NUM_MPI_PROCS 4
+    NUM_TOTAL_CORES_USED 4
+    PASS_REGULAR_EXPRESSION "TEST PASSED"
+    ADD_DIR_TO_NAME
+  )
+
+  ROL_ADD_EXECUTABLE_AND_TEST(
+    example_02_Kokkos
+    SOURCES example_02K.cpp
+    ARGS PrintItAll
+    NUM_MPI_PROCS 1
+    NUM_TOTAL_CORES_USED 1
+    PASS_REGULAR_EXPRESSION "TEST PASSED"
+    ADD_DIR_TO_NAME
+  )
+
+  ROL_COPY_FILES_TO_BINARY_DIR(
+    HelmholtzDataCopyK
+    SOURCE_FILES
+      input.xml plotresults.m
+    SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}"
+    DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
+  )
+
+
+ENDIF()
diff --git a/packages/rol/example/PDE-OPT/helmholtz/example_01.cpp b/packages/rol/example/PDE-OPT/helmholtz/example_01.cpp
index bd604b7c5953..aa22daf319c9 100644
--- a/packages/rol/example/PDE-OPT/helmholtz/example_01.cpp
+++ b/packages/rol/example/PDE-OPT/helmholtz/example_01.cpp
@@ -13,8 +13,7 @@
 
 #include "Teuchos_Comm.hpp"
 #include "Teuchos_Time.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -43,7 +42,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int> > comm
     = Tpetra::getDefaultComm();
   const int myRank = comm->getRank();
diff --git a/packages/rol/example/PDE-OPT/helmholtz/example_01K.cpp b/packages/rol/example/PDE-OPT/helmholtz/example_01K.cpp
new file mode 100644
index 000000000000..2e8d0be73846
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/helmholtz/example_01K.cpp
@@ -0,0 +1,261 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  example_01.cpp
+    \brief Shows how to solve the optimal control of Helmholtz problem.
+*/
+
+#include "Teuchos_Comm.hpp"
+#include "Teuchos_Time.hpp"
+#include "ROL_GlobalMPISession.hpp"
+
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Version.hpp"
+
+#include <iostream>
+#include <algorithm>
+
+#include "ROL_Stream.hpp"
+#include "ROL_Solver.hpp"
+#include "ROL_Reduced_Objective_SimOpt.hpp"
+
+#include "../TOOLS/linearpdeconstraintK.hpp"
+#include "../TOOLS/pdeobjectiveK.hpp"
+#include "../TOOLS/pdevectorK.hpp"
+#include "../TOOLS/meshmanagerK.hpp"
+
+#include "pde_helmholtzK.hpp"
+#include "obj_helmholtzK.hpp"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+
+int main(int argc, char *argv[]) {
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+
+  /*** Initialize communicator. ***/
+  ROL::GlobalMPISession mpiSession(&argc, &argv, &bhs);
+  Kokkos::ScopeGuard kokkosScope(argc, argv);
+  ROL::Ptr<const Teuchos::Comm<int>> comm = Tpetra::getDefaultComm();
+  const int myRank = comm->getRank();
+  if ((iprint > 0) && (myRank == 0)) {
+    outStream = ROL::makePtrFromRef(std::cout);
+  }
+  else {
+    outStream = ROL::makePtrFromRef(bhs);
+  }
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+    RealT tol(1e-8);// one(1);
+
+    /*** Read in XML input ***/
+    std::string filename = "input.xml";
+    auto parlist = Teuchos::rcp( new Teuchos::ParameterList() );
+    Teuchos::updateParametersFromXmlFile( filename, parlist.ptr() );
+
+    /*** Initialize main data structure. ***/
+    auto meshMgr = ROL::makePtr<MeshManager_Rectangle<RealT, DeviceT>>(*parlist);
+    // Initialize PDE describing elasticity equations.
+    auto pde = ROL::makePtr<PDE_Helmholtz<RealT, DeviceT>>(*parlist);
+    ROL::Ptr<ROL::Constraint_SimOpt<RealT>> con
+      = ROL::makePtr<Linear_PDE_Constraint<RealT, DeviceT>>(pde, meshMgr, comm, *parlist, *outStream);
+    // Cast the constraint and get the assembler.
+    ROL::Ptr<Linear_PDE_Constraint<RealT, DeviceT>> pdecon
+      = ROL::dynamicPtrCast<Linear_PDE_Constraint<RealT, DeviceT>>(con);
+    ROL::Ptr<Assembler<RealT, DeviceT>> assembler = pdecon->getAssembler();
+    con->setSolveParameters(*parlist);
+
+    // Create state vector.
+    auto u_ptr = assembler->createStateVector();
+    u_ptr->randomize();
+    auto up = ROL::makePtr<PDE_PrimalSimVector<RealT, DeviceT>>(u_ptr, pde, assembler, *parlist);
+    auto p_ptr = assembler->createStateVector();
+    p_ptr->randomize();
+    auto pp = ROL::makePtr<PDE_PrimalSimVector<RealT, DeviceT>>(p_ptr, pde, assembler, *parlist);
+    // Create control vector.
+    auto z_ptr = assembler->createControlVector();
+    z_ptr->randomize();
+    auto zp = ROL::makePtr<PDE_PrimalOptVector<RealT, DeviceT>>(z_ptr, pde, assembler, *parlist);
+    // Create residual vector.
+    auto r_ptr = assembler->createResidualVector();
+    r_ptr->putScalar(0.0);
+    auto rp = ROL::makePtr<PDE_DualSimVector<RealT, DeviceT>>(r_ptr, pde, assembler, *parlist);
+    // Create state direction vector.
+    auto du_ptr = assembler->createStateVector();
+    du_ptr->randomize();
+    //du_ptr->putScalar(0);
+    auto dup = ROL::makePtr<PDE_PrimalSimVector<RealT, DeviceT>>(du_ptr, pde, assembler, *parlist);
+    // Create control direction vector.
+    auto dz_ptr = assembler->createControlVector();
+    dz_ptr->randomize();
+    //dz_ptr->putScalar(0);
+    auto dzp = ROL::makePtr<PDE_PrimalOptVector<RealT, DeviceT>>(dz_ptr, pde, assembler, *parlist);
+    // Create control test vector.
+    auto rz_ptr = assembler->createControlVector();
+    rz_ptr->randomize();
+    auto rzp = ROL::makePtr<PDE_PrimalOptVector<RealT, DeviceT>>(rz_ptr, pde, assembler, *parlist);
+
+    auto dualu_ptr = assembler->createStateVector();
+    auto dualup = ROL::makePtr<PDE_DualSimVector<RealT, DeviceT>>(dualu_ptr, pde, assembler, *parlist);
+    auto dualz_ptr = assembler->createControlVector();
+    auto dualzp = ROL::makePtr<PDE_DualOptVector<RealT, DeviceT>>(dualz_ptr, pde, assembler, *parlist);
+
+    // Create ROL SimOpt vectors.
+    ROL::Vector_SimOpt<RealT> x(up, zp);
+    ROL::Vector_SimOpt<RealT> d(dup, dzp);
+
+    // Initialize compliance objective function.
+    bool storage = parlist->sublist("Problem").get("Use Storage",true);
+    std::vector<ROL::Ptr<QoI<RealT, DeviceT>>> qoi_vec(2,ROL::nullPtr);
+    qoi_vec[0] = ROL::makePtr<QoI_Helmholtz_StateTracking<RealT, DeviceT>>(pde->getFE(),
+                                                                           pde->getFieldHelper(),
+                                                                           *parlist);
+    qoi_vec[1] = ROL::makePtr<QoI_Helmholtz_ControlPenalty<RealT, DeviceT>>(pde->getFE(),
+                                                                            pde->getFieldHelper(),
+                                                                            *parlist);
+    auto obj = ROL::makePtr<PDE_Objective<RealT, DeviceT>>(qoi_vec, assembler);
+    auto robj = ROL::makePtr<ROL::Reduced_Objective_SimOpt<RealT>>(obj, con, up, zp, pp, storage, false);
+
+    // Run derivative checks
+    bool checkDeriv = parlist->sublist("Problem").get("Check derivatives",false);
+    if ( checkDeriv ) {
+      *outStream << "\n\nCheck Opt Vector\n";
+      zp->checkVector(*dzp, *rzp, true, *outStream);
+
+      std::vector<ROL::Ptr<ROL::Objective_SimOpt<RealT> > > obj_vec(2,ROL::nullPtr);
+      obj_vec[0] = ROL::makePtr<IntegralObjective<RealT, DeviceT>>(qoi_vec[0], assembler);
+      obj_vec[1] = ROL::makePtr<IntegralObjective<RealT, DeviceT>>(qoi_vec[1], assembler);
+
+      *outStream << "\n\nCheck Gradient of State Objective Function\n";
+      obj_vec[0]->checkGradient(x,d,true,*outStream);
+      *outStream << "\n\nCheck Gradient_1 of State Objective Function\n";
+      obj_vec[0]->checkGradient_1(*up,*zp,*dup,true,*outStream);
+      *outStream << "\n\nCheck Gradient_2 of State Objective Function\n";
+      obj_vec[0]->checkGradient_2(*up,*zp,*dzp,true,*outStream);
+      *outStream << "\n\nCheck Hessian of State Objective Function\n";
+      obj_vec[0]->checkHessVec(x,d,true,*outStream);
+      *outStream << "\n\nCheck Hessian_11 of State Objective Function\n";
+      obj_vec[0]->checkHessVec_11(*up,*zp,*dup,true,*outStream);
+      *outStream << "\n\nCheck Hessian_12 of State Objective Function\n";
+      obj_vec[0]->checkHessVec_12(*up,*zp,*dzp,true,*outStream);
+      *outStream << "\n\nCheck Hessian_21 of State Objective Function\n";
+      obj_vec[0]->checkHessVec_21(*up,*zp,*dup,true,*outStream);
+      *outStream << "\n\nCheck Hessian_22 of State Objective Function\n";
+      obj_vec[0]->checkHessVec_22(*up,*zp,*dzp,true,*outStream);
+
+      *outStream << "\n\nCheck Gradient of Control Objective Function\n";
+      obj_vec[1]->checkGradient(x,d,true,*outStream);
+      *outStream << "\n\nCheck Gradient_1 of Control Objective Function\n";
+      obj_vec[1]->checkGradient_1(*up,*zp,*dup,true,*outStream);
+      *outStream << "\n\nCheck Gradient_2 of Control Objective Function\n";
+      obj_vec[1]->checkGradient_2(*up,*zp,*dzp,true,*outStream);
+      *outStream << "\n\nCheck Hessian of Control Objective Function\n";
+      obj_vec[1]->checkHessVec(x,d,true,*outStream);
+      *outStream << "\n\nCheck Hessian_11 of Control Objective Function\n";
+      obj_vec[1]->checkHessVec_11(*up,*zp,*dup,true,*outStream);
+      *outStream << "\n\nCheck Hessian_12 of State Objective Function\n";
+      obj_vec[1]->checkHessVec_12(*up,*zp,*dzp,true,*outStream);
+      *outStream << "\n\nCheck Hessian_21 of State Objective Function\n";
+      obj_vec[1]->checkHessVec_21(*up,*zp,*dup,true,*outStream);
+      *outStream << "\n\nCheck Hessian_22 of Control Objective Function\n";
+      obj_vec[1]->checkHessVec_22(*up,*zp,*dzp,true,*outStream);
+
+      *outStream << "\n\nCheck Gradient of Full Objective Function\n";
+      obj->checkGradient(x,d,true,*outStream);
+      *outStream << "\n\nCheck Gradient_1 of Full Objective Function\n";
+      obj->checkGradient_1(*up,*zp,*dup,true,*outStream);
+      *outStream << "\n\nCheck Gradient_2 of Full Objective Function\n";
+      obj->checkGradient_2(*up,*zp,*dzp,true,*outStream);
+      *outStream << "\n\nCheck Hessian of Full Objective Function\n";
+      obj->checkHessVec(x,d,true,*outStream);
+      *outStream << "\n\nCheck Hessian_11 of Full Objective Function\n";
+      obj->checkHessVec_11(*up,*zp,*dup,true,*outStream);
+      *outStream << "\n\nCheck Hessian_12 of State Objective Function\n";
+      obj->checkHessVec_12(*up,*zp,*dzp,true,*outStream);
+      *outStream << "\n\nCheck Hessian_21 of State Objective Function\n";
+      obj->checkHessVec_21(*up,*zp,*dup,true,*outStream);
+      *outStream << "\n\nCheck Hessian_22 of Full Objective Function\n";
+      obj->checkHessVec_22(*up,*zp,*dzp,true,*outStream);
+
+      *outStream << "\n\nCheck Full Jacobian of PDE Constraint\n";
+      con->checkApplyJacobian(x,d,*rp,true,*outStream);
+      *outStream << "\n\nCheck Jacobian_1 of PDE Constraint\n";
+      con->checkApplyJacobian_1(*up,*zp,*dup,*rp,true,*outStream);
+      *outStream << "\n\nCheck Jacobian_2 of PDE Constraint\n";
+      con->checkApplyJacobian_2(*up,*zp,*dzp,*rp,true,*outStream);
+      *outStream << "\n\nCheck Full Hessian of PDE Constraint\n";
+      con->checkApplyAdjointHessian(x,*pp,d,x,true,*outStream);
+      *outStream << "\n\nCheck Hessian_11 of PDE Constraint\n";
+      con->checkApplyAdjointHessian_11(*up,*zp,*pp,*dup,*dualup,true,*outStream);
+      *outStream << "\n\nCheck Hessian_21 of PDE Constraint\n";
+      con->checkApplyAdjointHessian_21(*up,*zp,*pp,*dzp,*dualup,true,*outStream);
+      *outStream << "\n\nCheck Hessian_12 of PDE Constraint\n";
+      con->checkApplyAdjointHessian_12(*up,*zp,*pp,*dup,*dualzp,true,*outStream);
+      *outStream << "\n\nCheck Hessian_22 of PDE Constraint\n";
+      con->checkApplyAdjointHessian_22(*up,*zp,*pp,*dzp,*dualzp,true,*outStream);
+      *outStream << "\n";
+      con->checkAdjointConsistencyJacobian(*dup,d,x,true,*outStream);
+      *outStream << "\n";
+      con->checkInverseJacobian_1(*up,*up,*up,*zp,true,*outStream);
+      *outStream << "\n";
+      con->checkInverseAdjointJacobian_1(*up,*up,*up,*zp,true,*outStream);
+
+      *outStream << "\n\nCheck Gradient of Reduced Objective Function\n";
+      robj->checkGradient(*zp,*dzp,true,*outStream);
+      *outStream << "\n\nCheck Hessian of Reduced Objective Function\n";
+      robj->checkHessVec(*zp,*dzp,true,*outStream);
+    }
+
+    // Output uncontrolled state.
+    zp->zero();
+    pdecon->printMeshData(*outStream);
+    con->solve(*rp, *up, *zp, tol);
+    pdecon->outputTpetraVector(u_ptr, "state_uncontrolled.txt");
+
+    bool useFullSpace = parlist->sublist("Problem").get("Full space",false);
+    ROL::Ptr<ROL::Problem<RealT>> optProb;
+    if ( useFullSpace ) {
+      optProb = ROL::makePtr<ROL::Problem<RealT>>(obj, makePtrFromRef(x));
+      optProb->addConstraint("PDE", con, rp);
+    }
+    else {
+      optProb = ROL::makePtr<ROL::Problem<RealT>>(robj,zp);
+    }
+    ROL::Solver<RealT> optSolver(optProb, *parlist);
+    Teuchos::Time algoTimer("Algorithm Time", true);
+    optSolver.solve(*outStream);
+    algoTimer.stop();
+    *outStream << "Total optimization time = " << algoTimer.totalElapsedTime() << " seconds.\n";
+
+    // Output.
+    pdecon->printMeshData(*outStream);
+    con->solve(*rp, *up, *zp, tol);
+    pdecon->outputTpetraVector(u_ptr, "state.txt");
+    pdecon->outputTpetraVector(z_ptr, "control.txt");
+
+    // Get a summary from the time monitor.
+    Teuchos::TimeMonitor::summarize();
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/helmholtz/example_02.cpp b/packages/rol/example/PDE-OPT/helmholtz/example_02.cpp
index 523c3cb3c40d..674c3284dda3 100644
--- a/packages/rol/example/PDE-OPT/helmholtz/example_02.cpp
+++ b/packages/rol/example/PDE-OPT/helmholtz/example_02.cpp
@@ -14,8 +14,7 @@
 #include "Teuchos_Comm.hpp"
 #include "Teuchos_Time.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -44,7 +43,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int> > comm
     = Tpetra::getDefaultComm();
   const int myRank = comm->getRank();
diff --git a/packages/rol/example/PDE-OPT/helmholtz/example_02K.cpp b/packages/rol/example/PDE-OPT/helmholtz/example_02K.cpp
new file mode 100644
index 000000000000..5050ec153dce
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/helmholtz/example_02K.cpp
@@ -0,0 +1,316 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  example_01.cpp
+    \brief Shows how to solve the optimal control of Helmholtz problem.
+*/
+
+#include "Teuchos_Comm.hpp"
+#include "Teuchos_Time.hpp"
+#include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
+
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Version.hpp"
+
+#include <iostream>
+#include <algorithm>
+
+#include "ROL_Solver.hpp"
+#include "ROL_Reduced_Objective_SimOpt.hpp"
+
+#include "../TOOLS/linearpdeconstraintK.hpp"
+#include "../TOOLS/pdeobjectiveK.hpp"
+#include "../TOOLS/pdevectorK.hpp"
+#include "../TOOLS/meshmanagerK.hpp"
+#include "../TOOLS/sysbuilder.hpp"
+
+#include "pde_helmholtzK.hpp"
+#include "obj_helmholtzK.hpp"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+
+int main(int argc, char *argv[]) {
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+
+  /*** Initialize communicator. ***/
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  Kokkos::ScopeGuard kokkosScope(argc, argv);
+  ROL::Ptr<const Teuchos::Comm<int> > comm = Tpetra::getDefaultComm();
+  const int myRank = comm->getRank();
+  if ((iprint > 0) && (myRank == 0)) {
+    outStream = ROL::makePtrFromRef(std::cout);
+  }
+  else {
+    outStream = ROL::makePtrFromRef(bhs);
+  }
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+    RealT tol(1e-8);// one(1);
+
+    /*** Read in XML input ***/
+    std::string filename = "input.xml";
+    auto parlist = Teuchos::rcp( new Teuchos::ParameterList() );
+    Teuchos::updateParametersFromXmlFile( filename, parlist.ptr() );
+
+    /*** Initialize main data structure. ***/
+    auto meshMgr = ROL::makePtr<MeshManager_Rectangle<RealT, DeviceT>>(*parlist);
+    // Initialize PDE describing elasticity equations.
+    auto pde = ROL::makePtr<PDE_Helmholtz<RealT, DeviceT>>(*parlist);
+    ROL::Ptr<ROL::Constraint_SimOpt<RealT>> con
+      = ROL::makePtr<Linear_PDE_Constraint<RealT, DeviceT>>(pde, meshMgr, comm, *parlist, *outStream);
+    // Cast the constraint and get the assembler.
+    ROL::Ptr<Linear_PDE_Constraint<RealT, DeviceT>> pdecon
+      = ROL::dynamicPtrCast<Linear_PDE_Constraint<RealT, DeviceT>>(con);
+    ROL::Ptr<Assembler<RealT, DeviceT>> assembler = pdecon->getAssembler();
+    con->setSolveParameters(*parlist);
+
+    // Create state vector.
+    auto u_ptr = assembler->createStateVector();
+    u_ptr->randomize();
+    auto up = ROL::makePtr<PDE_PrimalSimVector<RealT, DeviceT>>(u_ptr,pde,assembler,*parlist);
+    auto p_ptr = assembler->createStateVector();
+    p_ptr->randomize();
+    auto pp = ROL::makePtr<PDE_PrimalSimVector<RealT, DeviceT>>(p_ptr,pde,assembler,*parlist);
+    // Create control vector.
+    auto z_ptr = assembler->createControlVector();
+    z_ptr->randomize();
+    auto zp = ROL::makePtr<PDE_PrimalOptVector<RealT, DeviceT>>(z_ptr,pde,assembler,*parlist);
+    // Create residual vector.
+    auto r_ptr = assembler->createResidualVector();
+    r_ptr->putScalar(0.0);
+    auto rp = ROL::makePtr<PDE_DualSimVector<RealT, DeviceT>>(r_ptr,pde,assembler,*parlist);
+    // Create state direction vector.
+    auto du_ptr = assembler->createStateVector();
+    du_ptr->randomize();
+    //du_ptr->putScalar(0);
+    auto dup = ROL::makePtr<PDE_PrimalSimVector<RealT, DeviceT>>(du_ptr,pde,assembler,*parlist);
+    // Create control direction vector.
+    auto dz_ptr = assembler->createControlVector();
+    dz_ptr->randomize();
+    //dz_ptr->putScalar(0);
+    auto dzp = ROL::makePtr<PDE_PrimalOptVector<RealT, DeviceT>>(dz_ptr,pde,assembler,*parlist);
+    // Create control test vector.
+    auto rz_ptr = assembler->createControlVector();
+    rz_ptr->randomize();
+    auto rzp = ROL::makePtr<PDE_PrimalOptVector<RealT, DeviceT>>(rz_ptr,pde,assembler,*parlist);
+
+    auto dualu_ptr = assembler->createStateVector();
+    auto dualup = ROL::makePtr<PDE_DualSimVector<RealT, DeviceT>>(dualu_ptr,pde,assembler,*parlist);
+    auto dualz_ptr = assembler->createControlVector();
+    auto dualzp = ROL::makePtr<PDE_DualOptVector<RealT, DeviceT>>(dualz_ptr,pde,assembler,*parlist);
+
+    // Create ROL SimOpt vectors.
+    ROL::Vector_SimOpt<RealT> x(up,zp);
+    ROL::Vector_SimOpt<RealT> d(dup,dzp);
+
+    // Initialize compliance objective function.
+    bool storage = parlist->sublist("Problem").get("Use Storage",true);
+    std::vector<ROL::Ptr<QoI<RealT, DeviceT>>> qoi_vec(2, ROL::nullPtr);
+    qoi_vec[0] = ROL::makePtr<QoI_Helmholtz_StateTracking<RealT, DeviceT>>(pde->getFE(),
+                                                                           pde->getFieldHelper(),
+                                                                           *parlist);
+    qoi_vec[1] = ROL::makePtr<QoI_Helmholtz_ControlPenalty<RealT, DeviceT>>(pde->getFE(),
+                                                                            pde->getFieldHelper(),
+                                                                            *parlist);
+    auto obj = ROL::makePtr<PDE_Objective<RealT, DeviceT>>(qoi_vec, assembler);
+    auto robj = ROL::makePtr<ROL::Reduced_Objective_SimOpt<RealT>>(obj, con, up, zp, pp, storage, false);
+
+    // Run derivative checks
+    bool checkDeriv = parlist->sublist("Problem").get("Check derivatives",false);
+    if ( checkDeriv ) {
+      *outStream << "\n\nCheck Opt Vector\n";
+      zp->checkVector(*dzp, *rzp, true, *outStream);
+
+      std::vector<ROL::Ptr<ROL::Objective_SimOpt<RealT> > > obj_vec(2,ROL::nullPtr);
+      obj_vec[0] = ROL::makePtr<IntegralObjective<RealT, DeviceT>>(qoi_vec[0], assembler);
+      obj_vec[1] = ROL::makePtr<IntegralObjective<RealT, DeviceT>>(qoi_vec[1], assembler);
+
+      *outStream << "\n\nCheck Gradient of State Objective Function\n";
+      obj_vec[0]->checkGradient(x,d,true,*outStream);
+      *outStream << "\n\nCheck Gradient_1 of State Objective Function\n";
+      obj_vec[0]->checkGradient_1(*up,*zp,*dup,true,*outStream);
+      *outStream << "\n\nCheck Gradient_2 of State Objective Function\n";
+      obj_vec[0]->checkGradient_2(*up,*zp,*dzp,true,*outStream);
+      *outStream << "\n\nCheck Hessian of State Objective Function\n";
+      obj_vec[0]->checkHessVec(x,d,true,*outStream);
+      *outStream << "\n\nCheck Hessian_11 of State Objective Function\n";
+      obj_vec[0]->checkHessVec_11(*up,*zp,*dup,true,*outStream);
+      *outStream << "\n\nCheck Hessian_12 of State Objective Function\n";
+      obj_vec[0]->checkHessVec_12(*up,*zp,*dzp,true,*outStream);
+      *outStream << "\n\nCheck Hessian_21 of State Objective Function\n";
+      obj_vec[0]->checkHessVec_21(*up,*zp,*dup,true,*outStream);
+      *outStream << "\n\nCheck Hessian_22 of State Objective Function\n";
+      obj_vec[0]->checkHessVec_22(*up,*zp,*dzp,true,*outStream);
+
+      *outStream << "\n\nCheck Gradient of Control Objective Function\n";
+      obj_vec[1]->checkGradient(x,d,true,*outStream);
+      *outStream << "\n\nCheck Gradient_1 of Control Objective Function\n";
+      obj_vec[1]->checkGradient_1(*up,*zp,*dup,true,*outStream);
+      *outStream << "\n\nCheck Gradient_2 of Control Objective Function\n";
+      obj_vec[1]->checkGradient_2(*up,*zp,*dzp,true,*outStream);
+      *outStream << "\n\nCheck Hessian of Control Objective Function\n";
+      obj_vec[1]->checkHessVec(x,d,true,*outStream);
+      *outStream << "\n\nCheck Hessian_11 of Control Objective Function\n";
+      obj_vec[1]->checkHessVec_11(*up,*zp,*dup,true,*outStream);
+      *outStream << "\n\nCheck Hessian_12 of State Objective Function\n";
+      obj_vec[1]->checkHessVec_12(*up,*zp,*dzp,true,*outStream);
+      *outStream << "\n\nCheck Hessian_21 of State Objective Function\n";
+      obj_vec[1]->checkHessVec_21(*up,*zp,*dup,true,*outStream);
+      *outStream << "\n\nCheck Hessian_22 of Control Objective Function\n";
+      obj_vec[1]->checkHessVec_22(*up,*zp,*dzp,true,*outStream);
+
+      *outStream << "\n\nCheck Gradient of Full Objective Function\n";
+      obj->checkGradient(x,d,true,*outStream);
+      *outStream << "\n\nCheck Gradient_1 of Full Objective Function\n";
+      obj->checkGradient_1(*up,*zp,*dup,true,*outStream);
+      *outStream << "\n\nCheck Gradient_2 of Full Objective Function\n";
+      obj->checkGradient_2(*up,*zp,*dzp,true,*outStream);
+      *outStream << "\n\nCheck Hessian of Full Objective Function\n";
+      obj->checkHessVec(x,d,true,*outStream);
+      *outStream << "\n\nCheck Hessian_11 of Full Objective Function\n";
+      obj->checkHessVec_11(*up,*zp,*dup,true,*outStream);
+      *outStream << "\n\nCheck Hessian_12 of State Objective Function\n";
+      obj->checkHessVec_12(*up,*zp,*dzp,true,*outStream);
+      *outStream << "\n\nCheck Hessian_21 of State Objective Function\n";
+      obj->checkHessVec_21(*up,*zp,*dup,true,*outStream);
+      *outStream << "\n\nCheck Hessian_22 of Full Objective Function\n";
+      obj->checkHessVec_22(*up,*zp,*dzp,true,*outStream);
+
+      *outStream << "\n\nCheck Full Jacobian of PDE Constraint\n";
+      con->checkApplyJacobian(x,d,*rp,true,*outStream);
+      *outStream << "\n\nCheck Jacobian_1 of PDE Constraint\n";
+      con->checkApplyJacobian_1(*up,*zp,*dup,*rp,true,*outStream);
+      *outStream << "\n\nCheck Jacobian_2 of PDE Constraint\n";
+      con->checkApplyJacobian_2(*up,*zp,*dzp,*rp,true,*outStream);
+      *outStream << "\n\nCheck Full Hessian of PDE Constraint\n";
+      con->checkApplyAdjointHessian(x,*pp,d,x,true,*outStream);
+      *outStream << "\n\nCheck Hessian_11 of PDE Constraint\n";
+      con->checkApplyAdjointHessian_11(*up,*zp,*pp,*dup,*dualup,true,*outStream);
+      *outStream << "\n\nCheck Hessian_21 of PDE Constraint\n";
+      con->checkApplyAdjointHessian_21(*up,*zp,*pp,*dzp,*dualup,true,*outStream);
+      *outStream << "\n\nCheck Hessian_12 of PDE Constraint\n";
+      con->checkApplyAdjointHessian_12(*up,*zp,*pp,*dup,*dualzp,true,*outStream);
+      *outStream << "\n\nCheck Hessian_22 of PDE Constraint\n";
+      con->checkApplyAdjointHessian_22(*up,*zp,*pp,*dzp,*dualzp,true,*outStream);
+      *outStream << "\n";
+      con->checkAdjointConsistencyJacobian(*dup,d,x,true,*outStream);
+      *outStream << "\n";
+      con->checkInverseJacobian_1(*up,*up,*up,*zp,true,*outStream);
+      *outStream << "\n";
+      con->checkInverseAdjointJacobian_1(*up,*up,*up,*zp,true,*outStream);
+
+      *outStream << "\n\nCheck Gradient of Reduced Objective Function\n";
+      robj->checkGradient(*zp,*dzp,true,*outStream);
+      *outStream << "\n\nCheck Hessian of Reduced Objective Function\n";
+      robj->checkHessVec(*zp,*dzp,true,*outStream);
+    }
+
+    // Build KKT system.
+    ROL::Ptr<Tpetra::CrsMatrix<>> A;
+    ROL::Ptr<Tpetra::CrsMatrix<>> B;
+    ROL::Ptr<Tpetra::CrsMatrix<>> M;
+    ROL::Ptr<Tpetra::CrsMatrix<>> R;
+    ROL::Ptr<Tpetra::MultiVector<>> simres;
+    assembler->assemblePDEJacobian1(A, pde, u_ptr);
+    assembler->assemblePDEJacobian2(B, pde, u_ptr);
+    assembler->assemblePDERieszMap1(M, pde);
+    assembler->assemblePDERieszMap2(R, pde);
+    assembler->assemblePDEResidual(simres, pde, u_ptr, z_ptr);
+    SysBuilder<RealT> kktbuilder;
+    ROL::Ptr<Tpetra::CrsMatrix<>>   KKT;
+    ROL::Ptr<Tpetra::MultiVector<>> BKKT;
+    ROL::Ptr<Tpetra::MultiVector<>> XKKT;
+    Teuchos::Time kktTimer("KKT Time", true);
+    kktbuilder.buildMatrix(KKT, A, B, M, ROL::nullPtr, ROL::nullPtr, R);
+    kktTimer.stop();
+    *outStream << "KKT matrix assembly time = " << kktTimer.totalElapsedTime() << " seconds.\n";
+    kktTimer.reset(); kktTimer.start();
+    kktbuilder.buildVector(BKKT, simres, simres, simres);
+    kktTimer.stop();
+    *outStream << "KKT RHS assembly time = " << kktTimer.totalElapsedTime() << " seconds.\n";
+    kktTimer.reset(); kktTimer.start();
+    kktbuilder.buildVector(XKKT, simres, simres, simres);
+    kktTimer.stop();
+    *outStream << "KKT XVEC assembly time = " << kktTimer.totalElapsedTime() << " seconds.\n";
+    //Tpetra::MatrixMarket::Writer< Tpetra::CrsMatrix<> > matWriter;
+    //if (KKT != ROL::nullPtr) {
+    //  matWriter.writeSparseFile("kkt.txt", KKT);
+    //}
+    Solver<RealT> kktsolver(parlist->sublist("Solver"));
+    kktTimer.reset(); kktTimer.start();
+    kktsolver.setA(KKT);
+    kktTimer.stop();
+    *outStream << "KKT factorization time = " << kktTimer.totalElapsedTime() << " seconds.\n";
+    kktTimer.reset(); kktTimer.start();
+    kktsolver.solve(XKKT, BKKT);
+    kktTimer.stop();
+    *outStream << "KKT solution time = " << kktTimer.totalElapsedTime() << " seconds.\n";
+
+    // Schur matrix.
+    Teuchos::Time schurTimer("Schur Time", true);
+    ROL::Ptr<Tpetra::CrsMatrix<>> AAT =
+      ROL::makePtr<Tpetra::CrsMatrix<>>(A->getRowMap(), 2*A->getGlobalMaxNumRowEntries());
+    ROL::Ptr<Tpetra::CrsMatrix<>> BBT =
+      ROL::makePtr<Tpetra::CrsMatrix<>>(B->getRowMap(), 2*B->getGlobalMaxNumRowEntries());
+    ROL::Ptr<Tpetra::CrsMatrix<>> C;
+    Tpetra::MatrixMatrix::Multiply(*A, false, *A, true, *AAT);
+    Tpetra::MatrixMatrix::Multiply(*B, false, *B, true, *BBT);
+    C = Tpetra::MatrixMatrix::add(1.0, false, *AAT, 1.0, false, *BBT);
+    ROL::Ptr<Tpetra::MultiVector<>> Xschur;
+    ROL::Ptr<Tpetra::MultiVector<>> Bschur;
+    assembler->assemblePDEResidual(Xschur, pde, u_ptr, z_ptr);
+    assembler->assemblePDEResidual(Bschur, pde, u_ptr, z_ptr);
+    schurTimer.stop();
+    *outStream << "Schur complement assembly time = " << schurTimer.totalElapsedTime() << " seconds.\n";
+    Solver<RealT> schursolver(parlist->sublist("Solver"));
+    schurTimer.reset(); schurTimer.start();
+    schursolver.setA(C);
+    schurTimer.stop();
+    *outStream << "Schur complement factorization time = " << schurTimer.totalElapsedTime() << " seconds.\n";
+    schurTimer.reset(); schurTimer.start();
+    schursolver.solve(Xschur, Bschur);
+    schurTimer.stop();
+    *outStream << "Schur solution time = " << schurTimer.totalElapsedTime() << " seconds.\n";
+
+
+    auto prob = ROL::makePtr<ROL::Problem<RealT>>(robj, zp);
+    ROL::Solver<RealT> solver(prob, *parlist);
+    Teuchos::Time algoTimer("Algorithm Time", true);
+    solver.solve(*outStream);
+    algoTimer.stop();
+    *outStream << "Total optimization time = " << algoTimer.totalElapsedTime() << " seconds.\n";
+
+    // Output.
+    pdecon->printMeshData(*outStream);
+    con->solve(*rp, *up, *zp, tol);
+    pdecon->outputTpetraVector(u_ptr, "state.txt");
+    pdecon->outputTpetraVector(z_ptr, "control.txt");
+
+    // Get a summary from the time monitor.
+    Teuchos::TimeMonitor::summarize();
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/helmholtz/obj_helmholtzK.hpp b/packages/rol/example/PDE-OPT/helmholtz/obj_helmholtzK.hpp
new file mode 100644
index 000000000000..34c0a36243e0
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/helmholtz/obj_helmholtzK.hpp
@@ -0,0 +1,422 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  obj.hpp
+    \brief Provides the interface for local (cell-based) objective function computations.
+*/
+
+#ifndef PDEOPT_QOI_HELMHOLTZK_HPP
+#define PDEOPT_QOI_HELMHOLTZK_HPP
+
+#include "../TOOLS/qoiK.hpp"
+#include "pde_helmholtzK.hpp"
+
+template <class Real, class DeviceType>
+class QoI_Helmholtz_StateTracking : public QoI<Real, DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using fe_type = FE<Real, DeviceType>;
+  using fst = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using rst = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type> fe_;
+  const ROL::Ptr<FieldHelper<Real, DeviceType>> fieldHelper_;
+
+  scalar_view weight_;
+  std::vector<scalar_view> target_;
+
+  Real RoiRadius_;
+  Real waveNumber_;
+  Real angle_;
+
+protected:
+  void computeDomainWeight(void) {
+    int c = fe_->gradN().extent_int(0);
+    int p = fe_->gradN().extent_int(2);
+    int d = fe_->gradN().extent_int(3);
+
+    weight_ = scalar_view("weight_", c, p);
+
+    const Real zero(0), one(1);
+    bool inside(false);
+    std::vector<Real> x(d);
+    for (int i = 0; i < c; ++i) {
+      inside = false;
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k) {
+          x[k] = fe_->cubPts()(i, j, k);
+        }
+        if ( insideDomain(x) ) {
+          inside = true;
+          break;
+        }
+      }
+      for (int j = 0; j < p; ++j) {
+        weight_(i, j) = (inside ? one : zero);
+      }
+    }
+  }
+
+  void computeTarget(void) {
+    int c = fe_->gradN().extent_int(0);
+    int p = fe_->gradN().extent_int(2);
+    int d = fe_->gradN().extent_int(3);
+
+    target_.clear();
+    target_.resize(2);
+    target_[0] = scalar_view("target_[0]", c, p);
+    target_[1] = scalar_view("target_[1]", c, p);
+
+    std::vector<Real> x(d);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k) {
+          x[k] = fe_->cubPts()(i, j, k);
+        }
+        (target_[0])(i, j) = evaluateTarget(x, 0);
+        (target_[1])(i, j) = evaluateTarget(x, 1);
+      }
+    }
+  }
+
+  Real DegreesToRadians(const Real deg) const {
+    return deg * static_cast<Real>(M_PI) / static_cast<Real>(180);
+  }
+
+public:
+  QoI_Helmholtz_StateTracking(const ROL::Ptr<fe_type> &fe,
+                              const ROL::Ptr<FieldHelper<Real, DeviceType>> &fieldHelper,
+                              Teuchos::ParameterList &parlist)
+    : fe_(fe), fieldHelper_(fieldHelper) {
+    RoiRadius_  = parlist.sublist("Problem").get("ROI Radius", 2.0);
+    waveNumber_ = parlist.sublist("Problem").get("Wave Number", 10.0);
+    angle_      = parlist.sublist("Problem").get("Target Angle", 45.0);
+    angle_      = DegreesToRadians(angle_);
+    computeDomainWeight();
+    computeTarget();
+  }
+
+  virtual bool insideDomain(const std::vector<Real> &x) const {
+    Real xnorm(0);
+    const int d = x.size();
+    for (int i = 0; i < d; ++i) {
+      xnorm += x[i]*x[i];
+    }
+    xnorm = std::sqrt(xnorm);
+    return (xnorm <= RoiRadius_);
+  }
+
+  virtual Real evaluateTarget(const std::vector<Real> &x, const int component) const {
+    const Real arg = waveNumber_ * (std::cos(angle_)*x[0] + std::sin(angle_)*x[1]);
+    return (component==0) ? std::cos(arg) : std::sin(arg);
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = fe_->gradN().extent_int(0);
+    int p = fe_->gradN().extent_int(2);
+    // Initialize output val
+    val = scalar_view("val", c);
+    // Get components of the control
+    std::vector<scalar_view> U;
+    fieldHelper_->splitFieldCoeff(U, u_coeff);
+    // Evaluate tracking term
+    scalar_view diffU("diffU", c, p);
+    scalar_view WdiffU("WdiffU", c, p);
+    for (int i=0; i<2; ++i) {
+      Kokkos::deep_copy(diffU, static_cast<Real>(0));
+      Kokkos::deep_copy(WdiffU, static_cast<Real>(0));
+      fe_->evaluateValue(diffU, U[i]);
+      rst::subtract(diffU, target_[i]);
+      fst::scalarMultiplyDataData(WdiffU, weight_, diffU);
+      fe_->computeIntegral(val, WdiffU, diffU, true);
+    }
+    rst::scale(val,static_cast<Real>(0.5));
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    int p = fe_->gradN().extent_int(2);
+    // Initialize output grad
+    std::vector<scalar_view> G(2);
+    // Get components of the control
+    std::vector<scalar_view> U;
+    fieldHelper_->splitFieldCoeff(U, u_coeff);
+    // Evaluate tracking term
+    scalar_view diffU("diffU", c, p);
+    scalar_view WdiffU("WdiffU", c, p);
+    for (int i=0; i<2; ++i) {
+      Kokkos::deep_copy(diffU, static_cast<Real>(0));
+      Kokkos::deep_copy(WdiffU, static_cast<Real>(0));
+      fe_->evaluateValue(diffU, U[i]);
+      rst::subtract(diffU, target_[i]);
+      fst::scalarMultiplyDataData(WdiffU, weight_, diffU);
+      G[i] = scalar_view("G", c, f);
+      fst::integrate(G[i], WdiffU, fe_->NdetJ(), false);
+    }
+    fieldHelper_->combineFieldCoeff(grad, G);
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Helmholtz_StateTracking::gradient_2 is zero.");
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    int p = fe_->gradN().extent_int(2);
+    // Initialize output hessvec
+    std::vector<scalar_view> H(2);
+    // Get components of the control
+    std::vector<scalar_view> V;
+    fieldHelper_->splitFieldCoeff(V, v_coeff);
+    // Evaluate tracking term
+    scalar_view valV("valV", c, p);
+    scalar_view WvalV("WvalV", c, p);
+    for (int i=0; i<2; ++i) {
+      Kokkos::deep_copy(valV, static_cast<Real>(0));
+      Kokkos::deep_copy(WvalV, static_cast<Real>(0));
+      fe_->evaluateValue(valV, V[i]);
+      fst::scalarMultiplyDataData(WvalV, weight_, valV);
+      H[i] = scalar_view("H", c, f);
+      fst::integrate(H[i], WvalV, fe_->NdetJ(), false);
+    }
+    fieldHelper_->combineFieldCoeff(hess, H);
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Helmholtz::HessVec_12 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Helmholtz::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Helmholtz::HessVec_22 is zero.");
+  }
+
+}; // QoI_Helmholtz
+
+
+template <class Real, class DeviceType>
+class QoI_Helmholtz_ControlPenalty : public QoI<Real, DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using fe_type = FE<Real, DeviceType>;
+  using fst = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using rst = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type> fe_;
+  const ROL::Ptr<FieldHelper<Real, DeviceType>> fieldHelper_;
+
+  Real innerAnnulusRadius_;
+  Real outerAnnulusRadius_;
+  Real RoiRadius_;
+  Real alpha_;
+
+  scalar_view weight_;
+
+  void computeDomainWeight(void) {
+    int c = fe_->gradN().extent_int(0);
+    int p = fe_->gradN().extent_int(2);
+    int d = fe_->gradN().extent_int(3);
+
+    weight_ = scalar_view("weight_", c, p);
+
+    const Real zero(0);
+    bool inside(false);
+    std::vector<Real> x(d);
+    for (int i = 0; i < c; ++i) {
+      inside = false;
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k) {
+          x[k] = fe_->cubPts()(i, j, k);
+        }
+        if ( insideDomain(x) ) {
+          inside = true;
+          break;
+        }
+      }
+      for (int j = 0; j < p; ++j) {
+        weight_(i,j) = (inside ? alpha_ : zero);
+      }
+    }
+  }
+
+public:
+  QoI_Helmholtz_ControlPenalty(const ROL::Ptr<fe_type> &fe,
+                               const ROL::Ptr<FieldHelper<Real, DeviceType>> &fieldHelper,
+                               Teuchos::ParameterList &parlist)
+  : fe_(fe), fieldHelper_(fieldHelper) {
+    Real dist2annulus   = parlist.sublist("Problem").get("Distance to Control Annulus",0.5);
+    Real annulusWidth   = parlist.sublist("Problem").get("Control Annulus Width",0.1);
+    RoiRadius_          = parlist.sublist("Problem").get("ROI Radius",2.0);
+    innerAnnulusRadius_ = RoiRadius_ + dist2annulus;
+    outerAnnulusRadius_ = innerAnnulusRadius_ + annulusWidth;
+    alpha_              = parlist.sublist("Problem").get("Control Penalty",1e-4);
+    computeDomainWeight();
+  }
+
+  virtual bool insideDomain(const std::vector<Real> &x) const {
+    Real xnorm(0);
+    const int d = x.size();
+    for (int i = 0; i < d; ++i) {
+      xnorm += x[i]*x[i];
+    }
+    xnorm = std::sqrt(xnorm);
+    return (xnorm <= outerAnnulusRadius_ && xnorm >= innerAnnulusRadius_);
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = fe_->gradN().extent_int(0);
+    int p = fe_->gradN().extent_int(2);
+    // Initialize output val
+    val = scalar_view("val", c);
+    // Get components of the control
+    std::vector<scalar_view> Z;
+    fieldHelper_->splitFieldCoeff(Z, z_coeff);
+    // Evaluate control penalty
+    scalar_view valZ("valZ", c, p);
+    scalar_view WvalZ("WvalZ", c, p);
+    for (int i=0; i<2; ++i) {
+      Kokkos::deep_copy(valZ, static_cast<Real>(0));
+      Kokkos::deep_copy(WvalZ, static_cast<Real>(0));
+      fe_->evaluateValue(valZ, Z[i]);
+      fst::scalarMultiplyDataData(WvalZ, weight_, valZ);
+      fe_->computeIntegral(val, WvalZ, valZ, true);
+    }
+    rst::scale(val, static_cast<Real>(0.5));
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Helmholtz_ControlCost::gradient_1 is zero.");
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    int p = fe_->gradN().extent_int(2);
+    // Initialize output grad
+    std::vector<scalar_view> G(2);
+    // Get components of the control
+    std::vector<scalar_view> Z;
+    fieldHelper_->splitFieldCoeff(Z, z_coeff);
+    // Evaluate control penalty
+    scalar_view valZ("valZ", c, p);
+    scalar_view WvalZ("WvalZ", c, p);
+    for (int i=0; i<2; ++i) {
+      Kokkos::deep_copy(valZ, static_cast<Real>(0));
+      Kokkos::deep_copy(WvalZ, static_cast<Real>(0));
+      fe_->evaluateValue(valZ, Z[i]);
+      fst::scalarMultiplyDataData(WvalZ, weight_, valZ);
+      G[i] = scalar_view("G", c, f);
+      fst::integrate(G[i], WvalZ, fe_->NdetJ(), false);
+    }
+    fieldHelper_->combineFieldCoeff(grad, G);
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Helmholtz_ControlCost::HessVec_11 is zero.");
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Helmholtz_ControlCost::HessVec_12 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Helmholtz_ControlCost::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    int p = fe_->gradN().extent_int(2);
+    // Initialize output hessvec
+    std::vector<scalar_view> H(2);
+    // Get components of the control
+    std::vector<scalar_view> V;
+    fieldHelper_->splitFieldCoeff(V, v_coeff);
+    // Evaluate control penalty
+    scalar_view valV("valV", c, p);
+    scalar_view WvalV("WvalV", c, p);
+    for (int i=0; i<2; ++i) {
+      Kokkos::deep_copy(valV, static_cast<Real>(0));
+      Kokkos::deep_copy(WvalV, static_cast<Real>(0));
+      fe_->evaluateValue(valV, V[i]);
+      fst::scalarMultiplyDataData(WvalV, weight_, valV);
+      H[i] = scalar_view("H", c, f);
+      fst::integrate(H[i], WvalV, fe_->NdetJ(), false);
+    }
+    fieldHelper_->combineFieldCoeff(hess, H);
+  }
+
+}; // QoI_Helmholtz_ControlPenalty
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/helmholtz/pde_helmholtzK.hpp b/packages/rol/example/PDE-OPT/helmholtz/pde_helmholtzK.hpp
new file mode 100644
index 000000000000..7723744cb6cc
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/helmholtz/pde_helmholtzK.hpp
@@ -0,0 +1,591 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  pde_helmholtz.hpp
+    \brief Implements the local PDE interface for the optimal control of
+           Helmholtz.
+*/
+
+#ifndef PDE_HELMHOLTZK_HPP
+#define PDE_HELMHOLTZK_HPP
+
+#include "../TOOLS/pdeK.hpp"
+#include "../TOOLS/feK.hpp"
+#include "../TOOLS/fieldhelperK.hpp"
+
+#include "Intrepid2_HGRAD_QUAD_C1_FEM.hpp"
+#include "Intrepid2_DefaultCubatureFactory.hpp"
+#include "Intrepid2_FunctionSpaceTools.hpp"
+#include "Intrepid2_CellTools.hpp"
+
+#include "ROL_Ptr.hpp"
+
+
+template <class Real, class DeviceType>
+class PDE_Helmholtz : public PDE<Real, DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type = FE<Real, DeviceType>;
+  using fst = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct = Intrepid2::CellTools<DeviceType>;
+  using rst = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  // Finite element basis information
+  basis_ptr basisPtr_;
+  std::vector<basis_ptr> basisPtrs_;
+  // Cell cubature information
+  ROL::Ptr<Intrepid2::Cubature<DeviceType, Real, Real>> cellCub_;
+  ROL::Ptr<Intrepid2::Cubature<DeviceType, Real, Real>> bdryCub_;
+  // Cell node information
+  scalar_view volCellNodes_;
+  std::vector<std::vector<scalar_view>> bdryCellNodes_;
+  std::vector<std::vector<std::vector<int>>> bdryCellLocIds_;
+  // Finite element definition
+  ROL::Ptr<fe_type> fe_;
+  std::vector<ROL::Ptr<fe_type>> feBdry_;
+  // Local degrees of freedom on boundary, for each side of the reference cell (first index).
+  std::vector<std::vector<int>> fidx_;
+  // Coordinates of degrees freedom on boundary cells.
+  // Indexing:  [sideset number][local side id](cell number, value at dof)
+  std::vector<std::vector<scalar_view>> bdryCellDofValues_;
+  // Field pattern, offsets, etc.
+  std::vector<std::vector<int>> fieldPattern_;   // local Field/DOF pattern; set from DOF manager 
+  int numFields_;                                // number of fields (equations in the PDE)
+  int numDofs_;                                  // total number of degrees of freedom for all (local) fields
+  std::vector<int> offset_;                      // for each field, a counting offset
+  std::vector<int> numFieldDofs_;                // for each field, number of degrees of freedom
+
+  Real waveNumber_;
+  Real RoiWaveNumber_;
+  Real innerAnnulusRadius_;
+  Real outerAnnulusRadius_;
+  Real RoiRadius_;
+
+  scalar_view ctrlWeight_;
+  scalar_view ctrlJac_;
+
+  ROL::Ptr<FieldHelper<Real, DeviceType>> fieldHelper_;
+
+  void computeRefractiveIndex(scalar_view & kappa) const {
+    int c = fe_->gradN().extent_int(0);
+    int p = fe_->gradN().extent_int(2);
+    int d = fe_->gradN().extent_int(3);
+
+    std::vector<Real> x(d);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k) {
+          x[k] = fe_->cubPts()(i, j, k);
+        }
+        kappa(i, j) = std::pow(evaluateRefractiveIndex(x), 2);
+      }
+    }
+  }
+
+  void computeForce(scalar_view & F, const int component) const {
+    int c = fe_->gradN().extent_int(0);
+    int p = fe_->gradN().extent_int(2);
+    int d = fe_->gradN().extent_int(3);
+
+    std::vector<Real> x(d);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k) {
+          x[k] = fe_->cubPts()(i, j, k);
+        }
+        F(i,j) = evaluateForce(x, component);
+      }
+    }
+  }
+
+  void computeControlWeight(void) {
+    int c = fe_->gradN().extent_int(0);
+    int p = fe_->gradN().extent_int(2);
+    int d = fe_->gradN().extent_int(3);
+
+    ctrlWeight_ = scalar_view("ctrlWeight_", c, p);
+
+    const Real zero(0), one(1);
+    bool inside(false);
+    std::vector<Real> x(d);
+    for (int i = 0; i < c; ++i) {
+      inside = false;
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k) {
+          x[k] = fe_->cubPts()(i, j, k);
+        }
+        if ( insideControlDomain(x) ) {
+          inside = true;
+          break;
+        }
+      }
+      for (int j = 0; j < p; ++j) {
+        ctrlWeight_(i,j) = (inside ? one : zero);
+      }
+    }
+  }
+
+  scalar_view getBoundaryCoeff(const scalar_view cell_coeff, int sideSet, int cell) const {
+    std::vector<int> bdryCellLocId = bdryCellLocIds_[sideSet][cell];
+    const int numCellsSide = bdryCellLocId.size();
+    const int f = basisPtr_->getCardinality();
+
+    scalar_view bdry_coeff("bdry_coeff", numCellsSide, f);
+    for (int i = 0; i < numCellsSide; ++i) {
+      for (int j = 0; j < f; ++j) {
+        bdry_coeff(i, j) = cell_coeff(bdryCellLocId[i], j);
+      }
+    }
+    return bdry_coeff;
+  }
+
+  void buildControlJacobian(void) {
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    int p = fe_->gradN().extent_int(2);
+
+    // Build force/control term
+    scalar_view F("F", c, f, p);
+    fst::scalarMultiplyDataField(F, ctrlWeight_, fe_->N());
+    ctrlJac_ = scalar_view("ctrlJac_", c, f, f);
+    fst::integrate(ctrlJac_, F, fe_->NdetJ(), false);
+  }
+
+public:
+  PDE_Helmholtz(Teuchos::ParameterList &parlist) {
+    // Finite element fields.
+    basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C1_FEM<DeviceType, Real, Real>>();
+    // Quadrature rules.
+    shards::CellTopology cellType = basisPtr_->getBaseCellTopology();            // get the cell type from the basis
+    Intrepid2::DefaultCubatureFactory cubFactory;                          // create cubature factory
+    int cubDegree = parlist.sublist("Problem").get("Cubature Degree", 2);        // set cubature degree, e.g., 2
+    cellCub_ = cubFactory.create<DeviceType, Real, Real>(cellType, cubDegree);                           // create default cubature
+    int d = cellType.getDimension();
+
+    basisPtrs_.clear();
+    for (int i=0; i<2; ++i) {
+      basisPtrs_.push_back(basisPtr_);  // Displacement component
+    }
+
+    shards::CellTopology bdryCellType = cellType.getCellTopologyData(d-1, 0);
+    int bdryCubDegree = parlist.sublist("Problem").get("Boundary Cubature Degree",2); // set cubature degree, e.g., 2
+    bdryCub_ = cubFactory.create<DeviceType, Real, Real>(bdryCellType, bdryCubDegree);
+
+    numDofs_ = 0;
+    numFields_ = basisPtrs_.size();
+    offset_.resize(numFields_);
+    numFieldDofs_.resize(numFields_);
+    for (int i=0; i<numFields_; ++i) {
+      if (i==0) {
+        offset_[i]  = 0;
+      }
+      else {
+        offset_[i]  = offset_[i-1] + basisPtrs_[i-1]->getCardinality();
+      }
+      numFieldDofs_[i] = basisPtrs_[i]->getCardinality();
+      numDofs_ += numFieldDofs_[i];
+    }
+
+    waveNumber_         = parlist.sublist("Problem").get("Wave Number",10.0);
+    RoiWaveNumber_      = parlist.sublist("Problem").get("ROI Wave Number",10.0);
+
+    Real dist2annulus   = parlist.sublist("Problem").get("Distance to Control Annulus",0.5);
+    Real annulusWidth   = parlist.sublist("Problem").get("Control Annulus Width",0.1);
+    RoiRadius_          = parlist.sublist("Problem").get("ROI Radius",2.0);
+    innerAnnulusRadius_ = RoiRadius_ + dist2annulus;
+    outerAnnulusRadius_ = innerAnnulusRadius_ + annulusWidth;
+  }
+
+  virtual Real evaluateRefractiveIndex(const std::vector<Real> &x) const {
+    Real xnorm(0), val(0);
+    const int d = x.size();
+    for (int i = 0; i < d; ++i) {
+      xnorm += x[i]*x[i];
+    }
+    xnorm = std::sqrt(xnorm);
+    val   = (xnorm <= RoiRadius_) ? RoiWaveNumber_ : waveNumber_;
+    return val;
+  }
+
+  virtual bool insideControlDomain(const std::vector<Real> &x) const {
+    Real xnorm(0);
+    const int d = x.size();
+    for (int i = 0; i < d; ++i) {
+      xnorm += x[i]*x[i];
+    }
+    xnorm = std::sqrt(xnorm);
+    return (xnorm <= outerAnnulusRadius_ && xnorm >= innerAnnulusRadius_);
+  }
+
+  virtual Real evaluateForce(const std::vector<Real> &x, const int component) const {
+    Real val(0);
+    return val;
+  }
+
+  void residual(scalar_view & res,
+                const scalar_view u_coeff,
+                const scalar_view z_coeff = scalar_view(),
+                const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    int p = fe_->gradN().extent_int(2);
+    int d = fe_->gradN().extent_int(3);
+
+    // Initialize residuals.
+    std::vector<scalar_view> R(2);
+    for (int i=0; i<2; ++i) {
+      R[i] = scalar_view("R", c, f);
+    }
+
+    // Split u_coeff into components.
+    std::vector<scalar_view> U;
+    std::vector<scalar_view> Z;
+    fieldHelper_->splitFieldCoeff(U, u_coeff);
+    fieldHelper_->splitFieldCoeff(Z, z_coeff);
+
+    // Evaluate/interpolate finite element fields on cells.
+    std::vector<scalar_view> valZ_eval(2);
+    std::vector<scalar_view> valU_eval(2);
+    std::vector<scalar_view> gradU_eval(2);
+    for (int i=0; i<2; ++i) {
+      valZ_eval[i]  = scalar_view("valZ_eval", c, p);
+      fe_->evaluateValue(valZ_eval[i], Z[i]);
+      valU_eval[i]  = scalar_view("valU_eval", c, p);
+      fe_->evaluateValue(valU_eval[i], U[i]);
+      gradU_eval[i] = scalar_view("gradU_eval", c, p, d);
+      fe_->evaluateGradient(gradU_eval[i], U[i]);
+    }
+
+    // Build force/control term
+    std::vector<scalar_view> F(2);
+    for (int i=0; i<2; ++i) {
+      F[i] = scalar_view("F", c, p);
+      computeForce(F[i], i);
+      scalar_view wZ("wZ", c, p);
+      fst::scalarMultiplyDataData(wZ, ctrlWeight_, valZ_eval[i]);
+      rst::add(F[i], wZ);
+    }
+
+    // Build wave number
+    scalar_view kappa("kappa", c, p);
+    computeRefractiveIndex(kappa);
+    std::vector<scalar_view> kappaU(2);
+    for (int i=0; i<2; ++i) {
+      kappaU[i] = scalar_view("kappaU", c, p);
+      fst::scalarMultiplyDataData(kappaU[i], kappa, valU_eval[i]);
+    }
+
+    /*******************************************************************/
+    /*** Evaluate weak form of the residual.****************************/
+    /*** a(ur,vr) - a(ui,vi) + b(ur,vr) + b(ui,vi) = zr(vr) - zi(vi) ***/
+    /*** a(ur,vi) + a(ui,vr) - b(ur,vi) + b(ui,vr) = zr(vi) + zi(vr) ***/
+    /*******************************************************************/
+    std::vector<scalar_view> stiff(2);
+    std::vector<scalar_view> mass(2);
+    std::vector<scalar_view> load(2);
+    for (int i=0; i<2; ++i) {
+      stiff[i] = scalar_view("stiff", c, f);
+      mass[i]  = scalar_view("mass", c, f);
+      load[i]  = scalar_view("load", c, f);
+
+      fst::integrate(stiff[i],
+                     gradU_eval[i],     // grad U
+                     fe_->gradNdetJ(),  // grad N
+                     false);
+      fst::integrate(mass[i],
+                     kappaU[i],         // -kappa2 U
+                     fe_->NdetJ(),      // N
+                     false);
+      fst::integrate(load[i],
+                     F[i],              // F
+                     fe_->NdetJ(),      // N
+                     false);
+    }
+
+    Kokkos::deep_copy(R[0], stiff[0]);
+    rst::subtract(R[0], stiff[1]);
+    rst::subtract(R[0], mass[0]);
+    rst::add(R[0], mass[1]);
+    rst::subtract(R[0], load[0]);
+    rst::add(R[0], load[1]);
+
+    Kokkos::deep_copy(R[1], stiff[0]);
+    rst::add(R[1], stiff[1]);
+    rst::subtract(R[1], mass[0]);
+    rst::subtract(R[1], mass[1]);
+    rst::subtract(R[1], load[0]);
+    rst::subtract(R[1], load[1]);
+
+    // APPLY ROBIN CONTROLS: Sideset 0
+    int sideset = 0;
+    int numLocalSideIds = bdryCellLocIds_[sideset].size();
+    const int numCubPerSide = bdryCub_->getNumPoints();
+    for (int j = 0; j < numLocalSideIds; ++j) {
+      int numCellsSide = bdryCellLocIds_[sideset][j].size();
+      if (numCellsSide) {
+        std::vector<scalar_view> robinRes(2);
+        for (int i = 0; i < 2; ++i) {
+          robinRes[i] = scalar_view("robinRes", numCellsSide, f);
+          // Get U coefficients on Robin boundary
+          scalar_view u_coeff_bdry = getBoundaryCoeff(U[i], sideset, j);
+          // Evaluate U on FE basis
+          scalar_view valU_eval_bdry("valU_eval_bdry", numCellsSide, numCubPerSide);
+          feBdry_[j]->evaluateValue(valU_eval_bdry, u_coeff_bdry);
+          // Compute Neumann residual
+          fst::integrate(robinRes[i], valU_eval_bdry, feBdry_[j]->NdetJ(), false);
+        }
+        // Add Neumann control residual to volume residual
+        for (int k = 0; k < numCellsSide; ++k) {
+          int cidx = bdryCellLocIds_[sideset][j][k];
+          for (int l = 0; l < f; ++l) {
+            (R[0])(cidx, l) += waveNumber_ * ((robinRes[0])(k, l) + (robinRes[1])(k, l));
+            (R[1])(cidx, l) += waveNumber_ * ((robinRes[1])(k, l) - (robinRes[0])(k, l));
+          }
+        }
+      }
+    }
+
+    // Combine the residuals.
+    fieldHelper_->combineFieldCoeff(res, R);
+  }
+
+  void Jacobian_1(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    int p = fe_->gradN().extent_int(2);
+
+    // Initialize Jacobians.
+    std::vector<std::vector<scalar_view>> J(2);
+    for (int i=0; i<2; ++i) {
+      for (int j=0; j<2; ++j) {
+        J[i].push_back(scalar_view("J1", c, f, f));
+      }
+    }
+
+    // Build wave number
+    scalar_view kappa("kappa", c, p);
+    computeRefractiveIndex(kappa);
+    scalar_view kappaN("kappaN", c, f, p);
+    fst::scalarMultiplyDataField(kappaN, kappa, fe_->N());
+
+    /*** Evaluate weak form of the Jacobian. ***/
+    scalar_view kappaM("kappaM", c, f, f);
+    fst::integrate(kappaM, kappaN, fe_->NdetJ(), false);
+
+    Kokkos::deep_copy(J[0][0], fe_->stiffMat());
+    rst::subtract(J[0][0], kappaM);
+
+    Kokkos::deep_copy(J[0][1], fe_->stiffMat());
+    rst::subtract(J[0][1], kappaM);
+    rst::scale(J[0][1], static_cast<Real>(-1));
+
+    Kokkos::deep_copy(J[1][0], fe_->stiffMat());
+    rst::subtract(J[1][0], kappaM);
+
+    Kokkos::deep_copy(J[1][1], fe_->stiffMat());
+    rst::subtract(J[1][1], kappaM);
+
+    // APPLY ROBIN CONTROL: Sideset 0
+    int sideset = 0;
+    int numLocalSideIds = bdryCellLocIds_[sideset].size();
+    for (int j = 0; j < numLocalSideIds; ++j) {
+      int numCellsSide = bdryCellLocIds_[sideset][j].size();
+      if (numCellsSide) {
+        // Add Neumann control Jacobian to volume residual
+        for (int k = 0; k < numCellsSide; ++k) {
+          int cidx = bdryCellLocIds_[sideset][j][k];
+          for (int l = 0; l < f; ++l) {
+            for (int m = 0; m < f; ++m) {
+              (J[0][0])(cidx, l, m) += waveNumber_ * (feBdry_[j]->massMat())(k, l, m);
+              (J[0][1])(cidx, l, m) += waveNumber_ * (feBdry_[j]->massMat())(k, l, m);
+              (J[1][0])(cidx, l, m) -= waveNumber_ * (feBdry_[j]->massMat())(k, l, m);
+              (J[1][1])(cidx, l, m) += waveNumber_ * (feBdry_[j]->massMat())(k, l, m);
+            }
+          }
+        }
+      }
+    }
+
+    // Combine the jacobians.
+    fieldHelper_->combineFieldCoeff(jac, J);
+  }
+
+
+  void Jacobian_2(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = ROL::nullPtr,
+                  const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+
+    // Initialize Jacobians.
+    std::vector<std::vector<scalar_view>> J(2);
+    for (int i=0; i<2; ++i) {
+      for (int j=0; j<2; ++j) {
+        J[i].push_back(scalar_view("J2", c, f, f));
+      }
+    }
+
+    /*** Evaluate weak form of the Jacobian. ***/
+    Kokkos::deep_copy(J[0][0], ctrlJac_);
+    rst::scale(J[0][0], static_cast<Real>(-1));
+
+    Kokkos::deep_copy(J[0][1], ctrlJac_);
+
+    Kokkos::deep_copy(J[1][0], ctrlJac_);
+    rst::scale(J[1][0], static_cast<Real>(-1));
+
+    Kokkos::deep_copy(J[1][1], ctrlJac_);
+    rst::scale(J[1][1], static_cast<Real>(-1));
+
+    // Combine the jacobians.
+    fieldHelper_->combineFieldCoeff(jac, J);
+  }
+
+  void Hessian_11(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Helmholtz::Hessian_11): Hessian is zero.");
+  }
+
+  void Hessian_12(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Helmholtz::Hessian_12): Hessian is zero.");
+  }
+
+  void Hessian_21(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Helmholtz::Hessian_21): Hessian is zero.");
+  }
+
+  void Hessian_22(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Helmholtz::Hessian_22): Hessian is zero.");
+  }
+
+  void RieszMap_1(scalar_view & riesz) override {
+    //throw Exception::NotImplemented(">>> (PDE_TopoOpt::RieszMap_1): Not implemented.");
+
+    // Retrieve dimensions.
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    int d = fe_->gradN().extent_int(3);
+
+    // Initialize Jacobians.
+    std::vector<std::vector<scalar_view>> J(d);
+    for (int i=0; i<d; ++i) {
+      for (int j=0; j<d; ++j) {
+        J[i].push_back(scalar_view("JRM1", c, f, f));
+      }
+    }
+
+    for (int i=0; i<d; ++i) {
+      Kokkos::deep_copy(J[i][i], fe_->stiffMat());
+      rst::add(J[i][i], fe_->massMat());
+    }
+
+    // Combine the jacobians.
+    fieldHelper_->combineFieldCoeff(riesz, J);
+  }
+
+  void RieszMap_2(scalar_view & riesz) override {
+    //throw Exception::NotImplemented(">>> (PDE_TopoOpt::RieszMap_2): Not implemented.");
+
+    // Retrieve dimensions.
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    int d = fe_->gradN().extent_int(3);
+ 
+    // Initialize Jacobians.
+    std::vector<std::vector<scalar_view>> J(d);
+    for (int i=0; i<d; ++i) {
+      for (int j=0; j<d; ++j) {
+        J[i].push_back(scalar_view("JRM2", c, f, f));
+      }
+    }
+
+    for (int i=0; i<d; ++i) {
+      Kokkos::deep_copy(J[i][i], fe_->massMat());
+    }
+
+    // Combine the jacobians.
+    fieldHelper_->combineFieldCoeff(riesz, J);
+  }
+
+  std::vector<basis_ptr> getFields() override {
+    return basisPtrs_;
+  }
+
+  void setCellNodes(const scalar_view & volCellNodes,
+                    const std::vector<std::vector<scalar_view>> & bdryCellNodes,
+                    const std::vector<std::vector<std::vector<int>>> & bdryCellLocIds) override {
+    volCellNodes_ = volCellNodes;
+    bdryCellNodes_ = bdryCellNodes;
+    bdryCellLocIds_ = bdryCellLocIds;
+    // Finite element definition.
+    fe_ = ROL::makePtr<fe_type>(volCellNodes_, basisPtr_, cellCub_);
+    fidx_ = fe_->getBoundaryDofs();
+    // Construct boundary FE
+    int sideset = 0;
+    int numLocSides = bdryCellNodes[sideset].size();
+    feBdry_.resize(numLocSides);
+    for (int j = 0; j < numLocSides; ++j) {
+      if (bdryCellNodes[sideset][j] != scalar_view()) {
+        feBdry_[j] = ROL::makePtr<fe_type>(bdryCellNodes[sideset][j], basisPtr_, bdryCub_, j);
+      }
+    }
+    // Compute control weight
+    computeControlWeight();
+    buildControlJacobian();
+  }
+
+  void setFieldPattern(const std::vector<std::vector<int>> & fieldPattern) override {
+    fieldPattern_ = fieldPattern;
+    fieldHelper_ = ROL::makePtr<FieldHelper<Real, DeviceType>>(numFields_, numDofs_, numFieldDofs_, fieldPattern_);
+  }
+
+  const ROL::Ptr<fe_type> getFE(void) const {
+    return fe_;
+  }
+
+  const std::vector<fe_type> getBdryFE(void) const {
+    return feBdry_;
+  }
+
+  const std::vector<std::vector<int>> getBdryCellLocIds(const int sideset = 0) const {
+    return bdryCellLocIds_[sideset];
+  }
+
+  const ROL::Ptr<FieldHelper<Real, DeviceType>> getFieldHelper(void) const {
+    return fieldHelper_;
+  }
+
+}; // PDE_Helmholtz
+
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/navier-stokes/CMakeLists.txt b/packages/rol/example/PDE-OPT/navier-stokes/CMakeLists.txt
index 348f7f62300f..cf696baf71ac 100644
--- a/packages/rol/example/PDE-OPT/navier-stokes/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/navier-stokes/CMakeLists.txt
@@ -11,31 +11,31 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_01
     SOURCES example_01.cpp
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_02
     SOURCES example_02.cpp
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_03
     SOURCES example_03.cpp
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_04
     SOURCES example_04.cpp
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_05
     SOURCES example_05.cpp
     ADD_DIR_TO_NAME
@@ -59,7 +59,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   #  ADD_DIR_TO_NAME
   #)
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     NavierStokesDataCopy
     SOURCE_FILES
       input.xml input_ex03.xml input_ex04.xml input_ex05.xml plotresults.m plotexample02.m process.sh
@@ -67,5 +67,75 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
   )
 
+ENDIF()
+
+IF(${PROJECT_NAME}_ENABLE_Intrepid2 AND
+   ${PROJECT_NAME}_ENABLE_Ifpack2   AND
+   ${PROJECT_NAME}_ENABLE_MueLu     AND
+   ${PROJECT_NAME}_ENABLE_Amesos2   AND
+   ${PROJECT_NAME}_ENABLE_Tpetra )
+
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/teuchos/src/sol)
+
+  # Need ROL_TpetraMultiVector.hpp
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
+
+  ROL_ADD_EXECUTABLE(
+    example_01_Kokkos
+    SOURCES example_01K.cpp
+    ADD_DIR_TO_NAME
+  )
+
+  #  ROL_ADD_EXECUTABLE(
+  #    example_02_Kokkos
+  #    SOURCES example_02K.cpp
+  #    ADD_DIR_TO_NAME
+  #  )
+
+  #  ROL_ADD_EXECUTABLE(
+  #    example_03_Kokkos
+  #    SOURCES example_03K.cpp
+  #    ADD_DIR_TO_NAME
+  #  )
+  #
+  #  ROL_ADD_EXECUTABLE(
+  #    example_04_Kokkos
+  #    SOURCES example_04K.cpp
+  #    ADD_DIR_TO_NAME
+  #  )
+  #
+  ROL_ADD_EXECUTABLE(
+    example_05_Kokkos
+    SOURCES example_05K.cpp
+    ADD_DIR_TO_NAME
+  )
+
+  TRIBITS_ADD_TEST(
+    example_01_Kokkos
+    ARGS PrintItAll
+    NUM_MPI_PROCS 2
+    NUM_TOTAL_CORES_USED 2
+    PASS_REGULAR_EXPRESSION "TEST PASSED"
+    ADD_DIR_TO_NAME
+    RUN_SERIAL
+  )
+
+  #TRIBITS_ADD_TEST(
+  #  example_02
+  #  ARGS PrintItAll
+  #  NUM_MPI_PROCS 4
+  #  NUM_TOTAL_CORES_USED 4
+  #  PASS_REGULAR_EXPRESSION "TEST PASSED"
+  #  ADD_DIR_TO_NAME
+  #)
+
+  ROL_COPY_FILES_TO_BINARY_DIR(
+    NavierStokesDataCopyK
+    SOURCE_FILES
+      input.xml input_ex03.xml input_ex04.xml input_ex05.xml plotresults.m plotexample02.m process.sh
+    SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}"
+    DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
+  )
 
 ENDIF()
diff --git a/packages/rol/example/PDE-OPT/navier-stokes/example_01.cpp b/packages/rol/example/PDE-OPT/navier-stokes/example_01.cpp
index 123afb3ca32e..7e8700432bdd 100644
--- a/packages/rol/example/PDE-OPT/navier-stokes/example_01.cpp
+++ b/packages/rol/example/PDE-OPT/navier-stokes/example_01.cpp
@@ -13,8 +13,7 @@
 
 #include "Teuchos_Comm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -42,7 +41,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int>> comm
     = Tpetra::getDefaultComm();
   const int myRank = comm->getRank();
diff --git a/packages/rol/example/PDE-OPT/navier-stokes/example_01K.cpp b/packages/rol/example/PDE-OPT/navier-stokes/example_01K.cpp
new file mode 100644
index 000000000000..2489cb5db4c4
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/navier-stokes/example_01K.cpp
@@ -0,0 +1,175 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  example_01.cpp
+    \brief Shows how to solve the Navier-Stokes control problem.
+*/
+
+#include "Teuchos_Comm.hpp"
+#include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
+
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Version.hpp"
+
+#include <iostream>
+#include <algorithm>
+
+#include "ROL_TpetraMultiVector.hpp"
+#include "ROL_Reduced_Objective_SimOpt.hpp"
+#include "ROL_Solver.hpp"
+
+#include "../TOOLS/meshmanagerK.hpp"
+#include "../TOOLS/pdeconstraintK.hpp"
+#include "../TOOLS/pdeobjectiveK.hpp"
+#include "../TOOLS/pdevectorK.hpp"
+#include "pde_navier-stokesK.hpp"
+#include "obj_navier-stokesK.hpp"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+
+int main(int argc, char *argv[]) {
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+
+  /*** Initialize communicator. ***/
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  Kokkos::ScopeGuard kokkosScope (argc, argv);
+  ROL::Ptr<const Teuchos::Comm<int>> comm
+    = Tpetra::getDefaultComm();
+  const int myRank = comm->getRank();
+  if ((iprint > 0) && (myRank == 0))
+    outStream = ROL::makePtrFromRef(std::cout);
+  else
+    outStream = ROL::makePtrFromRef(bhs);
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+
+    /*** Read in XML input ***/
+    std::string filename = "input.xml";
+    auto parlist = ROL::getParametersFromXmlFile(filename);
+
+    /*** Initialize main data structure. ***/
+    auto meshMgr = ROL::makePtr<MeshManager_BackwardFacingStepChannel<RealT,DeviceT>>(*parlist);
+    // Initialize PDE describing Navier-Stokes equations.
+    auto pde = ROL::makePtr<PDE_NavierStokes<RealT,DeviceT>>(*parlist);
+    auto con = ROL::makePtr<PDE_Constraint<RealT,DeviceT>>(pde,meshMgr,comm,*parlist,*outStream);
+    // Cast the constraint and get the assembler.
+    auto assembler = con->getAssembler();
+    con->setSolveParameters(*parlist);
+
+    // Create state vector and set to zeroes
+    auto u_ptr = assembler->createStateVector();    u_ptr->randomize();
+    auto p_ptr = assembler->createStateVector();    p_ptr->randomize();
+    auto z_ptr = assembler->createControlVector();  z_ptr->randomize();
+    auto r_ptr = assembler->createResidualVector(); r_ptr->putScalar(0.0);
+    auto up = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(u_ptr,pde,assembler,*parlist);
+    auto pp = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(p_ptr,pde,assembler,*parlist);
+    auto zp = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(z_ptr,pde,assembler,*parlist);
+    auto rp = ROL::makePtr<PDE_DualSimVector<RealT,DeviceT>>(r_ptr,pde,assembler,*parlist);
+    // Create ROL SimOpt vectors
+    ROL::Vector_SimOpt<RealT> x(up,zp);
+
+    // Initialize quadratic objective function.
+    std::vector<ROL::Ptr<QoI<RealT,DeviceT>>> qoi_vec(2,ROL::nullPtr);
+    qoi_vec[0] = ROL::makePtr<QoI_State_NavierStokes<RealT,DeviceT>>(*parlist,
+                                                             pde->getVelocityFE(),
+                                                             pde->getPressureFE(),
+                                                             pde->getFieldHelper());
+    qoi_vec[1] = ROL::makePtr<QoI_L2Penalty_NavierStokes<RealT,DeviceT>>(pde->getVelocityFE(),
+                                                                 pde->getPressureFE(),
+                                                                 pde->getVelocityBdryFE(),
+                                                                 pde->getBdryCellLocIds(),
+                                                                 pde->getFieldHelper());
+    auto std_obj = ROL::makePtr<StdObjective_NavierStokes<RealT>>(*parlist);
+    auto obj = ROL::makePtr<PDE_Objective<RealT,DeviceT>>(qoi_vec,std_obj,assembler);
+    auto robj = ROL::makePtr<ROL::Reduced_Objective_SimOpt<RealT>>(obj, con, up, zp, pp, true, false);
+
+    // Run derivative checks
+    bool checkDeriv = parlist->sublist("Problem").get("Check derivatives",false);
+    if ( checkDeriv ) {
+      auto dup = up->clone(); dup->randomize(-1.0,1.0);
+      auto dzp = zp->clone(); dzp->randomize(-1.0,1.0);
+      ROL::Vector_SimOpt<RealT> d(dup,dzp);
+      obj->checkGradient(x,d,true,*outStream);
+      obj->checkHessVec(x,d,true,*outStream);
+      con->checkApplyJacobian_1(*up,*zp,*dup,*up,true,*outStream);
+      con->checkApplyJacobian_2(*up,*zp,*dzp,*up,true,*outStream);
+      con->checkApplyJacobian(x,d,*up,true,*outStream);
+      con->checkApplyAdjointHessian(x,*dup,d,x,true,*outStream);
+      con->checkAdjointConsistencyJacobian(*dup,d,x,true,*outStream);
+      con->checkInverseJacobian_1(*up,*up,*up,*zp,true,*outStream);
+      con->checkInverseAdjointJacobian_1(*up,*up,*up,*zp,true,*outStream);
+      robj->checkGradient(*zp,*dzp,true,*outStream);
+      robj->checkHessVec(*zp,*dzp,true,*outStream);
+    }
+
+    up->zero(); zp->zero();
+    ROL::Ptr<ROL::Problem<RealT>> optProb;
+    bool useFullSpace = parlist->sublist("Problem").get("Full space",false);
+    if ( useFullSpace ) {
+      std::string step = parlist->sublist("Step").get("Type", "Composite Step");
+      auto els = ROL::StringToEStep(step);
+      switch( els ) {
+        case ROL::STEP_COMPOSITESTEP:
+        case ROL::STEP_FLETCHER: {
+          RealT tol(1.e-8);
+          bool initSolve = parlist->sublist("Problem").get("Solve state for full space",true);
+          if (initSolve) {
+            con->solve(*rp,*up,*zp,tol);
+            con->outputTpetraVector(u_ptr,"state_uncontrolled.txt");
+          }
+          optProb = ROL::makePtr<ROL::Problem<RealT>>(obj, makePtrFromRef(x));
+          optProb->addConstraint("PDE", con, rp);
+          break;
+        }
+        default: {
+          *outStream << "ERROR: Unsupported step." << std::endl;      
+          errorFlag = 1; 
+        }
+      }
+    }
+    else {
+      parlist->sublist("Step").set("Type","Trust Region");
+      optProb = ROL::makePtr<ROL::Problem<RealT>>(robj, zp);
+    }
+    optProb->finalize(false,true,*outStream);
+    ROL::Solver<RealT> optSolver(optProb, *parlist);
+    optSolver.solve(*outStream);
+
+    // Output.
+    assembler->printMeshData(*outStream);
+    RealT tol(1.e-8);
+    Teuchos::Array<RealT> res(1,0);
+    con->solve(*rp,*up,*zp,tol);
+    con->outputTpetraVector(u_ptr,"state.txt");
+    con->outputTpetraVector(z_ptr,"control.txt");
+    con->value(*rp,*up,*zp,tol);
+    r_ptr->norm2(res.view(0,1));
+    *outStream << "Residual Norm: " << res[0] << std::endl;
+    errorFlag += (res[0] > 1.e-6 ? 1 : 0);
+    //con->outputTpetraData();
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/navier-stokes/example_02.cpp b/packages/rol/example/PDE-OPT/navier-stokes/example_02.cpp
index 736ac9d2983e..cb549bbafbea 100644
--- a/packages/rol/example/PDE-OPT/navier-stokes/example_02.cpp
+++ b/packages/rol/example/PDE-OPT/navier-stokes/example_02.cpp
@@ -13,8 +13,7 @@
 
 #include "Teuchos_Comm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -61,7 +60,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int> > comm
     = Tpetra::getDefaultComm();
   ROL::Ptr<const Teuchos::Comm<int> > serial_comm
diff --git a/packages/rol/example/PDE-OPT/navier-stokes/example_02K.cpp b/packages/rol/example/PDE-OPT/navier-stokes/example_02K.cpp
new file mode 100644
index 000000000000..9ce9c8cb773d
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/navier-stokes/example_02K.cpp
@@ -0,0 +1,303 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  example_01.cpp
+    \brief Shows how to solve the stochastic Stefan-Boltzmann problem.
+*/
+
+#include "Teuchos_Comm.hpp"
+#include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
+
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Version.hpp"
+
+#include <iostream>
+#include <algorithm>
+//#include <fenv.h>
+
+#include "ROL_Bounds.hpp"
+#include "ROL_Reduced_Objective_SimOpt.hpp"
+#include "ROL_MonteCarloGenerator.hpp"
+#include "ROL_StochasticProblem.hpp"
+#include "ROL_Solver.hpp"
+#include "ROL_TpetraTeuchosBatchManager.hpp"
+
+#include "../TOOLS/meshmanagerK.hpp"
+#include "../TOOLS/pdeconstraintK.hpp"
+#include "../TOOLS/pdeobjectiveK.hpp"
+#include "../TOOLS/pdevectorK.hpp"
+#include "../TOOLS/batchmanagerK.hpp"
+#include "pde_navier-stokesK.hpp"
+#include "obj_navier-stokesK.hpp"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+
+template<class Real>
+Real random(const Teuchos::Comm<int> &comm,
+            const Real a = -1, const Real b = 1) {
+  Real val(0), u(0);
+  if ( Teuchos::rank<int>(comm)==0 ) {
+    u   = static_cast<Real>(rand())/static_cast<Real>(RAND_MAX);
+    val = (b-a)*u + a;
+  }
+  Teuchos::broadcast<int,Real>(comm,0,1,&val);
+  return val;
+}
+
+int main(int argc, char *argv[]) {
+  //feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
+
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+
+  /*** Initialize communicator. ***/
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  Kokkos::ScopeGuard kokkosScope (argc, argv);
+  ROL::Ptr<const Teuchos::Comm<int> > comm
+    = Tpetra::getDefaultComm();
+  ROL::Ptr<const Teuchos::Comm<int> > serial_comm
+    = ROL::makePtr<Teuchos::SerialComm<int>>();
+  const int myRank = comm->getRank();
+  if ((iprint > 0) && (myRank == 0))
+    outStream = ROL::makePtrFromRef(std::cout);
+  else
+    outStream = ROL::makePtrFromRef(bhs);
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+
+    /*** Read in XML input ***/
+    std::string filename = "input.xml";
+    auto parlist = ROL::getParametersFromXmlFile(filename);
+
+    // Problem dimensions
+    const int stochDim = 2;
+    const RealT one(1); 
+
+    /*************************************************************************/
+    /***************** BUILD GOVERNING PDE ***********************************/
+    /*************************************************************************/
+    /*** Initialize main data structure. ***/
+    auto meshMgr = ROL::makePtr<MeshManager_BackwardFacingStepChannel<RealT,DeviceT>>(*parlist);
+    // Initialize PDE describing advection-diffusion equation
+    auto pde = ROL::makePtr<PDE_NavierStokes<RealT,DeviceT>>(*parlist);
+    auto con = ROL::makePtr<PDE_Constraint<RealT,DeviceT>>(pde,meshMgr,serial_comm,*parlist,*outStream);
+    // Cast the constraint and get the assembler.
+    auto assembler = con->getAssembler();
+    con->setSolveParameters(*parlist);
+
+    /*************************************************************************/
+    /***************** BUILD VECTORS *****************************************/
+    /*************************************************************************/
+    auto  u_ptr = assembler->createStateVector();    u_ptr->randomize();
+    auto du_ptr = assembler->createStateVector();    du_ptr->randomize();
+    auto  p_ptr = assembler->createStateVector();    p_ptr->randomize();
+    auto  r_ptr = assembler->createResidualVector(); r_ptr->randomize();
+    auto  z_ptr = assembler->createControlVector();  z_ptr->randomize();
+    auto up  = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(u_ptr,pde,assembler,*parlist);
+    auto dup = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(u_ptr,pde,assembler,*parlist);
+    auto pp  = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(p_ptr,pde,assembler,*parlist);
+    auto rp  = ROL::makePtr<PDE_DualSimVector<RealT,DeviceT>>(r_ptr,pde,assembler,*parlist);
+    ROL::Ptr<ROL::TpetraMultiVector<RealT>> zpde
+      = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(z_ptr,pde,assembler,*parlist);
+    auto zp = ROL::makePtr<PDE_OptVector<RealT>>(zpde);
+    // Create ROL SimOpt vectors
+    ROL::Vector_SimOpt<RealT> x(up,zp);
+
+    /*************************************************************************/
+    /***************** BUILD COST FUNCTIONAL *********************************/
+    /*************************************************************************/
+    std::vector<ROL::Ptr<QoI<RealT,DeviceT>>> qoi_vec(2,ROL::nullPtr);
+    qoi_vec[0] = ROL::makePtr<QoI_State_NavierStokes<RealT,DeviceT>>(*parlist,
+                                                             pde->getVelocityFE(),
+                                                             pde->getPressureFE(),
+                                                             pde->getFieldHelper());
+    qoi_vec[1] = ROL::makePtr<QoI_L2Penalty_NavierStokes<RealT,DeviceT>>(pde->getVelocityFE(),
+                                                                 pde->getPressureFE(),
+                                                                 pde->getVelocityBdryFE(),
+                                                                 pde->getBdryCellLocIds(),
+                                                                 pde->getFieldHelper());
+    auto std_obj = ROL::makePtr<StdObjective_NavierStokes<RealT>>(*parlist);
+    auto obj = ROL::makePtr<PDE_Objective<RealT,DeviceT>>(qoi_vec,std_obj,assembler);
+    auto objReduced = ROL::makePtr<ROL::Reduced_Objective_SimOpt<RealT>>(obj, con, up, zp, pp, true, false);
+
+    /*************************************************************************/
+    /***************** BUILD BOUND CONSTRAINT ********************************/
+    /*************************************************************************/
+    auto zlo_ptr = assembler->createControlVector();
+    auto zhi_ptr = assembler->createControlVector();
+    zlo_ptr->putScalar(static_cast<RealT>(0));
+    zhi_ptr->putScalar(ROL::ROL_INF<RealT>());
+    ROL::Ptr<ROL::TpetraMultiVector<RealT>> zlopde, zhipde;
+    zlopde = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(zlo_ptr,pde,assembler,*parlist);
+    zhipde = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(zhi_ptr,pde,assembler,*parlist);
+    auto zlop = ROL::makePtr<PDE_OptVector<RealT>>(zlopde);
+    auto zhip = ROL::makePtr<PDE_OptVector<RealT>>(zhipde);
+    auto bnd = ROL::makePtr<ROL::Bounds<RealT>>(zlop,zhip);
+    bool useBounds = parlist->sublist("Problem").get("Use bounds", false);
+    if (!useBounds) bnd->deactivate();
+
+    /*************************************************************************/
+    /***************** BUILD SAMPLER *****************************************/
+    /*************************************************************************/
+    int nsamp = parlist->sublist("Problem").get("Number of samples",100);
+    std::vector<RealT> tmp = {-one,one};
+    std::vector<std::vector<RealT>> bounds(stochDim,tmp);
+    auto bman = ROL::makePtr<PDE_OptVector_BatchManager<RealT,DeviceT>>(comm);
+    auto sampler = ROL::makePtr<ROL::MonteCarloGenerator<RealT>>(nsamp,bounds,bman);
+
+    /*************************************************************************/
+    /***************** BUILD STOCHASTIC PROBLEM ******************************/
+    /*************************************************************************/
+    auto opt = ROL::makePtr<ROL::StochasticProblem<RealT>>(objReduced,zp);
+    opt->addBoundConstraint(bnd);
+    parlist->sublist("SOL").set("Initial Statistic",one);
+    opt->makeObjectiveStochastic(*parlist,sampler);
+    opt->finalize(false,true,*outStream);
+
+    /*************************************************************************/
+    /***************** RUN VECTOR AND DERIVATIVE CHECKS **********************/
+    /*************************************************************************/
+    bool checkDeriv = parlist->sublist("Problem").get("Check derivatives",false);
+    if ( checkDeriv ) {
+      auto dzp = zp->clone(); dzp->randomize(-one,one);
+      auto yzp = zp->clone(); yzp->randomize(-one,one);
+      ROL::Vector_SimOpt<RealT> d(dup,dzp);
+      up->checkVector(*pp,*dup,true,*outStream);
+      zp->checkVector(*yzp,*dzp,true,*outStream);
+      obj->checkGradient(x,d,true,*outStream);
+      obj->checkHessVec(x,d,true,*outStream);
+      con->checkApplyJacobian(x,d,*up,true,*outStream);
+      con->checkApplyAdjointHessian(x,*dup,d,x,true,*outStream);
+      con->checkAdjointConsistencyJacobian(*dup,d,x,true,*outStream);
+      con->checkInverseJacobian_1(*up,*up,*up,*zp,true,*outStream);
+      con->checkInverseAdjointJacobian_1(*up,*up,*up,*zp,true,*outStream);
+      objReduced->checkGradient(*zp,*dzp,true,*outStream);
+      objReduced->checkHessVec(*zp,*dzp,true,*outStream);
+      opt->check(true,*outStream);
+    }
+
+    /*************************************************************************/
+    /***************** SOLVE OPTIMIZATION PROBLEM ****************************/
+    /*************************************************************************/
+    parlist->sublist("Step").set("Type","Trust Region");
+    ROL::Solver<RealT> solver(opt,*parlist);
+    up->zero(); zp->zero();
+    std::clock_t timer = std::clock();
+    solver.solve(*outStream);
+    *outStream << "Optimization time: "
+               << static_cast<RealT>(std::clock()-timer)/static_cast<RealT>(CLOCKS_PER_SEC)
+               << " seconds." << std::endl << std::endl;
+
+    /*************************************************************************/
+    /***************** OUTPUT RESULTS ****************************************/
+    /*************************************************************************/
+    std::clock_t timer_print = std::clock();
+    assembler->printMeshData(*outStream);
+    // Output control to file
+    con->outputTpetraVector(z_ptr,"control.txt");
+    // Output expected state and samples to file
+    *outStream << std::endl << "Print Expected Value of State" << std::endl;
+    up->zero(); pp->zero(); dup->zero();
+    RealT tol(1.e-8);
+    auto bman_Eu = ROL::makePtr<ROL::TpetraTeuchosBatchManager<RealT>>(comm);
+    std::vector<RealT> sample(stochDim);
+    std::stringstream name_samp;
+    name_samp << "samples_" << bman->batchID() << ".txt";
+    std::ofstream file_samp;
+    file_samp.open(name_samp.str());
+    file_samp << std::scientific << std::setprecision(15);
+    for (int i = 0; i < sampler->numMySamples(); ++i) {
+      *outStream << "Sample i = " << i << std::endl;
+      sample = sampler->getMyPoint(i);
+      con->setParameter(sample);
+      con->solve(*rp,*dup,*zp,tol);
+      up->axpy(sampler->getMyWeight(i),*dup);
+      for (int j = 0; j < stochDim; ++j)
+        file_samp << std::setw(25) << std::left << sample[j];
+      file_samp << std::endl;
+    }
+    file_samp.close();
+    bman_Eu->sumAll(*up,*pp);
+    con->outputTpetraVector(p_ptr,"mean_state.txt");
+    // Output extreme cases
+    std::vector<RealT> param(stochDim,0);
+    param[0] = -one; param[1] = -one;
+    con->setParameter(param);
+    con->solve(*rp,*dup,*zp,tol);
+    con->outputTpetraVector(du_ptr,"state_m1_m1.txt");
+    param[0] =  one; param[1] = -one;
+    con->setParameter(param);
+    con->solve(*rp,*dup,*zp,tol);
+    con->outputTpetraVector(du_ptr,"state_p1_m1.txt");
+    param[0] = -one; param[1] =  one;
+    con->setParameter(param);
+    con->solve(*rp,*dup,*zp,tol);
+    con->outputTpetraVector(du_ptr,"state_m1_p1.txt");
+    param[0] =  one; param[1] =  one;
+    con->setParameter(param);
+    con->solve(*rp,*dup,*zp,tol);
+    con->outputTpetraVector(du_ptr,"state_p1_p1.txt");
+    // Build objective function distribution
+    *outStream << std::endl << "Print Objective CDF" << std::endl;
+    RealT val1(0), val2(0);
+    int nsamp_dist = parlist->sublist("Problem").get("Number of output samples",100);
+    auto stateCost = ROL::makePtr<IntegralObjective<RealT,DeviceT>>(qoi_vec[0],assembler);
+    auto redStateCost = ROL::makePtr<ROL::Reduced_Objective_SimOpt<RealT>>(stateCost, con, up, zp, pp, true, false);
+    auto ctrlCost = ROL::makePtr<IntegralObjective<RealT,DeviceT>>(qoi_vec[1],assembler);
+    auto redCtrlCost = ROL::makePtr<ROL::Reduced_Objective_SimOpt<RealT>>(ctrlCost, con, up, zp, pp, true, false);
+    auto sampler_dist = ROL::makePtr<ROL::MonteCarloGenerator<RealT>>(nsamp_dist,bounds,bman);
+    std::stringstream name;
+    name << "obj_samples_" << bman->batchID() << ".txt";
+    std::ofstream file;
+    file.open(name.str());
+    file << std::scientific << std::setprecision(15);
+    for (int i = 0; i < sampler_dist->numMySamples(); ++i) {
+      sample = sampler_dist->getMyPoint(i);
+      redStateCost->setParameter(sample);
+      val1 = redStateCost->value(*zp,tol);
+      redCtrlCost->setParameter(sample);
+      val2 = redCtrlCost->value(*zp,tol);
+      for (int j = 0; j < stochDim; ++j)
+        file << std::setw(25) << std::left << sample[j];
+      file << std::setw(25) << std::left << val1;
+      file << std::setw(25) << std::left << val2 << std::endl;
+    }
+    file.close();
+    *outStream << "Output time: "
+               << static_cast<RealT>(std::clock()-timer_print)/static_cast<RealT>(CLOCKS_PER_SEC)
+               << " seconds." << std::endl << std::endl;
+
+    Teuchos::Array<RealT> res(1,0);
+    con->solve(*rp,*up,*zp,tol);
+    r_ptr->norm2(res.view(0,1));
+
+    /*************************************************************************/
+    /***************** CHECK RESIDUAL NORM ***********************************/
+    /*************************************************************************/
+    *outStream << "Residual Norm: " << res[0] << std::endl << std::endl;
+    errorFlag += (res[0] > 1.e-6 ? 1 : 0);
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/navier-stokes/example_03.cpp b/packages/rol/example/PDE-OPT/navier-stokes/example_03.cpp
index 173ab82ff089..30194a0adaf2 100644
--- a/packages/rol/example/PDE-OPT/navier-stokes/example_03.cpp
+++ b/packages/rol/example/PDE-OPT/navier-stokes/example_03.cpp
@@ -16,8 +16,7 @@
 #include "Teuchos_DefaultMpiComm.hpp"
 #endif
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -138,7 +137,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int>> comm
     = Tpetra::getDefaultComm();
   ROL::Ptr<const Teuchos::Comm<int>> serial_comm
diff --git a/packages/rol/example/PDE-OPT/navier-stokes/example_03K.cpp b/packages/rol/example/PDE-OPT/navier-stokes/example_03K.cpp
new file mode 100644
index 000000000000..ac92a973e752
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/navier-stokes/example_03K.cpp
@@ -0,0 +1,366 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  example_03.cpp
+    \brief Shows how to solve the stochastic Navier-Stokes problem.
+*/
+
+#include "Teuchos_Comm.hpp"
+#ifdef HAVE_MPI
+#include "Teuchos_DefaultMpiComm.hpp"
+#endif
+#include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
+
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Version.hpp"
+
+#include <iostream>
+#include <algorithm>
+//#include <fenv.h>
+
+#include "ROL_Bounds.hpp"
+#include "ROL_Reduced_Objective_SimOpt.hpp"
+#include "ROL_MonteCarloGenerator.hpp"
+#include "ROL_StochasticProblem.hpp"
+#include "ROL_Solver.hpp"
+#include "ROL_TpetraTeuchosBatchManager.hpp"
+
+#include "../TOOLS/meshmanagerK.hpp"
+#include "../TOOLS/pdeconstraintK.hpp"
+#include "../TOOLS/pdeobjectiveK.hpp"
+#include "../TOOLS/pdevectorK.hpp"
+#include "../TOOLS/batchmanagerK.hpp"
+#include "pde_navier-stokesK.hpp"
+#include "obj_navier-stokesK.hpp"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+
+template<class Real>
+Real random(const Teuchos::Comm<int> &comm,
+            const Real a = -1, const Real b = 1) {
+  Real val(0), u(0);
+  if ( Teuchos::rank<int>(comm)==0 ) {
+    u   = static_cast<Real>(rand())/static_cast<Real>(RAND_MAX);
+    val = (b-a)*u + a;
+  }
+  Teuchos::broadcast<int,Real>(comm,0,1,&val);
+  return val;
+}
+
+template<class Real>
+void setUpAndSolve(const ROL::Ptr<ROL::Problem<Real>> &opt,
+                   Teuchos::ParameterList &parlist,
+                   std::ostream &outStream) {
+  parlist.sublist("Step").set("Type","Trust Region");
+  ROL::Solver<Real> solver(opt,parlist);
+  Teuchos::Time timer("Optimization Time", true);
+  solver.solve(outStream);
+  timer.stop();
+  outStream << "Total optimization time = " << timer.totalElapsedTime() << " seconds." << std::endl;
+}
+
+template<class Real>
+void print(ROL::Objective<Real> &obj,
+           const ROL::Vector<Real> &z,
+           ROL::SampleGenerator<Real> &sampler,
+           const int ngsamp,
+           const ROL::Ptr<const Teuchos::Comm<int>> &comm,
+           const std::string &filename) {
+  Real tol(1e-8);
+  // Build objective function distribution
+  int nsamp = sampler.numMySamples();
+  std::vector<Real> myvalues(nsamp), myzerovec(nsamp, 0);
+  std::vector<double> gvalues(ngsamp), gzerovec(ngsamp, 0);
+  std::vector<Real> sample = sampler.getMyPoint(0);
+  int sdim = sample.size();
+  std::vector<std::vector<Real>> mysamples(sdim, myzerovec);
+  std::vector<std::vector<double>> gsamples(sdim, gzerovec);
+  for (int i = 0; i < nsamp; ++i) {
+    sample = sampler.getMyPoint(i);
+    obj.setParameter(sample);
+    myvalues[i] = static_cast<double>(obj.value(z,tol));
+    for (int j = 0; j < sdim; ++j) {
+      mysamples[j][i] = static_cast<double>(sample[j]);
+    }
+  }
+
+  // Send data to root processor
+#ifdef HAVE_MPI
+  ROL::Ptr<const Teuchos::MpiComm<int>> mpicomm
+    = ROL::dynamicPtrCast<const Teuchos::MpiComm<int>>(comm);
+  int nproc = Teuchos::size<int>(*mpicomm);
+  std::vector<int> sampleCounts(nproc, 0), sampleDispls(nproc, 0);
+  MPI_Gather(&nsamp,1,MPI_INT,&sampleCounts[0],1,MPI_INT,0,*(mpicomm->getRawMpiComm())());
+  for (int i = 1; i < nproc; ++i) {
+    sampleDispls[i] = sampleDispls[i-1] + sampleCounts[i-1];
+  }
+  MPI_Gatherv(&myvalues[0],nsamp,MPI_DOUBLE,&gvalues[0],&sampleCounts[0],&sampleDispls[0],MPI_DOUBLE,0,*(mpicomm->getRawMpiComm())());
+  for (int j = 0; j < sdim; ++j) {
+    MPI_Gatherv(&mysamples[j][0],nsamp,MPI_DOUBLE,&gsamples[j][0],&sampleCounts[0],&sampleDispls[0],MPI_DOUBLE,0,*(mpicomm->getRawMpiComm())());
+  }
+#else
+  gvalues.assign(myvalues.begin(),myvalues.end());
+  for (int j = 0; j < sdim; ++j) {
+    gsamples[j].assign(mysamples[j].begin(),mysamples[j].end());
+  }
+#endif
+
+  // Print
+  int rank  = Teuchos::rank<int>(*comm);
+  if ( rank==0 ) {
+    std::ofstream file;
+    file.open(filename);
+    file << std::scientific << std::setprecision(15);
+    for (int i = 0; i < ngsamp; ++i) {
+      for (int j = 0; j < sdim; ++j) {
+        file << std::setw(25) << std::left << gsamples[j][i];
+      }
+      file << std::setw(25) << std::left << gvalues[i] << std::endl;
+    }
+    file.close();
+  }
+}
+
+int main(int argc, char *argv[]) {
+  //feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
+
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+
+  /*** Initialize communicator. ***/
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  Kokkos::ScopeGuard kokkosScope (argc, argv);
+  ROL::Ptr<const Teuchos::Comm<int>> comm
+    = Tpetra::getDefaultComm();
+  ROL::Ptr<const Teuchos::Comm<int>> serial_comm
+    = ROL::makePtr<Teuchos::SerialComm<int>>();
+  const int myRank = comm->getRank();
+  if ((iprint > 0) && (myRank == 0))
+    outStream = ROL::makePtrFromRef(std::cout);
+  else
+    outStream = ROL::makePtrFromRef(bhs);
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+
+    /*** Read in XML input ***/
+    std::string filename = "input_ex03.xml";
+    auto parlist = ROL::getParametersFromXmlFile(filename);
+
+    // Problem dimensions
+    const int stochDim = 2;
+    const RealT one(1); 
+
+    /*************************************************************************/
+    /***************** BUILD GOVERNING PDE ***********************************/
+    /*************************************************************************/
+    /*** Initialize main data structure. ***/
+    auto meshMgr = ROL::makePtr<MeshManager_BackwardFacingStepChannel<RealT,DeviceT>>(*parlist);
+    // Initialize PDE describing advection-diffusion equation
+    auto pde = ROL::makePtr<PDE_NavierStokes<RealT,DeviceT>>(*parlist);
+    con = ROL::makePtr<PDE_Constraint<RealT,DeviceT>>(pde,meshMgr,serial_comm,*parlist,*outStream);
+    // Cast the constraint and get the assembler.
+    auto assembler = pdecon->getAssembler();
+    assembler->printMeshData(*outStream);
+    con->setSolveParameters(*parlist);
+
+    /*************************************************************************/
+    /***************** BUILD VECTORS *****************************************/
+    /*************************************************************************/
+    auto  u_ptr = assembler->createStateVector();
+    auto  p_ptr = assembler->createStateVector();
+    auto du_ptr = assembler->createStateVector();
+    u_ptr->randomize();  //u_ptr->putScalar(static_cast<RealT>(1));
+    p_ptr->randomize();  //p_ptr->putScalar(static_cast<RealT>(1));
+    du_ptr->randomize(); //du_ptr->putScalar(static_cast<RealT>(0));
+    auto up  = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(u_ptr,pde,assembler,*parlist);
+    auto pp  = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(p_ptr,pde,assembler,*parlist);
+    auto dup = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(du_ptr,pde,assembler,*parlist);
+    // Create residual vectors
+    auto r_ptr = assembler->createResidualVector();
+    r_ptr->randomize(); //r_ptr->putScalar(static_cast<RealT>(1));
+    auto rp = ROL::makePtr<PDE_DualSimVector<RealT,DeviceT>>(r_ptr,pde,assembler,*parlist);
+    // Create control vector and set to ones
+    auto  z_ptr = assembler->createControlVector();
+    auto dz_ptr = assembler->createControlVector();
+    auto yz_ptr = assembler->createControlVector();
+    z_ptr->randomize();  z_ptr->putScalar(static_cast<RealT>(0));
+    dz_ptr->randomize(); //dz_ptr->putScalar(static_cast<RealT>(0));
+    yz_ptr->randomize(); //yz_ptr->putScalar(static_cast<RealT>(0));
+    ROL::Ptr<ROL::TpetraMultiVector<RealT>> zpde
+      = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(z_ptr,pde,assembler,*parlist);
+    ROL::Ptr<ROL::TpetraMultiVector<RealT>> dzpde
+      = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(dz_ptr,pde,assembler,*parlist);
+    ROL::Ptr<ROL::TpetraMultiVector<RealT>> yzpde
+      = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(yz_ptr,pde,assembler,*parlist);
+    auto zp  = ROL::makePtr<PDE_OptVector<RealT>>(zpde);
+    auto dzp = ROL::makePtr<PDE_OptVector<RealT>>(dzpde);
+    auto yzp = ROL::makePtr<PDE_OptVector<RealT>>(yzpde);
+    // Create ROL SimOpt vectors
+    ROL::Vector_SimOpt<RealT> x(up,zp);
+    ROL::Vector_SimOpt<RealT> d(dup,dzp);
+
+    /*************************************************************************/
+    /***************** BUILD COST FUNCTIONAL *********************************/
+    /*************************************************************************/
+    std::vector<ROL::Ptr<QoI<RealT,DeviceT>>> qoi_vec(2,ROL::nullPtr);
+    qoi_vec[0] = ROL::makePtr<QoI_State_NavierStokes<RealT,DeviceT>>(*parlist,
+                                                             pde->getVelocityFE(),
+                                                             pde->getPressureFE(),
+                                                             pde->getFieldHelper());
+    qoi_vec[1] = ROL::makePtr<QoI_L2Penalty_NavierStokes<RealT,DeviceT>>(pde->getVelocityFE(),
+                                                                 pde->getPressureFE(),
+                                                                 pde->getVelocityBdryFE(),
+                                                                 pde->getBdryCellLocIds(),
+                                                                 pde->getFieldHelper());
+    auto std_obj = ROL::makePtr<StdObjective_NavierStokes<RealT>>(*parlist);
+    auto obj = ROL::makePtr<PDE_Objective<RealT,DeviceT>>(qoi_vec,std_obj,assembler);
+    auto objState = ROL::makePtr<PDE_Objective<RealT,DeviceT>>(qoi_vec[0],assembler);
+    auto objCtrl = ROL::makePtr<PDE_Objective<RealT,DeviceT>>(qoi_vec[1],assembler);
+    auto objRed = ROL::makePtr<ROL::Reduced_Objective_SimOpt<RealT>>(obj, con, up, zp, pp, true, false);
+
+    /*************************************************************************/
+    /***************** BUILD BOUND CONSTRAINT ********************************/
+    /*************************************************************************/
+    auto zlo_ptr = assembler->createControlVector();
+    auto zhi_ptr = assembler->createControlVector();
+    zlo_ptr->putScalar(static_cast<RealT>(0));
+    zhi_ptr->putScalar(ROL::ROL_INF<RealT>());
+    ROL::Ptr<ROL::TpetraMultiVector<RealT>> zlopde
+      = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(zlo_ptr,pde,assembler,*parlist);
+    ROL::Ptr<ROL::TpetraMultiVector<RealT>> zhipde
+      = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(zhi_ptr,pde,assembler,*parlist);
+    auto zlop = ROL::makePtr<PDE_OptVector<RealT>>(zlopde);
+    auto zhip = ROL::makePtr<PDE_OptVector<RealT>>(zhipde);
+    auto bnd = ROL::makePtr<ROL::Bounds<RealT>>(zlop,zhip);
+    bool useBounds = parlist->sublist("Problem").get("Use bounds", false);
+    if (!useBounds) bnd->deactivate();
+
+    /*************************************************************************/
+    /***************** BUILD SAMPLER *****************************************/
+    /*************************************************************************/
+    int nsamp = parlist->sublist("Problem").get("Number of samples",100);
+    int nsamp_dist = parlist->sublist("Problem").get("Number of output samples",100);
+    std::vector<RealT> tmp = {-one,one};
+    std::vector<std::vector<RealT>> bounds(stochDim,tmp);
+    auto bman = ROL::makePtr<PDE_OptVector_BatchManager<RealT>>(comm);
+    auto sampler = ROL::makePtr<ROL::MonteCarloGenerator<RealT>>(nsamp,bounds,bman);
+    auto sampler_dist = ROL::makePtr<ROL::MonteCarloGenerator<RealT>>(nsamp_dist,bounds,bman);
+
+    /*************************************************************************/
+    /***************** BUILD STOCHASTIC PROBLEM ******************************/
+    /*************************************************************************/
+    ROL::Ptr<ROL::StochasticProblem<RealT>> opt;
+    std::vector<RealT> ctrl;
+    std::vector<RealT> var;
+
+    std::vector<RealT> alphaArray
+      = ROL::getArrayFromStringParameter<RealT>(parlist->sublist("Problem"),"Confidence Levels");
+    std::vector<RealT> alpha = alphaArray.toVector();
+    std::sort(alpha.begin(),alpha.end());
+    int N = alpha.size();
+
+    /*************************************************************************/
+    /***************** SOLVE RISK NEUTRAL ************************************/
+    /*************************************************************************/
+    RealT tol(1e-8);
+    bool alphaZero = (alpha[0] == static_cast<RealT>(0));
+    if ( alphaZero ) {
+      alpha.erase(alpha.begin()); --N;
+      // Solve.
+      parlist->sublist("SOL").set("Type","Risk Neutral");
+      opt = ROL::makePtr<ROL::StochasticProblem<RealT>>(objRed,zp);
+      opt->addBoundConstraint(bnd);
+      parlist->sublist("SOL").set("Initial Statistic",one);
+      opt->makeObjectiveStochastic(*parlist,sampler);
+      opt->finalize(false,true,*outStream);
+      setUpAndSolve<RealT>(opt,*parlist,*outStream);
+      // Output.
+      ctrl.push_back(objCtrl->value(*up,*zp,tol));
+      try                       { var.push_back(opt->getSolutionStatistic()); }
+      catch (std::exception &e) { var.push_back(0.0); }
+      std::string CtrlRN = "control_RN.txt";
+      con->outputTpetraVector(z_ptr,CtrlRN);
+      std::string ObjRN = "obj_samples_RN.txt";
+      print<RealT>(*objRed,*zp,*sampler_dist,nsamp_dist,comm,ObjRN);
+    }
+
+    /*************************************************************************/
+    /***************** SOLVE MEAN PLUS CVAR **********************************/
+    /*************************************************************************/
+    parlist->sublist("SOL").set("Type","Risk Averse");
+    parlist->sublist("SOL").sublist("Risk Measure").set("Name","CVaR");
+    parlist->sublist("SOL").sublist("Risk Measure").sublist("CVaR").set("Convex Combination Parameter",1.0);
+    parlist->sublist("SOL").sublist("Risk Measure").sublist("CVaR").set("Smoothing Parameter",1e-4);
+    parlist->sublist("SOL").sublist("Risk Measure").sublist("CVaR").sublist("Distribution").set("Name","Parabolic");
+    parlist->sublist("SOL").sublist("Risk Measure").sublist("CVaR").sublist("Distribution").sublist("Parabolic").set("Lower Bound",0.0);
+    parlist->sublist("SOL").sublist("Risk Measure").sublist("CVaR").sublist("Distribution").sublist("Parabolic").set("Upper Bound",1.0);
+    for (int i = 0; i < N; ++i) {
+      // Solve.
+      parlist->sublist("SOL").sublist("Risk Measure").sublist("CVaR").set("Confidence Level",alpha[i]);
+      opt = ROL::makePtr<ROL::StochasticProblem<RealT>>(objRed,zp);
+      opt->addBoundConstraint(bnd);
+      parlist->sublist("SOL").set("Initial Statistic",one);
+      opt->makeObjectiveStochastic(*parlist,sampler);
+      opt->finalize(false,true,*outStream);
+      setUpAndSolve<RealT>(opt,*parlist,*outStream);
+      // Output.
+      ctrl.push_back(objCtrl->value(*up,*zp,tol));
+      try                       { var.push_back(opt->getSolutionStatistic()); }
+      catch (std::exception &e) { var.push_back(0.0); }
+      std::stringstream nameCtrl;
+      nameCtrl << "control_CVaR_" << i+1 << ".txt";
+      con->outputTpetraVector(z_ptr,nameCtrl.str().c_str());
+      std::stringstream nameObj;
+      nameObj << "obj_samples_CVaR_" << i+1 << ".txt";
+      print<RealT>(*objRed,*zp,*sampler_dist,nsamp_dist,comm,nameObj.str());
+    }
+
+    /*************************************************************************/
+    /***************** PRINT CONTROL OBJ AND VAR *****************************/
+    /*************************************************************************/
+    const int rank = Teuchos::rank<int>(*comm);
+    if ( rank==0 ) {
+      std::stringstream nameCTRL, nameVAR;
+      nameCTRL << "ctrl.txt";
+      nameVAR << "var.txt";
+      std::ofstream fileCTRL, fileVAR;
+      fileCTRL.open(nameCTRL.str());
+      fileVAR.open(nameVAR.str());
+      fileCTRL << std::scientific << std::setprecision(15);
+      fileVAR << std::scientific << std::setprecision(15);
+      int size = var.size();
+      for (int i = 0; i < size; ++i) {
+        fileCTRL << std::setw(25) << std::left << ctrl[i] << std::endl;
+        fileVAR << std::setw(25) << std::left << var[i] << std::endl;
+      }
+      fileCTRL.close();
+      fileVAR.close();
+    }
+
+    // Get a summary from the time monitor.
+    Teuchos::TimeMonitor::summarize();
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/navier-stokes/example_04.cpp b/packages/rol/example/PDE-OPT/navier-stokes/example_04.cpp
index c2106fef4615..a7c03cd89432 100644
--- a/packages/rol/example/PDE-OPT/navier-stokes/example_04.cpp
+++ b/packages/rol/example/PDE-OPT/navier-stokes/example_04.cpp
@@ -16,8 +16,7 @@
 #include "Teuchos_DefaultMpiComm.hpp"
 #endif
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -126,7 +125,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int>> comm
     = Tpetra::getDefaultComm();
   ROL::Ptr<const Teuchos::Comm<int>> serial_comm
diff --git a/packages/rol/example/PDE-OPT/navier-stokes/example_04K.cpp b/packages/rol/example/PDE-OPT/navier-stokes/example_04K.cpp
new file mode 100644
index 000000000000..a7c03cd89432
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/navier-stokes/example_04K.cpp
@@ -0,0 +1,347 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  example_04.cpp
+    \brief Shows how to solve the stochastic Navier-Stokes problem.
+*/
+
+#include "Teuchos_Comm.hpp"
+#ifdef HAVE_MPI
+#include "Teuchos_DefaultMpiComm.hpp"
+#endif
+#include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
+
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Version.hpp"
+
+#include <iostream>
+#include <algorithm>
+//#include <fenv.h>
+
+#include "ROL_Bounds.hpp"
+#include "ROL_Reduced_Objective_SimOpt.hpp"
+#include "ROL_MonteCarloGenerator.hpp"
+#include "ROL_StochasticProblem.hpp"
+#include "ROL_Solver.hpp"
+#include "ROL_TpetraTeuchosBatchManager.hpp"
+
+#include "../TOOLS/meshmanager.hpp"
+#include "../TOOLS/pdeconstraint.hpp"
+#include "../TOOLS/pdeobjective.hpp"
+#include "../TOOLS/pdevector.hpp"
+#include "../TOOLS/batchmanager.hpp"
+#include "pde_navier-stokes.hpp"
+#include "obj_navier-stokes.hpp"
+
+typedef double RealT;
+
+template<class Real>
+void setUpAndSolve(const ROL::Ptr<ROL::Problem<Real>> &opt,
+                   Teuchos::ParameterList &parlist,
+                   std::ostream &outStream) {
+  parlist.sublist("Step").set("Type","Trust Region");
+  ROL::Solver<Real> solver(opt,parlist);
+  Teuchos::Time timer("Optimization Time", true);
+  solver.solve(outStream);
+  timer.stop();
+  outStream << "Total optimization time = " << timer.totalElapsedTime() << " seconds." << std::endl;
+}
+
+template<class Real>
+void print(ROL::Objective<Real> &obj,
+           const ROL::Vector<Real> &z,
+           ROL::SampleGenerator<Real> &sampler,
+           const int ngsamp,
+           const ROL::Ptr<const Teuchos::Comm<int>> &comm,
+           const std::string &filename) {
+  Real tol(1e-8);
+  // Build objective function distribution
+  int nsamp = sampler.numMySamples();
+  std::vector<Real> myvalues(nsamp), myzerovec(nsamp, 0);
+  std::vector<double> gvalues(ngsamp), gzerovec(ngsamp, 0);
+  std::vector<Real> sample = sampler.getMyPoint(0);
+  int sdim = sample.size();
+  std::vector<std::vector<Real>> mysamples(sdim, myzerovec);
+  std::vector<std::vector<double>> gsamples(sdim, gzerovec);
+  for (int i = 0; i < nsamp; ++i) {
+    sample = sampler.getMyPoint(i);
+    obj.setParameter(sample);
+    myvalues[i] = static_cast<double>(obj.value(z,tol));
+    for (int j = 0; j < sdim; ++j) {
+      mysamples[j][i] = static_cast<double>(sample[j]);
+    }
+  }
+
+  // Send data to root processor
+#ifdef HAVE_MPI
+  ROL::Ptr<const Teuchos::MpiComm<int>> mpicomm
+    = ROL::dynamicPtrCast<const Teuchos::MpiComm<int>>(comm);
+  int nproc = Teuchos::size<int>(*mpicomm);
+  std::vector<int> sampleCounts(nproc, 0), sampleDispls(nproc, 0);
+  MPI_Gather(&nsamp,1,MPI_INT,&sampleCounts[0],1,MPI_INT,0,*(mpicomm->getRawMpiComm())());
+  for (int i = 1; i < nproc; ++i) {
+    sampleDispls[i] = sampleDispls[i-1] + sampleCounts[i-1];
+  }
+  MPI_Gatherv(&myvalues[0],nsamp,MPI_DOUBLE,&gvalues[0],&sampleCounts[0],&sampleDispls[0],MPI_DOUBLE,0,*(mpicomm->getRawMpiComm())());
+  for (int j = 0; j < sdim; ++j) {
+    MPI_Gatherv(&mysamples[j][0],nsamp,MPI_DOUBLE,&gsamples[j][0],&sampleCounts[0],&sampleDispls[0],MPI_DOUBLE,0,*(mpicomm->getRawMpiComm())());
+  }
+#else
+  gvalues.assign(myvalues.begin(),myvalues.end());
+  for (int j = 0; j < sdim; ++j) {
+    gsamples[j].assign(mysamples[j].begin(),mysamples[j].end());
+  }
+#endif
+
+  // Print
+  int rank  = Teuchos::rank<int>(*comm);
+  if ( rank==0 ) {
+    std::ofstream file;
+    file.open(filename);
+    file << std::scientific << std::setprecision(15);
+    for (int i = 0; i < ngsamp; ++i) {
+      for (int j = 0; j < sdim; ++j) {
+        file << std::setw(25) << std::left << gsamples[j][i];
+      }
+      file << std::setw(25) << std::left << gvalues[i] << std::endl;
+    }
+    file.close();
+  }
+}
+
+int main(int argc, char *argv[]) {
+  //feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
+
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+
+  /*** Initialize communicator. ***/
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::Ptr<const Teuchos::Comm<int>> comm
+    = Tpetra::getDefaultComm();
+  ROL::Ptr<const Teuchos::Comm<int>> serial_comm
+    = ROL::makePtr<Teuchos::SerialComm<int>>();
+  const int myRank = comm->getRank();
+  if ((iprint > 0) && (myRank == 0)) {
+    outStream = ROL::makePtrFromRef(std::cout);
+  }
+  else {
+    outStream = ROL::makePtrFromRef(bhs);
+  }
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+
+    /*** Read in XML input ***/
+    std::string filename = "input_ex04.xml";
+    Teuchos::RCP<Teuchos::ParameterList> parlist = Teuchos::rcp( new Teuchos::ParameterList() );
+    Teuchos::updateParametersFromXmlFile( filename, parlist.ptr() );
+
+    // Problem dimensions
+    const int stochDim = 2;
+    const RealT one(1); 
+
+    /*************************************************************************/
+    /***************** BUILD GOVERNING PDE ***********************************/
+    /*************************************************************************/
+    /*** Initialize main data structure. ***/
+    ROL::Ptr<MeshManager<RealT>> meshMgr
+      = ROL::makePtr<MeshManager_BackwardFacingStepChannel<RealT>>(*parlist);
+    // Initialize PDE describing advection-diffusion equation
+    ROL::Ptr<PDE_NavierStokes<RealT>> pde
+      = ROL::makePtr<PDE_NavierStokes<RealT>>(*parlist);
+    ROL::Ptr<ROL::Constraint_SimOpt<RealT>> con
+      = ROL::makePtr<PDE_Constraint<RealT>>(pde,meshMgr,serial_comm,*parlist,*outStream);
+    // Cast the constraint and get the assembler.
+    ROL::Ptr<PDE_Constraint<RealT>> pdecon
+      = ROL::dynamicPtrCast<PDE_Constraint<RealT>>(con);
+    ROL::Ptr<Assembler<RealT>> assembler = pdecon->getAssembler();
+    assembler->printMeshData(*outStream);
+    con->setSolveParameters(*parlist);
+
+    /*************************************************************************/
+    /***************** BUILD VECTORS *****************************************/
+    /*************************************************************************/
+    ROL::Ptr<Tpetra::MultiVector<>>  u_ptr = assembler->createStateVector();
+    ROL::Ptr<Tpetra::MultiVector<>>  p_ptr = assembler->createStateVector();
+    ROL::Ptr<Tpetra::MultiVector<>> du_ptr = assembler->createStateVector();
+    u_ptr->randomize();  //u_ptr->putScalar(static_cast<RealT>(1));
+    p_ptr->randomize();  //p_ptr->putScalar(static_cast<RealT>(1));
+    du_ptr->randomize(); //du_ptr->putScalar(static_cast<RealT>(0));
+    ROL::Ptr<ROL::Vector<RealT>> up
+      = ROL::makePtr<PDE_PrimalSimVector<RealT>>(u_ptr,pde,assembler,*parlist);
+    ROL::Ptr<ROL::Vector<RealT>> pp
+      = ROL::makePtr<PDE_PrimalSimVector<RealT>>(p_ptr,pde,assembler,*parlist);
+    ROL::Ptr<ROL::Vector<RealT>> dup
+      = ROL::makePtr<PDE_PrimalSimVector<RealT>>(du_ptr,pde,assembler,*parlist);
+    // Create residual vectors
+    ROL::Ptr<Tpetra::MultiVector<>> r_ptr = assembler->createResidualVector();
+    r_ptr->randomize(); //r_ptr->putScalar(static_cast<RealT>(1));
+    ROL::Ptr<ROL::Vector<RealT>> rp
+      = ROL::makePtr<PDE_DualSimVector<RealT>>(r_ptr,pde,assembler,*parlist);
+    // Create control vector and set to ones
+    ROL::Ptr<Tpetra::MultiVector<>>  z_ptr = assembler->createControlVector();
+    ROL::Ptr<Tpetra::MultiVector<>> dz_ptr = assembler->createControlVector();
+    ROL::Ptr<Tpetra::MultiVector<>> yz_ptr = assembler->createControlVector();
+    z_ptr->randomize();  z_ptr->putScalar(static_cast<RealT>(0));
+    dz_ptr->randomize(); //dz_ptr->putScalar(static_cast<RealT>(0));
+    yz_ptr->randomize(); //yz_ptr->putScalar(static_cast<RealT>(0));
+    ROL::Ptr<ROL::TpetraMultiVector<RealT>> zpde
+      = ROL::makePtr<PDE_PrimalOptVector<RealT>>(z_ptr,pde,assembler,*parlist);
+    ROL::Ptr<ROL::TpetraMultiVector<RealT>> dzpde
+      = ROL::makePtr<PDE_PrimalOptVector<RealT>>(dz_ptr,pde,assembler,*parlist);
+    ROL::Ptr<ROL::TpetraMultiVector<RealT>> yzpde
+      = ROL::makePtr<PDE_PrimalOptVector<RealT>>(yz_ptr,pde,assembler,*parlist);
+    ROL::Ptr<ROL::Vector<RealT>> zp  = ROL::makePtr<PDE_OptVector<RealT>>(zpde);
+    ROL::Ptr<ROL::Vector<RealT>> dzp = ROL::makePtr<PDE_OptVector<RealT>>(dzpde);
+    ROL::Ptr<ROL::Vector<RealT>> yzp = ROL::makePtr<PDE_OptVector<RealT>>(yzpde);
+    // Create ROL SimOpt vectors
+    ROL::Vector_SimOpt<RealT> x(up,zp);
+    ROL::Vector_SimOpt<RealT> d(dup,dzp);
+
+    /*************************************************************************/
+    /***************** BUILD COST FUNCTIONAL *********************************/
+    /*************************************************************************/
+    std::vector<ROL::Ptr<QoI<RealT>>> qoi_vec(2,ROL::nullPtr);
+    qoi_vec[0] = ROL::makePtr<QoI_State_NavierStokes<RealT>>(*parlist,
+                                                                pde->getVelocityFE(),
+                                                                pde->getPressureFE(),
+                                                                pde->getFieldHelper());
+    qoi_vec[1] = ROL::makePtr<QoI_L2Penalty_NavierStokes<RealT>>(pde->getVelocityFE(),
+                                                                    pde->getPressureFE(),
+                                                                    pde->getVelocityBdryFE(),
+                                                                    pde->getBdryCellLocIds(),
+                                                                    pde->getFieldHelper());
+    ROL::Ptr<StdObjective_NavierStokes<RealT>> std_obj
+      = ROL::makePtr<StdObjective_NavierStokes<RealT>>(*parlist);
+    ROL::Ptr<ROL::Objective_SimOpt<RealT>> obj
+      = ROL::makePtr<PDE_Objective<RealT>>(qoi_vec,std_obj,assembler);
+    ROL::Ptr<ROL::Objective_SimOpt<RealT>> objState
+      = ROL::makePtr<PDE_Objective<RealT>>(qoi_vec[0],assembler);
+    ROL::Ptr<ROL::Objective_SimOpt<RealT>> objCtrl
+      = ROL::makePtr<PDE_Objective<RealT>>(qoi_vec[1],assembler);
+    ROL::Ptr<ROL::Reduced_Objective_SimOpt<RealT>> objRed
+      = ROL::makePtr<ROL::Reduced_Objective_SimOpt<RealT>>(obj, con, up, zp, pp, true, false);
+
+    /*************************************************************************/
+    /***************** BUILD BOUND CONSTRAINT ********************************/
+    /*************************************************************************/
+    ROL::Ptr<Tpetra::MultiVector<>>  zlo_ptr = assembler->createControlVector();
+    ROL::Ptr<Tpetra::MultiVector<>>  zhi_ptr = assembler->createControlVector();
+    zlo_ptr->putScalar(static_cast<RealT>(0));
+    zhi_ptr->putScalar(ROL::ROL_INF<RealT>());
+    ROL::Ptr<ROL::TpetraMultiVector<RealT>> zlopde
+      = ROL::makePtr<PDE_PrimalOptVector<RealT>>(zlo_ptr,pde,assembler,*parlist);
+    ROL::Ptr<ROL::TpetraMultiVector<RealT>> zhipde
+      = ROL::makePtr<PDE_PrimalOptVector<RealT>>(zhi_ptr,pde,assembler,*parlist);
+    ROL::Ptr<ROL::Vector<RealT>> zlop = ROL::makePtr<PDE_OptVector<RealT>>(zlopde);
+    ROL::Ptr<ROL::Vector<RealT>> zhip = ROL::makePtr<PDE_OptVector<RealT>>(zhipde);
+    ROL::Ptr<ROL::BoundConstraint<RealT>> bnd
+      = ROL::makePtr<ROL::Bounds<RealT>>(zlop,zhip);
+    bool useBounds = parlist->sublist("Problem").get("Use bounds", false);
+    if (!useBounds) bnd->deactivate();
+
+    /*************************************************************************/
+    /***************** BUILD SAMPLER *****************************************/
+    /*************************************************************************/
+    int nsamp = parlist->sublist("Problem").get("Number of samples",100);
+    int nsamp_dist = parlist->sublist("Problem").get("Number of output samples",100);
+    std::vector<RealT> tmp = {-one,one};
+    std::vector<std::vector<RealT>> bounds(stochDim,tmp);
+    ROL::Ptr<ROL::BatchManager<RealT>> bman
+      = ROL::makePtr<PDE_OptVector_BatchManager<RealT>>(comm);
+    ROL::Ptr<ROL::SampleGenerator<RealT>> sampler
+      = ROL::makePtr<ROL::MonteCarloGenerator<RealT>>(nsamp,bounds,bman);
+    ROL::Ptr<ROL::SampleGenerator<RealT>> sampler_dist
+      = ROL::makePtr<ROL::MonteCarloGenerator<RealT>>(nsamp_dist,bounds,bman);
+
+    /*************************************************************************/
+    /***************** BUILD STOCHASTIC PROBLEM ******************************/
+    /*************************************************************************/
+    ROL::Ptr<ROL::StochasticProblem<RealT>> opt;
+    std::vector<RealT> ctrl;
+    std::vector<RealT> var;
+
+    Teuchos::Array<RealT> alphaArray
+      = Teuchos::getArrayFromStringParameter<RealT>(parlist->sublist("Problem"),"bPOE Thresholds");
+    std::vector<RealT> alpha = alphaArray.toVector();
+    std::sort(alpha.begin(),alpha.end());
+    int N = alpha.size();
+
+    /*************************************************************************/
+    /***************** SOLVE MEAN PLUS CVAR **********************************/
+    /*************************************************************************/
+    RealT tol(1e-8);
+    parlist->sublist("SOL").set("Type","Risk Averse");
+    parlist->sublist("SOL").sublist("Risk Measure").set("Name","bPOE");
+    parlist->sublist("SOL").sublist("Risk Measure").sublist("bPOE").set("Moment Order",2.0);
+    for (int i = 0; i < N; ++i) {
+      // Solve.
+      parlist->sublist("SOL").sublist("Risk Measure").sublist("bPOE").set("Threshold",alpha[i]);
+      opt = ROL::makePtr<ROL::StochasticProblem<RealT>>(objRed,zp);
+      opt->addBoundConstraint(bnd);
+      RealT stat(1);
+      if ( i > 0 ) stat = var[i];
+      parlist->sublist("SOL").set("Initial Statistic",stat);
+      opt->makeObjectiveStochastic(*parlist,sampler);
+      opt->finalize(false,true,*outStream);
+      setUpAndSolve<RealT>(opt,*parlist,*outStream);
+      // Output.
+      ctrl.push_back(objCtrl->value(*up,*zp,tol));
+      try                       { var.push_back(opt->getSolutionStatistic()); }
+      catch (std::exception &e) { var.push_back(0.0); }
+      std::stringstream nameCtrl;
+      nameCtrl << "control_bPOE_" << i+1 << ".txt";
+      pdecon->outputTpetraVector(z_ptr,nameCtrl.str().c_str());
+      std::stringstream nameObj;
+      nameObj << "obj_samples_bPOE_" << i+1 << ".txt";
+      print<RealT>(*objRed,*zp,*sampler_dist,nsamp_dist,comm,nameObj.str());
+    }
+
+    /*************************************************************************/
+    /***************** PRINT CONTROL OBJ AND VAR *****************************/
+    /*************************************************************************/
+    const int rank = Teuchos::rank<int>(*comm);
+    if ( rank==0 ) {
+      std::stringstream nameCTRL, nameVAR;
+      nameCTRL << "ctrl.txt";
+      nameVAR << "var.txt";
+      std::ofstream fileCTRL, fileVAR;
+      fileCTRL.open(nameCTRL.str());
+      fileVAR.open(nameVAR.str());
+      fileCTRL << std::scientific << std::setprecision(15);
+      fileVAR << std::scientific << std::setprecision(15);
+      int size = var.size();
+      for (int i = 0; i < size; ++i) {
+        fileCTRL << std::setw(25) << std::left << ctrl[i] << std::endl;
+        fileVAR << std::setw(25) << std::left << var[i] << std::endl;
+      }
+      fileCTRL.close();
+      fileVAR.close();
+    }
+
+    // Get a summary from the time monitor.
+    Teuchos::TimeMonitor::summarize();
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/navier-stokes/example_05.cpp b/packages/rol/example/PDE-OPT/navier-stokes/example_05.cpp
index e16bd1074e26..a87414f3a5ab 100644
--- a/packages/rol/example/PDE-OPT/navier-stokes/example_05.cpp
+++ b/packages/rol/example/PDE-OPT/navier-stokes/example_05.cpp
@@ -13,8 +13,7 @@
 
 #include "Teuchos_Comm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -59,7 +58,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int> > comm
     = Tpetra::getDefaultComm();
   ROL::Ptr<const Teuchos::Comm<int> > serial_comm
diff --git a/packages/rol/example/PDE-OPT/navier-stokes/example_05K.cpp b/packages/rol/example/PDE-OPT/navier-stokes/example_05K.cpp
new file mode 100644
index 000000000000..3561110335c2
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/navier-stokes/example_05K.cpp
@@ -0,0 +1,234 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  example_01.cpp
+    \brief Shows how to solve the stochastic Stefan-Boltzmann problem.
+*/
+
+#include "Teuchos_Comm.hpp"
+#include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
+
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Version.hpp"
+
+#include <iostream>
+#include <algorithm>
+//#include <fenv.h>
+
+#include "ROL_Algorithm.hpp"
+#include "ROL_Bounds.hpp"
+#include "ROL_Reduced_Objective_SimOpt.hpp"
+#include "ROL_Solver.hpp"
+#include "ROL_BoundConstraint_SimOpt.hpp"
+
+#include "../TOOLS/meshmanagerK.hpp"
+#include "../TOOLS/pdeconstraintK.hpp"
+#include "../TOOLS/pdeobjectiveK.hpp"
+#include "../TOOLS/pdevectorK.hpp"
+#include "pde_navier-stokesK.hpp"
+#include "obj_navier-stokesK.hpp"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+
+template<class Real>
+Real random(const Teuchos::Comm<int> &comm,
+            const Real a = -1, const Real b = 1) {
+  Real val(0), u(0);
+  if ( Teuchos::rank<int>(comm)==0 ) {
+    u   = static_cast<Real>(rand())/static_cast<Real>(RAND_MAX);
+    val = (b-a)*u + a;
+  }
+  Teuchos::broadcast<int,Real>(comm,0,1,&val);
+  return val;
+}
+
+int main(int argc, char *argv[]) {
+  //feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
+
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+
+  /*** Initialize communicator. ***/
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  Kokkos::ScopeGuard kokkosScope (argc, argv);
+  ROL::Ptr<const Teuchos::Comm<int> > comm
+    = Tpetra::getDefaultComm();
+  ROL::Ptr<const Teuchos::Comm<int> > serial_comm
+    = ROL::makePtr<Teuchos::SerialComm<int>>();
+  const int myRank = comm->getRank();
+  if ((iprint > 0) && (myRank == 0))
+    outStream = ROL::makePtrFromRef(std::cout);
+  else
+    outStream = ROL::makePtrFromRef(bhs);
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+
+    /*** Read in XML input ***/
+    std::string filename = "input_ex05.xml";
+    auto parlist = ROL::getParametersFromXmlFile(filename);
+
+    // Problem dimensions
+    //const int stochDim = 2;
+    //const RealT one(1); 
+
+    /*************************************************************************/
+    /***************** BUILD GOVERNING PDE ***********************************/
+    /*************************************************************************/
+    /*** Initialize main data structure. ***/
+    auto meshMgr = ROL::makePtr<MeshManager_BackwardFacingStepChannel<RealT,DeviceT>>(*parlist);
+    // Initialize PDE describing advection-diffusion equation
+    auto pde = ROL::makePtr<PDE_NavierStokes<RealT,DeviceT>>(*parlist);
+    auto con = ROL::makePtr<PDE_Constraint<RealT,DeviceT>>(pde,meshMgr,serial_comm,*parlist,*outStream);
+    // Cast the constraint and get the assembler.
+    auto assembler = con->getAssembler();
+    con->setSolveParameters(*parlist);
+
+    /*************************************************************************/
+    /***************** BUILD VECTORS *****************************************/
+    /*************************************************************************/
+    auto  u_ptr = assembler->createStateVector();
+    auto  p_ptr = assembler->createStateVector();
+    auto du_ptr = assembler->createStateVector();
+    u_ptr->randomize();  //u_ptr->putScalar(static_cast<RealT>(1));
+    p_ptr->randomize();  //p_ptr->putScalar(static_cast<RealT>(1));
+    du_ptr->randomize(); //du_ptr->putScalar(static_cast<RealT>(0));
+    auto up  = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(u_ptr,pde,assembler,*parlist);
+    auto pp  = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(p_ptr,pde,assembler,*parlist);
+    auto dup = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(du_ptr,pde,assembler,*parlist);
+    // Create residual vectors
+    auto r_ptr = assembler->createResidualVector();
+    r_ptr->randomize(); //r_ptr->putScalar(static_cast<RealT>(1));
+    auto rp = ROL::makePtr<PDE_DualSimVector<RealT,DeviceT>>(r_ptr,pde,assembler,*parlist);
+    // Create control vector and set to ones
+    auto  z_ptr = assembler->createControlVector();
+    auto dz_ptr = assembler->createControlVector();
+    auto yz_ptr = assembler->createControlVector();
+    z_ptr->randomize();  z_ptr->putScalar(static_cast<RealT>(0));
+    dz_ptr->randomize(); //dz_ptr->putScalar(static_cast<RealT>(0));
+    yz_ptr->randomize(); //yz_ptr->putScalar(static_cast<RealT>(0));
+    ROL::Ptr<ROL::TpetraMultiVector<RealT>> zpde
+      = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(z_ptr,pde,assembler,*parlist);
+    ROL::Ptr<ROL::TpetraMultiVector<RealT>> dzpde
+      = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(dz_ptr,pde,assembler,*parlist);
+    ROL::Ptr<ROL::TpetraMultiVector<RealT>> yzpde
+      = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(yz_ptr,pde,assembler,*parlist);
+    auto zp  = ROL::makePtr<PDE_OptVector<RealT>>(zpde);
+    auto dzp = ROL::makePtr<PDE_OptVector<RealT>>(dzpde);
+    auto yzp = ROL::makePtr<PDE_OptVector<RealT>>(yzpde);
+    // Create ROL SimOpt vectors
+    ROL::Vector_SimOpt<RealT> x(up,zp);
+    ROL::Vector_SimOpt<RealT> d(dup,dzp);
+
+    /*************************************************************************/
+    /***************** BUILD COST FUNCTIONAL *********************************/
+    /*************************************************************************/
+    std::vector<ROL::Ptr<QoI<RealT,DeviceT>>> qoi_vec(2,ROL::nullPtr);
+    qoi_vec[0] = ROL::makePtr<QoI_State_NavierStokes<RealT,DeviceT>>(*parlist,
+                                                                pde->getVelocityFE(),
+                                                                pde->getPressureFE(),
+                                                                pde->getFieldHelper());
+    qoi_vec[1] = ROL::makePtr<QoI_L2Penalty_NavierStokes<RealT,DeviceT>>(pde->getVelocityFE(),
+                                                                    pde->getPressureFE(),
+                                                                    pde->getVelocityBdryFE(),
+                                                                    pde->getBdryCellLocIds(),
+                                                                    pde->getFieldHelper());
+    auto std_obj = ROL::makePtr<StdObjective_NavierStokes<RealT>>(*parlist);
+    auto obj = ROL::makePtr<PDE_Objective<RealT,DeviceT>>(qoi_vec,std_obj,assembler);
+
+    /*************************************************************************/
+    /***************** BUILD BOUND CONSTRAINT ********************************/
+    /*************************************************************************/
+    auto zlo_ptr = assembler->createControlVector();
+    auto zhi_ptr = assembler->createControlVector();
+    zlo_ptr->putScalar(static_cast<RealT>(0));
+    zhi_ptr->putScalar(ROL::ROL_INF<RealT>());
+    ROL::Ptr<ROL::TpetraMultiVector<RealT>> zlopde
+      = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(zlo_ptr,pde,assembler,*parlist);
+    ROL::Ptr<ROL::TpetraMultiVector<RealT>> zhipde
+      = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(zhi_ptr,pde,assembler,*parlist);
+    auto zlop = ROL::makePtr<PDE_OptVector<RealT>>(zlopde);
+    auto zhip = ROL::makePtr<PDE_OptVector<RealT>>(zhipde);
+    auto zbnd = ROL::makePtr<ROL::Bounds<RealT>>(zlop,zhip);
+    bool useBounds = parlist->sublist("Problem").get("Use bounds", false);
+    if (!useBounds) zbnd->deactivate();
+    // State bounds
+    auto ulo_ptr = assembler->createStateVector();
+    auto uhi_ptr = assembler->createStateVector();
+    ulo_ptr->putScalar(ROL::ROL_NINF<RealT>()); uhi_ptr->putScalar(ROL::ROL_INF<RealT>());
+    auto ulop = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(ulo_ptr,pde,assembler);
+    auto uhip = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(uhi_ptr,pde,assembler);
+    auto ubnd = ROL::makePtr<ROL::Bounds<RealT>>(ulop,uhip);
+    ubnd->deactivate();
+
+    // SimOpt bounds
+    auto bnd = ROL::makePtr<ROL::BoundConstraint_SimOpt<RealT>>(ubnd,zbnd);
+
+
+    /*************************************************************************/
+    /***************** RUN VECTOR AND DERIVATIVE CHECKS **********************/
+    /*************************************************************************/
+    bool checkDeriv = parlist->sublist("Problem").get("Check derivatives",false);
+    if ( checkDeriv ) {
+      up->checkVector(*pp,*dup,true,*outStream);
+      zp->checkVector(*yzp,*dzp,true,*outStream);
+      obj->checkGradient(x,d,true,*outStream);
+      obj->checkHessVec(x,d,true,*outStream);
+      con->checkApplyJacobian(x,d,*up,true,*outStream);
+      con->checkApplyAdjointHessian(x,*dup,d,x,true,*outStream);
+      con->checkAdjointConsistencyJacobian(*dup,d,x,true,*outStream);
+      con->checkInverseJacobian_1(*up,*up,*up,*zp,true,*outStream);
+      con->checkInverseAdjointJacobian_1(*up,*up,*up,*zp,true,*outStream);
+    }
+
+    /*************************************************************************/
+    /***************** SOLVE OPTIMIZATION PROBLEM ****************************/
+    /*************************************************************************/
+    RealT tol(1.e-8);
+    up->setScalar(RealT(1));
+    zp->randomize();//->setScalar(RealT(1));
+    con->solve(*rp,*up,*zp,tol);    
+    auto optProb = ROL::makePtr<ROL::Problem<RealT>>(obj, makePtrFromRef(x));
+    optProb->addBoundConstraint(bnd);
+    optProb->addConstraint("PDE", con, pp);
+    optProb->finalize(false,true,*outStream);
+    ROL::Solver<RealT> optSolver(optProb, *parlist);
+    optSolver.solve(*outStream);
+    std::clock_t timer = std::clock();
+    *outStream << "Optimization time: "
+               << static_cast<RealT>(std::clock()-timer)/static_cast<RealT>(CLOCKS_PER_SEC)
+               << " seconds." << std::endl << std::endl;
+
+    Teuchos::Array<RealT> res(1,0);
+    con->solve(*rp,*up,*zp,tol);
+    r_ptr->norm2(res.view(0,1));
+
+    /*************************************************************************/
+    /***************** CHECK RESIDUAL NORM ***********************************/
+    /*************************************************************************/
+    *outStream << "Residual Norm: " << res[0] << std::endl << std::endl;
+    errorFlag += (res[0] > 1.e-6 ? 1 : 0);
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/navier-stokes/input.xml b/packages/rol/example/PDE-OPT/navier-stokes/input.xml
index 7e54d3410483..300af77e3675 100644
--- a/packages/rol/example/PDE-OPT/navier-stokes/input.xml
+++ b/packages/rol/example/PDE-OPT/navier-stokes/input.xml
@@ -28,7 +28,7 @@
     <Parameter name="Step width"         type="double"  value="1.0" />
     <Parameter name="Step height"        type="double"  value="0.5" />
     <Parameter name="Observation width"  type="double"  value="3.0" />
-    <Parameter name="Refinement level"   type="int"     value="2"   />
+    <Parameter name="Refinement level"   type="int"     value="1"   />
     <Parameter name="Partition type"     type="int"     value="1"    />
   </ParameterList>
 
@@ -152,7 +152,7 @@
   <!-- ===========  STEP SUBLIST  =========== -->
   <ParameterList name="Step">
     <Parameter name="Type" type="string" value="Fletcher"/>
-    
+
     <!-- ===========  LINE SEARCH  =========== -->
     <ParameterList name="Line Search">
       <Parameter name="Function Evaluation Limit"      type="int"    value="20"    />
@@ -164,7 +164,7 @@
 
       <!-- ===========  DESCENT ALGORITHM SPECIFICATION  =========== -->
       <ParameterList name="Descent Method">
-        <Parameter name="Type"              type="string" value="Newton-Krylov"    /> 
+        <Parameter name="Type"              type="string" value="Newton-Krylov"    />
         <Parameter name="Nonlinear CG Type" type="string" value="Hestenes-Stiefel" />
       </ParameterList>
 
diff --git a/packages/rol/example/PDE-OPT/navier-stokes/obj_navier-stokesK.hpp b/packages/rol/example/PDE-OPT/navier-stokes/obj_navier-stokesK.hpp
new file mode 100644
index 000000000000..246f0a1c9389
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/navier-stokes/obj_navier-stokesK.hpp
@@ -0,0 +1,918 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  obj.hpp
+    \brief Provides the interface for local (cell-based) objective function computations.
+*/
+
+#ifndef PDEOPT_QOI_NAVIERSTOKESK_HPP
+#define PDEOPT_QOI_NAVIERSTOKESK_HPP
+
+#include "../TOOLS/qoiK.hpp"
+#include "pde_navier-stokesK.hpp"
+
+template <class Real, class DeviceType>
+class QoI_Vorticity_NavierStokes : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type> feVel_, fePrs_;
+  const ROL::Ptr<FieldHelper<Real,DeviceType>> fieldHelper_;
+  scalar_view weight_;
+
+  const Real eps_;
+
+  Real weightFunc(const std::vector<Real> & x) const {
+    return (((x[1] <= 0.5+eps_)&&(x[0]>= 1.0-eps_)&&(x[0] <= 4.0+eps_)) ?
+                static_cast<Real>(1) : static_cast<Real>(0));
+  }
+
+public:
+  QoI_Vorticity_NavierStokes(const ROL::Ptr<fe_type> &feVel,
+                             const ROL::Ptr<fe_type> &fePrs,
+                             const ROL::Ptr<FieldHelper<Real,DeviceType>> &fieldHelper)
+    : feVel_(feVel), fePrs_(fePrs), fieldHelper_(fieldHelper), eps_(std::sqrt(ROL::ROL_EPSILON<Real>())) {
+    int c = feVel_->cubPts().extent_int(0);
+    int p = feVel_->cubPts().extent_int(1);
+    int d = feVel_->cubPts().extent_int(2);
+    std::vector<Real> pt(d);
+    weight_ = scalar_view("weight",c,p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k)
+          pt[k] = (feVel_->cubPts())(i,j,k);
+        weight_(i,j) = weightFunc(pt);
+      }
+    }
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = u_coeff.extent_int(0);
+    int p = feVel_->cubPts().extent_int(1);
+    int d = feVel_->cubPts().extent_int(2);
+    // Initialize output val
+    val = scalar_view("val", c);
+    // Get components of the control
+    std::vector<scalar_view> U;
+    fieldHelper_->splitFieldCoeff(U, u_coeff);
+    // Evaluate on FE basis
+    scalar_view gradUX_eval("gradUX_eval", c, p, d);
+    scalar_view gradUY_eval("gradUY_eval", c, p, d);
+    feVel_->evaluateGradient(gradUX_eval, U[0]);
+    feVel_->evaluateGradient(gradUY_eval, U[1]);
+    // Compute curl
+    scalar_view curlU_eval("curlU_eval", c, p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j)
+        curlU_eval(i,j)   = gradUY_eval(i,j,0) - gradUX_eval(i,j,1);
+    }
+    // Multiply by weight
+    scalar_view weighted_curlU_eval("curlU_eval", c, p);
+    fst::scalarMultiplyDataData(weighted_curlU_eval,weight_,curlU_eval);
+    // Compute L2 norm squared
+    feVel_->computeIntegral(val,curlU_eval,weighted_curlU_eval,false);
+    rst::scale(val,static_cast<Real>(0.5));
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = u_coeff.extent_int(0);
+    int fv = feVel_->N().extent_int(1);
+    int fp = fePrs_->N().extent_int(1);
+    int p = feVel_->cubPts().extent_int(1);
+    int d = feVel_->cubPts().extent_int(2);
+    // Initialize output grad
+    scalar_view velUX_grad("velUX_grad", c, fv);
+    scalar_view velUY_grad("velUY_grad", c, fv);
+    scalar_view presU_grad("presU_grad", c, fp);
+    std::vector<scalar_view> G;
+    G.resize(fieldHelper_->numFields());
+    G[0] = velUX_grad;
+    G[1] = velUY_grad;
+    G[2] = presU_grad;
+    // Get components of the control
+    std::vector<scalar_view> U;
+    fieldHelper_->splitFieldCoeff(U, u_coeff);
+    // Evaluate on FE basis
+    scalar_view gradUX_eval("gradUX_eval", c, p, d);
+    scalar_view gradUY_eval("gradUY_eval", c, p, d);
+    feVel_->evaluateGradient(gradUX_eval, U[0]);
+    feVel_->evaluateGradient(gradUY_eval, U[1]);
+    // Compute curl
+    scalar_view curlU_eval("curlU_eval", c, p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j)
+        curlU_eval(i,j) = weight_(i,j)*(gradUY_eval(i,j,0) - gradUX_eval(i,j,1));
+    }
+    // Build local gradient of state tracking term
+    fst::integrate(velUX_grad,curlU_eval,feVel_->DNDdetJ(1),false);
+    rst::scale(velUX_grad,static_cast<Real>(-1));
+    fst::integrate(velUY_grad,curlU_eval,feVel_->DNDdetJ(0),false);
+
+    fieldHelper_->combineFieldCoeff(grad, G);
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Vorticity_NavierStokes::gradient_2 is zero.");
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c  = z_coeff.extent_int(0);
+    int p  = feVel_->cubPts().extent_int(1);
+    int fv = feVel_->N().extent_int(1);
+    int fp = fePrs_->N().extent_int(1);
+    int d = feVel_->cubPts().extent_int(2);
+    // Initialize output grad
+    scalar_view velVX_grad("velVX_grad", c, fv);
+    scalar_view velVY_grad("velVY_grad", c, fv);
+    scalar_view presV_grad("presV_grad", c, fp);
+    std::vector<scalar_view> G;
+    G.resize(fieldHelper_->numFields());
+    G[0] = velVX_grad;
+    G[1] = velVY_grad;
+    G[2] = presV_grad;
+    // Get components of the control
+    std::vector<scalar_view> V;
+    fieldHelper_->splitFieldCoeff(V, v_coeff);
+    // Evaluate on FE basis
+    scalar_view gradVX_eval("gradVX_eval", c, p, d);
+    scalar_view gradVY_eval("gradVY_eval", c, p, d);
+    feVel_->evaluateGradient(gradVX_eval, V[0]);
+    feVel_->evaluateGradient(gradVY_eval, V[1]);
+    // Compute curl
+    scalar_view curlV_eval("curlV_eval", c, p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        curlV_eval(i,j) = weight_(i,j)*(gradVY_eval(i,j,0) - gradVX_eval(i,j,1));
+      }
+    }
+    // Build local gradient of state tracking term
+    fst::integrate(velVX_grad,curlV_eval,feVel_->DNDdetJ(1),false);
+    rst::scale(velVX_grad,static_cast<Real>(-1));
+    fst::integrate(velVY_grad,curlV_eval,feVel_->DNDdetJ(0),false);
+
+    fieldHelper_->combineFieldCoeff(hess, G);
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Vorticity_NavierStokes::HessVec_12 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Vorticity_NavierStokes::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Vorticity_NavierStokes::HessVec_22 is zero.");
+  }
+
+}; // QoI_Vorticity_NavierStokes
+
+template <class Real, class DeviceType>
+class QoI_Circulation_NavierStokes : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type> feVel_;
+  const ROL::Ptr<fe_type> fePrs_;
+  const ROL::Ptr<FieldHelper<Real,DeviceType>> fieldHelper_;
+  scalar_view weight_;
+
+  const Real eps_;
+
+  Real weightFunc(const std::vector<Real> & x) const {
+    return (((x[1] <= 1.0+eps_)&&(x[0]>= 1.0-eps_)&&(x[0] <= 4.0+eps_)) ?
+                static_cast<Real>(1) : static_cast<Real>(0));
+  }
+
+public:
+  QoI_Circulation_NavierStokes(const ROL::Ptr<fe_type> &feVel,
+                               const ROL::Ptr<fe_type> &fePrs,
+                               const ROL::Ptr<FieldHelper<Real,DeviceType>> &fieldHelper)
+    : feVel_(feVel), fePrs_(fePrs), fieldHelper_(fieldHelper), eps_(std::sqrt(ROL::ROL_EPSILON<Real>())) {
+    int c = feVel_->cubPts().extent_int(0);
+    int p = feVel_->cubPts().extent_int(1);
+    int d = feVel_->cubPts().extent_int(2);
+    std::vector<Real> pt(d);
+    weight_ = scalar_view("weight",c,p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k)
+          pt[k] = (feVel_->cubPts())(i,j,k);
+        weight_(i,j) = weightFunc(pt);
+      }
+    }
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = u_coeff.extent_int(0);
+    int p = feVel_->cubPts().extent_int(1);
+    int d = feVel_->cubPts().extent_int(2);
+    // Initialize output val
+    val = scalar_view("val",c);
+    // Get components of the control
+    std::vector<scalar_view> U;
+    fieldHelper_->splitFieldCoeff(U, u_coeff);
+    // Evaluate on FE basis
+    scalar_view gradUX_eval("gradUX_eval", c, p, d);
+    scalar_view gradUY_eval("gradUY_eval", c, p, d);
+    feVel_->evaluateGradient(gradUX_eval, U[0]);
+    feVel_->evaluateGradient(gradUY_eval, U[1]);
+    // Compute curl
+    scalar_view curlU_eval("curlU_eval", c, p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j)
+        curlU_eval(i,j) = gradUY_eval(i,j,0) - gradUX_eval(i,j,1);
+    }
+    // Compute circulation
+    feVel_->computeIntegral(val,curlU_eval,weight_,false);
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = u_coeff.extent_int(0);
+    int fv = feVel_->N().extent_int(1);
+    int fp = fePrs_->N().extent_int(1);
+    // Initialize output grad
+    scalar_view velUX_grad("velUX_grad", c, fv);
+    scalar_view velUY_grad("velUY_grad", c, fv);
+    scalar_view presU_grad("presU_grad", c, fp);
+    std::vector<scalar_view> G;
+    G.resize(fieldHelper_->numFields());
+    G[0] = velUX_grad;
+    G[1] = velUY_grad;
+    G[2] = presU_grad;
+    // Build local gradient of state tracking term
+    fst::integrate(velUX_grad,weight_,feVel_->DNDdetJ(1),false);
+    rst::scale(velUX_grad,static_cast<Real>(-1));
+    fst::integrate(velUY_grad,weight_,feVel_->DNDdetJ(0),false);
+
+    fieldHelper_->combineFieldCoeff(grad, G);
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Circulation_NavierStokes::gradient_2 is zero.");
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Circulation_NavierStokes::HessVec_11 is zero.");
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Circulation_NavierStokes::HessVec_12 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Circulation_NavierStokes::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Circulation_NavierStokes::HessVec_22 is zero.");
+  }
+
+}; // QoI_Circulation_NavierStokes
+
+template <class Real, class DeviceType>
+class QoI_Horizontal_NavierStokes : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type> feVel_;
+  const ROL::Ptr<fe_type> fePrs_;
+  const ROL::Ptr<FieldHelper<Real,DeviceType>> fieldHelper_;
+  scalar_view weight_;
+
+  const Real eps_;
+
+  Real weightFunc(const std::vector<Real> & x) const {
+    return (((x[1] <= 0.5+eps_)&&(x[0]>= 1.0-eps_)&&(x[0] <= 4.0+eps_)) ?
+                static_cast<Real>(1) : static_cast<Real>(0));
+  }
+
+public:
+  QoI_Horizontal_NavierStokes(const ROL::Ptr<fe_type> &feVel,
+                              const ROL::Ptr<fe_type> &fePrs,
+                              const ROL::Ptr<FieldHelper<Real,DeviceType>> &fieldHelper)
+    : feVel_(feVel), fePrs_(fePrs), fieldHelper_(fieldHelper), eps_(std::sqrt(ROL::ROL_EPSILON<Real>())) {
+    int c = feVel_->cubPts().extent_int(0);
+    int p = feVel_->cubPts().extent_int(1);
+    int d = feVel_->cubPts().extent_int(2);
+    std::vector<Real> pt(d);
+    weight_ = scalar_view("weight",c,p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k)
+          pt[k] = (feVel_->cubPts())(i,j,k);
+        weight_(i,j) = weightFunc(pt);
+      }
+    }
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = u_coeff.extent_int(0);
+    int p = feVel_->cubPts().extent_int(1);
+    // Initialize output val
+    val = scalar_view("val", c);
+    // Get components of the control
+    std::vector<scalar_view> U;
+    fieldHelper_->splitFieldCoeff(U, u_coeff);
+    // Evaluate on FE basis
+    scalar_view valUX_eval("valUX_eval", c, p);
+    scalar_view valUY_eval("valUY_eval", c, p);
+    feVel_->evaluateValue(valUX_eval, U[0]);
+    feVel_->evaluateValue(valUY_eval, U[1]);
+    // Compute negative part of x-velocity
+    scalar_view minUX_eval("minUX_eval", c, p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j)
+        minUX_eval(i,j) = std::min(static_cast<Real>(0),valUX_eval(i,j));
+    }
+    // Multiply by weight
+    scalar_view weighted_minUX_eval("minUX_eval", c, p);
+    fst::scalarMultiplyDataData(weighted_minUX_eval,weight_,minUX_eval);
+    scalar_view weighted_valUY_eval("valUY_eval", c, p);
+    fst::scalarMultiplyDataData(weighted_valUY_eval,weight_,valUY_eval);
+    // Compute L2 norm squared
+    feVel_->computeIntegral(val,minUX_eval,weighted_minUX_eval,false);
+    feVel_->computeIntegral(val,valUY_eval,weighted_valUY_eval,true);
+
+    rst::scale(val,static_cast<Real>(0.5));
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = u_coeff.extent_int(0);
+    int fv = feVel_->N().extent_int(1);
+    int fp = fePrs_->N().extent_int(1);
+    int p = feVel_->cubPts().extent_int(1);
+    // Initialize output grad
+    scalar_view velUX_grad("velUX_grad", c, fv);
+    scalar_view velUY_grad("velUY_grad", c, fv);
+    scalar_view presU_grad("presU_grad", c, fp);
+    std::vector<scalar_view> G;
+    G.resize(fieldHelper_->numFields());
+    G[0] = velUX_grad;
+    G[1] = velUY_grad;
+    G[2] = presU_grad;
+    // Get components of the control
+    std::vector<scalar_view> U;
+    fieldHelper_->splitFieldCoeff(U, u_coeff);
+    // Evaluate on FE basis
+    scalar_view valUX_eval("valUX_eval", c, p);
+    scalar_view valUY_eval("valUY_eval", c, p);
+    feVel_->evaluateValue(valUX_eval, U[0]);
+    feVel_->evaluateValue(valUY_eval, U[1]);
+    // Compute negative part of x-velocity
+    scalar_view weighted_minUX_eval("weighted_minUX_eval", c, p);
+    scalar_view weighted_valUY_eval("weighted_valUY_eval", c, p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        weighted_minUX_eval(i,j)
+          = weight_(i,j) * std::min(static_cast<Real>(0),valUX_eval(i,j));
+        weighted_valUY_eval(i,j)
+          = weight_(i,j) * valUY_eval(i,j);
+      }
+    }
+    // Build local gradient of state tracking term
+    fst::integrate(velUX_grad,weighted_minUX_eval,feVel_->NdetJ(),false);
+    fst::integrate(velUY_grad,weighted_valUY_eval,feVel_->NdetJ(),false);
+
+    fieldHelper_->combineFieldCoeff(grad, G);
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Horizontal_NavierStokes::gradient_2 is zero.");
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c  = z_coeff.extent_int(0);
+    int p  = feVel_->cubPts().extent_int(1);
+    int fv = feVel_->N().extent_int(1);
+    int fp = fePrs_->N().extent_int(1);
+    // Initialize output grad
+    scalar_view velVX_grad("velVX_grad", c, fv);
+    scalar_view velVY_grad("velVY_grad", c, fv);
+    scalar_view presV_grad("presV_grad", c, fp);
+    std::vector<scalar_view> G;
+    G.resize(fieldHelper_->numFields());
+    G[0] = velVX_grad;
+    G[1] = velVY_grad;
+    G[2] = presV_grad;
+    // Get components of the control
+    std::vector<scalar_view> U;
+    fieldHelper_->splitFieldCoeff(U, u_coeff);
+    std::vector<scalar_view> V;
+    fieldHelper_->splitFieldCoeff(V, v_coeff);
+    // Evaluate on FE basis
+    scalar_view valUX_eval("valUX_eval", c, p);
+    scalar_view valVX_eval("valVX_eval", c, p);
+    scalar_view valVY_eval("valVY_eval", c, p);
+    feVel_->evaluateValue(valUX_eval, U[0]);
+    feVel_->evaluateValue(valVX_eval, V[0]);
+    feVel_->evaluateValue(valVY_eval, V[1]);
+    // Compute negative part of x-velocity
+    scalar_view weighted_minVX_eval("weighted_minVX_eval", c, p);
+    scalar_view weighted_valVY_eval("weighted_valVY_eval", c, p);
+    Real scale(0);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        if ( valUX_eval(i,j) < static_cast<Real>(0) ) {
+          scale = static_cast<Real>(1);
+        }
+        else if ( valUX_eval(i,j) > static_cast<Real>(0) ) {
+          scale = static_cast<Real>(0);
+        }
+        else {
+          //scale = static_cast<Real>(0);
+          //scale = static_cast<Real>(0.5);
+          scale = static_cast<Real>(1);
+        }
+        weighted_minVX_eval(i,j) = scale * weight_(i,j) * valVX_eval(i,j);
+        weighted_valVY_eval(i,j) = weight_(i,j) * valVY_eval(i,j);
+      }
+    }
+    // Build local gradient of state tracking term
+    fst::integrate(velVX_grad,weighted_minVX_eval,feVel_->NdetJ(),false);
+    fst::integrate(velVY_grad,weighted_valVY_eval,feVel_->NdetJ(),false);
+
+    fieldHelper_->combineFieldCoeff(hess, G);
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Horizontal_NavierStokes::HessVec_12 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Horizontal_NavierStokes::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Horizontal_NavierStokes::HessVec_22 is zero.");
+  }
+
+}; // QoI_Horizontal_NavierStokes
+
+template <class Real, class DeviceType>
+class QoI_State_NavierStokes : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  ROL::Ptr<QoI<Real,DeviceType>> qoi_;
+
+public:
+  QoI_State_NavierStokes(Teuchos::ParameterList &parlist,
+                         const ROL::Ptr<fe_type> &feVel,
+                         const ROL::Ptr<fe_type> &fePrs,
+                         const ROL::Ptr<FieldHelper<Real,DeviceType>> &fieldHelper) {
+    std::string stateObj = parlist.sublist("Problem").get("Objective type","Vorticity");
+    if ( stateObj != "Vorticity" && stateObj != "Circulation" && stateObj != "Directional" ) {
+      throw Exception::NotImplemented(">>> (QoI_State_NavierStokes): Unknown objective type."); 
+    }
+    if ( stateObj == "Vorticity" ) {
+      qoi_ = ROL::makePtr<QoI_Vorticity_NavierStokes<Real,DeviceType>>(feVel,fePrs,fieldHelper);
+    }
+    else if ( stateObj == "Directional" ) {
+      qoi_ = ROL::makePtr<QoI_Horizontal_NavierStokes<Real,DeviceType>>(feVel,fePrs,fieldHelper);
+    }
+    else {
+      qoi_ = ROL::makePtr<QoI_Circulation_NavierStokes<Real,DeviceType>>(feVel,fePrs,fieldHelper);
+    }
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    return qoi_->value(val, u_coeff, z_coeff, z_param);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    qoi_->gradient_1(grad, u_coeff, z_coeff, z_param);
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    qoi_->gradient_2(grad, u_coeff, z_coeff, z_param);
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    qoi_->HessVec_11(hess, v_coeff, u_coeff, z_coeff, z_param);
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    qoi_->HessVec_12(hess, v_coeff, u_coeff, z_coeff, z_param);
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    qoi_->HessVec_21(hess, v_coeff, u_coeff, z_coeff, z_param);
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    qoi_->HessVec_22(hess, v_coeff, u_coeff, z_coeff, z_param);
+  }
+
+};
+
+template <class Real, class DeviceType>
+class QoI_L2Penalty_NavierStokes : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type> feVel_;
+  const ROL::Ptr<fe_type> fePrs_;
+  const std::vector<ROL::Ptr<fe_type>> feVelBdry_;
+  const std::vector<std::vector<int>> bdryCellLocIds_;
+  const ROL::Ptr<FieldHelper<Real,DeviceType>> fieldHelper_;
+
+  scalar_view getBoundaryCoeff(
+      const scalar_view & cell_coeff,
+      int locSideId) const {
+    std::vector<int> bdryCellLocId = bdryCellLocIds_[locSideId];
+    const int numCellsSide = bdryCellLocId.size();
+    const int f = feVel_->N().extent_int(1);
+    
+    scalar_view bdry_coeff("bdry_coeff", numCellsSide, f);
+    for (int i = 0; i < numCellsSide; ++i) {
+      for (int j = 0; j < f; ++j) {
+        bdry_coeff(i, j) = cell_coeff(bdryCellLocId[i], j);
+      }
+    }
+    return bdry_coeff;
+  }
+
+public:
+  QoI_L2Penalty_NavierStokes(const ROL::Ptr<fe_type> &feVel,
+                             const ROL::Ptr<fe_type> &fePrs,
+                             const std::vector<ROL::Ptr<fe_type>> &feVelBdry,
+                             const std::vector<std::vector<int>> &bdryCellLocIds,
+                             const ROL::Ptr<FieldHelper<Real,DeviceType>> &fieldHelper)
+    : feVel_(feVel), fePrs_(fePrs), feVelBdry_(feVelBdry),
+      bdryCellLocIds_(bdryCellLocIds), fieldHelper_(fieldHelper) {}
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    const int c = feVel_->gradN().extent_int(0);
+    // Get components of the control
+    std::vector<scalar_view> Z;
+    fieldHelper_->splitFieldCoeff(Z, z_coeff);
+    // Initialize output val
+    val = scalar_view("val", c);
+    // Compute cost integral
+    const int numLocSides = bdryCellLocIds_.size();
+    for (int l = 0; l < numLocSides; ++l) {
+      const int numCellsSide  = bdryCellLocIds_[l].size();
+      if ( numCellsSide ) {
+        const int numCubPerSide = feVelBdry_[l]->cubPts().extent_int(1);
+        // Evaluate x-component of control on FE basis
+        scalar_view zx_coeff_bdry = getBoundaryCoeff(Z[0], l);
+        scalar_view valZX_eval("valZX_eval", numCellsSide, numCubPerSide);
+        feVelBdry_[l]->evaluateValue(valZX_eval, zx_coeff_bdry);
+        // Evaluate y-component of control on FE basis
+        scalar_view zy_coeff_bdry = getBoundaryCoeff(Z[1], l);
+        scalar_view valZY_eval("valZY_eval", numCellsSide, numCubPerSide);
+        feVelBdry_[l]->evaluateValue(valZY_eval, zy_coeff_bdry);
+        // Integrate cell L2 norm squared
+        scalar_view intVal("intVal", numCellsSide);
+        feVelBdry_[l]->computeIntegral(intVal,valZX_eval,valZX_eval,false);
+        feVelBdry_[l]->computeIntegral(intVal,valZY_eval,valZY_eval,true);
+        // Add to integral value
+        for (int i = 0; i < numCellsSide; ++i) {
+          int cidx = bdryCellLocIds_[l][i];
+          val(cidx) += static_cast<Real>(0.5)*intVal(i);
+        }
+      }
+    }
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_L2Penalty_NavierStokes::gradient_1 is zero.");
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fv = feVel_->gradN().extent_int(1);
+    const int fp = fePrs_->gradN().extent_int(1);
+    // Initialize output grad
+    scalar_view velZX_grad("velZX_grad", c, fv);
+    scalar_view velZY_grad("velZY_grad", c, fv);
+    scalar_view presZ_grad("presZ_grad", c, fp);
+    std::vector<scalar_view> G;
+    G.resize(fieldHelper_->numFields());
+    G[0] = velZX_grad;
+    G[1] = velZY_grad;
+    G[2] = presZ_grad;
+    // Get components of the control
+    std::vector<scalar_view> Z;
+    fieldHelper_->splitFieldCoeff(Z, z_coeff);
+    // Compute cost integral
+    const int numLocSides = bdryCellLocIds_.size();
+    for (int l = 0; l < numLocSides; ++l) {
+      const int numCellsSide  = bdryCellLocIds_[l].size();
+      if ( numCellsSide ) {
+        const int numCubPerSide = feVelBdry_[l]->cubPts().extent_int(1);
+        // Evaluate x-component of control on FE basis
+        scalar_view zx_coeff_bdry = getBoundaryCoeff(Z[0], l);
+        scalar_view valZX_eval("valZX_eval", numCellsSide, numCubPerSide);
+        feVelBdry_[l]->evaluateValue(valZX_eval, zx_coeff_bdry);
+        // Evaluate y-component of control on FE basis
+        scalar_view zy_coeff_bdry = getBoundaryCoeff(Z[1], l);
+        scalar_view valZY_eval("valZY_eval", numCellsSide, numCubPerSide);
+        feVelBdry_[l]->evaluateValue(valZY_eval, zy_coeff_bdry);
+        // Compute gradient of squared L2-norm of diff
+        scalar_view intGradX("intGradX", numCellsSide, fv);
+        fst::integrate(intGradX,valZX_eval,feVelBdry_[l]->NdetJ(),false);
+        scalar_view intGradY("intGradY", numCellsSide, fv);
+        fst::integrate(intGradY,valZY_eval,feVelBdry_[l]->NdetJ(),false);
+        // Add to integral value
+        for (int i = 0; i < numCellsSide; ++i) {
+          int cidx = bdryCellLocIds_[l][i];
+          for (int j = 0; j < fv; ++j) {
+            (G[0])(cidx,j) += intGradX(i,j);
+            (G[1])(cidx,j) += intGradY(i,j);
+          }
+        }
+      }
+    }
+
+    fieldHelper_->combineFieldCoeff(grad, G);
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_L2Penalty_NavierStokes::HessVec_11 is zero.");
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_L2Penalty_NavierStokes::HessVec_12 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_L2Penalty_NavierStokes::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fv = feVel_->gradN().extent_int(1);
+    const int fp = fePrs_->gradN().extent_int(1);
+    // Initialize output grad
+    scalar_view velVX_grad("velVX_grad", c, fv);
+    scalar_view velVY_grad("velVY_grad", c, fv);
+    scalar_view presV_grad("presV_grad", c, fp);
+    std::vector<scalar_view> G;
+    G.resize(fieldHelper_->numFields());
+    G[0] = velVX_grad;
+    G[1] = velVY_grad;
+    G[2] = presV_grad;
+    // Get components of the control
+    std::vector<scalar_view> V;
+    fieldHelper_->splitFieldCoeff(V, v_coeff);
+    // Compute cost integral
+    const int numLocSides = bdryCellLocIds_.size();
+    for (int l = 0; l < numLocSides; ++l) {
+      const int numCellsSide  = bdryCellLocIds_[l].size();
+      if ( numCellsSide ) {
+        const int numCubPerSide = feVelBdry_[l]->cubPts().extent_int(1);
+        // Evaluate x-component of control on FE basis
+        scalar_view vx_coeff_bdry = getBoundaryCoeff(V[0], l);
+        scalar_view valVX_eval("valVX_eval", numCellsSide, numCubPerSide);
+        feVelBdry_[l]->evaluateValue(valVX_eval, vx_coeff_bdry);
+        // Evaluate y-component of control on FE basis
+        scalar_view vy_coeff_bdry = getBoundaryCoeff(V[1], l);
+        scalar_view valVY_eval("valVY_eval", numCellsSide, numCubPerSide);
+        feVelBdry_[l]->evaluateValue(valVY_eval, vy_coeff_bdry);
+        // Compute gradient of squared L2-norm of diff
+        scalar_view intHessX("intHessX", numCellsSide, fv);
+        fst::integrate(intHessX,valVX_eval,feVelBdry_[l]->NdetJ(),false);
+        scalar_view intHessY("intHessY", numCellsSide, fv);
+        fst::integrate(intHessY,valVY_eval,feVelBdry_[l]->NdetJ(),false);
+        // Add to integral value
+        for (int i = 0; i < numCellsSide; ++i) {
+          int cidx = bdryCellLocIds_[l][i];
+          for (int j = 0; j < fv; ++j) {
+            (G[0])(cidx,j) += intHessX(i,j);
+            (G[1])(cidx,j) += intHessY(i,j);
+          }
+        }
+      }
+    }
+
+    fieldHelper_->combineFieldCoeff(hess, G);
+  }
+
+}; // QoI_L2Penalty_NavierStokes
+
+template <class Real>
+class StdObjective_NavierStokes : public ROL::StdObjective<Real> {
+private:
+  Real alpha_;
+  std::string stateObj_;
+
+public:
+  StdObjective_NavierStokes(Teuchos::ParameterList &parlist) {
+    alpha_    = parlist.sublist("Problem").get("Control penalty parameter",1.e-4);
+    stateObj_ = parlist.sublist("Problem").get("Objective type","Vorticity");
+    if ( stateObj_ != "Vorticity" && stateObj_ != "Circulation" && stateObj_ != "Directional") {
+      throw Exception::NotImplemented(">>> (StdObjective_NavierStokes): Unknown objective type."); 
+    }
+  }
+
+  Real value(const std::vector<Real> &x, Real &tol) {
+    Real val = alpha_*x[1];
+    if ( stateObj_ == "Vorticity" || stateObj_ == "Directional" ) {
+      val += x[0];
+    }
+    else {
+      val += static_cast<Real>(0.5)*x[0]*x[0];
+    }
+    return val;
+  }
+
+  void gradient(std::vector<Real> &g, const std::vector<Real> &x, Real &tol) {
+    const Real one(1);
+    if ( stateObj_ == "Vorticity" || stateObj_ == "Directional" ) {
+      g[0] = one;
+    }
+    else {
+      g[0] = x[0];
+    }
+    g[1] = alpha_;
+  }
+
+  void hessVec(std::vector<Real> &hv, const std::vector<Real> &v, const std::vector<Real> &x, Real &tol) {
+    const Real zero(0);
+    if ( stateObj_ == "Vorticity" || stateObj_ == "Directional" ) {
+      hv[0] = zero;
+    }
+    else {
+      hv[0] = v[0];
+    }
+    hv[1] = zero;
+  }
+
+}; // OBJ_SCALAR
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/navier-stokes/pde_navier-stokesK.hpp b/packages/rol/example/PDE-OPT/navier-stokes/pde_navier-stokesK.hpp
new file mode 100644
index 000000000000..cbd20604efa3
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/navier-stokes/pde_navier-stokesK.hpp
@@ -0,0 +1,1117 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  pde_navier-stokes.hpp
+    \brief Implements the local PDE interface for the Navier-Stokes control problem.
+*/
+
+#ifndef PDE_NAVIERSTOKESK_HPP
+#define PDE_NAVIERSTOKESK_HPP
+
+#include "../TOOLS/pdeK.hpp"
+#include "../TOOLS/feK.hpp"
+#include "../TOOLS/fieldhelperK.hpp"
+
+#include "Intrepid2_HGRAD_QUAD_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_QUAD_C2_FEM.hpp"
+#include "Intrepid2_DefaultCubatureFactory.hpp"
+#include "Intrepid2_FunctionSpaceTools.hpp"
+#include "Intrepid2_CellTools.hpp"
+
+#include "ROL_Ptr.hpp"
+
+template <class Real, class DeviceType>
+class PDE_NavierStokes : public PDE<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  // Finite element basis information
+  basis_ptr basisPtrVel_, basisPtrPrs_;
+  std::vector<basis_ptr> basisPtrs_;
+  // Cell cubature information
+  ROL::Ptr<Intrepid2::Cubature<DeviceType,Real,Real>> cellCub_, bdryCub_;
+  // Cell node information
+  scalar_view volCellNodes_;
+  std::vector<std::vector<scalar_view>> bdryCellNodes_;
+  std::vector<std::vector<std::vector<int>>> bdryCellLocIds_;
+  // Finite element definition
+  ROL::Ptr<fe_type> feVel_, fePrs_;
+  std::vector<ROL::Ptr<fe_type>> feVelBdry_;
+  // Local degrees of freedom on boundary, for each side of the reference cell (first index).
+  std::vector<std::vector<int>> fvidx_, fpidx_;
+  // Coordinates of degrees freedom on boundary cells.
+  // Indexing:  [sideset number][local side id](cell number, value at dof)
+  std::vector<std::vector<scalar_view>> bdryCellDofValues_;
+  // Field pattern, offsets, etc.
+  std::vector<std::vector<int>> fieldPattern_; // local Field/DOF pattern; set from DOF manager 
+  int numFields_;                              // number of fields (equations in the PDE)
+  int numDofs_;                                // total number of degrees of freedom for all (local) fields
+  std::vector<int> offset_;                    // for each field, a counting offset
+  std::vector<int> numFieldDofs_;              // for each field, number of degrees of freedom
+  
+  // Problem parameters.
+  Real viscosity_;
+  bool horizontalControl_;
+  Real DirichletControlPenalty_;
+  bool useDirichletControl_;
+
+  ROL::Ptr<FieldHelper<Real,DeviceType>> fieldHelper_;
+
+  // Extract velocity coefficients on boundary.
+  scalar_view getBoundaryCoeff(const scalar_view cell_coeff, int sideSet, int cell) const {
+    std::vector<int> bdryCellLocId = bdryCellLocIds_[sideSet][cell];
+    const int numCellsSide = bdryCellLocId.size();
+    const int f = basisPtrVel_->getCardinality();
+    
+    scalar_view bdry_coeff("bdry_coeff", numCellsSide, f);
+    for (int i = 0; i < numCellsSide; ++i) {
+      for (int j = 0; j < f; ++j)
+        bdry_coeff(i, j) = cell_coeff(bdryCellLocId[i], j);
+    }
+    return bdry_coeff;
+  }
+
+  Real DirichletFunc(const std::vector<Real> &coords, int sideset, int locSideId, int dir) const {
+    const std::vector<Real> param = PDE<Real,DeviceType>::getParameter();
+    Real val(0);
+    if ((sideset==4) && (dir==0)) {
+      if ( param.size() ) {
+        Real zero(0), half(0.5), one(1), two(2), pi(M_PI), four(4), c1(1), c2(-0.5);
+        if ( param[0] == zero )
+          val = half*std::sin(two*pi * (c1*coords[1] + c2));
+        else {
+          Real num = (std::exp(four*param[0]*(c1*coords[1] + c2)) - one);
+          Real den = (std::exp(two*param[0])-one);
+          val = half*std::sin(pi * num/den);
+        }
+      }
+      else {
+        val = static_cast<Real>(8)
+             *(coords[1]-static_cast<Real>(0.5))
+             *(static_cast<Real>(1)-coords[1]);
+      }
+    }
+    return val;
+  }
+
+  void computeDirichlet(void) {
+    // Compute Dirichlet values at DOFs.
+    int d = basisPtrVel_->getBaseCellTopology().getDimension();
+    int numSidesets = bdryCellLocIds_.size();
+    bdryCellDofValues_.resize(numSidesets);
+    for (int i=0; i<numSidesets; ++i) {
+      int numLocSides = bdryCellLocIds_[i].size();
+      bdryCellDofValues_[i].resize(numLocSides);
+      for (int j=0; j<numLocSides; ++j) {
+        int c = bdryCellLocIds_[i][j].size();
+        int f = basisPtrVel_->getCardinality();
+        bdryCellDofValues_[i][j] = scalar_view("bdryCellDofValues", c, f, d);
+        scalar_view coords("coords", c, f, d);
+        if (c > 0) {
+          feVel_->computeDofCoords(coords, bdryCellNodes_[i][j]);
+        }
+        for (int k=0; k<c; ++k) {
+          for (int l=0; l<f; ++l) {
+            std::vector<Real> dofpoint(d);
+            //std::cout << "Sideset " << i << " LocalSide " << j << "  Cell " << k << "  Field " << l << "  Coord ";
+            for (int m=0; m<d; ++m) {
+              dofpoint[m] = coords(k, l, m);
+              //std::cout << dofpoint[m] << "  ";
+            }
+
+            for (int m=0; m<d; ++m) {
+              (bdryCellDofValues_[i][j])(k, l, m) = DirichletFunc(dofpoint, i, j, m);
+              //std::cout << "  " << m << "-Value " << DirichletFunc(dofpoint, i, j, m);
+            }
+            //std::cout << std::endl;
+          }
+        }
+      }
+    }
+  }
+
+  Real viscosityFunc(const std::vector<Real> & coords) const {
+    const std::vector<Real> param = PDE<Real,DeviceType>::getParameter();
+    if ( param.size() ) {
+      Real five(5), three(3);
+      return (five + three*param[1])*static_cast<Real>(1e-3);
+    }
+    return viscosity_;
+  }
+
+  void computeViscosity(scalar_view &nu) const {
+    int c = feVel_->gradN().extent_int(0);
+    int p = feVel_->gradN().extent_int(2);
+    int d = feVel_->gradN().extent_int(3);
+    std::vector<Real> pt(d);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for ( int k = 0; k < d; ++k)
+          pt[k] = (feVel_->cubPts())(i,j,k);
+        // Compute spatially dependent viscosity
+        nu(i,j) = viscosityFunc(pt);
+      }
+    }
+  }
+
+public:
+  PDE_NavierStokes(Teuchos::ParameterList &parlist) {
+    // Finite element fields.
+    basisPtrVel_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C2_FEM<DeviceType,Real,Real>>();
+    basisPtrPrs_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C1_FEM<DeviceType,Real,Real>>();
+    basisPtrs_.clear();
+    basisPtrs_.push_back(basisPtrVel_);  // Velocity X
+    basisPtrs_.push_back(basisPtrVel_);  // Velocity Y
+    basisPtrs_.push_back(basisPtrPrs_);  // Pressure
+    // Quadrature rules.
+    shards::CellTopology cellType = basisPtrs_[0]->getBaseCellTopology();    // get the cell type from any basis
+    Intrepid2::DefaultCubatureFactory cubFactory;                             // create cubature factory
+    int cubDegree = parlist.sublist("Problem").get("Cubature Degree", 4);    // set cubature degree, e.g., 4
+    cellCub_ = cubFactory.create<DeviceType,Real,Real>(cellType, cubDegree); // create default cubature
+
+    int d = cellType.getDimension();
+    shards::CellTopology bdryCellType = cellType.getCellTopologyData(d-1, 0);
+    int bdryCubDegree = parlist.sublist("Problem").get("Boundary Cubature Degree",4); // set cubature degree, e.g., 4
+    bdryCub_ = cubFactory.create<DeviceType,Real,Real>(bdryCellType, bdryCubDegree);
+
+    // Other problem parameters.
+    viscosity_ = parlist.sublist("Problem").get("Viscosity", 5e-3);
+    horizontalControl_ = parlist.sublist("Problem").get("Horizontal control",false);
+    DirichletControlPenalty_ = parlist.sublist("Problem").get("Dirichlet control penalty",1e-5);
+    useDirichletControl_ = parlist.sublist("Problem").get("Dirichlet control",false);
+
+    numDofs_ = 0;
+    numFields_ = basisPtrs_.size();
+    offset_.resize(numFields_);
+    numFieldDofs_.resize(numFields_);
+    for (int i=0; i<numFields_; ++i) {
+      if (i==0)
+        offset_[i]  = 0;
+      else
+        offset_[i]  = offset_[i-1] + basisPtrs_[i-1]->getCardinality();
+      numFieldDofs_[i] = basisPtrs_[i]->getCardinality();
+      numDofs_ += numFieldDofs_[i];
+    }
+  }
+
+  void residual(scalar_view & res,
+                const scalar_view u_coeff,
+                const scalar_view z_coeff = scalar_view(),
+                const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    int c  = u_coeff.extent_int(0);
+    int p  = cellCub_->getNumPoints();
+    int fv = basisPtrVel_->getCardinality();
+    int fp = basisPtrPrs_->getCardinality();
+    int d  = cellCub_->getDimension();
+ 
+    // Initialize residuals.
+    scalar_view velX_res("velX_res", c, fv);
+    scalar_view velY_res("velY_res", c, fv);
+    scalar_view pres_res("pres_res", c, fp);
+    std::vector<scalar_view> R;
+    R.resize(numFields_);
+    R[0] = velX_res;
+    R[1] = velY_res;
+    R[2] = pres_res;
+
+    // Split u_coeff into components.
+    std::vector<scalar_view> U;
+    std::vector<scalar_view> Z;
+    fieldHelper_->splitFieldCoeff(U, u_coeff);
+    fieldHelper_->splitFieldCoeff(Z, z_coeff);
+
+    // Evaluate/interpolate finite element fields on cells.
+    scalar_view nu("nu", c, p);
+    scalar_view nuGradVelX_eval("nuGradVelX_eval", c, p, d);
+    scalar_view nuGradVelY_eval("nuGradVelY_eval", c, p, d);
+    scalar_view valVel_eval("valVel_eval", c, p, d);
+    scalar_view valVelX_eval("valVelX_eval", c, p);
+    scalar_view valVelY_eval("valVelY_eval", c, p);
+    scalar_view valPres_eval("valPres_eval", c, p);
+    scalar_view gradVelX_eval("gradVelX_eval", c, p, d);
+    scalar_view gradVelY_eval("gradVelY_eval", c, p, d);
+    scalar_view divVel_eval("divVel_eval", c, p);
+    scalar_view valVelDotgradVelX_eval("valVelDotgradVelX_eval", c, p);
+    scalar_view valVelDotgradVelY_eval("valVelDotgradVelY_eval", c, p);
+    feVel_->evaluateValue(valVelX_eval, U[0]);
+    feVel_->evaluateValue(valVelY_eval, U[1]);
+    fePrs_->evaluateValue(valPres_eval, U[2]);
+    feVel_->evaluateGradient(gradVelX_eval, U[0]);
+    feVel_->evaluateGradient(gradVelY_eval, U[1]);
+    computeViscosity(nu);
+
+    // Assemble the velocity vector and its divergence.
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        valVel_eval(i,j,0) = valVelX_eval(i,j);
+        valVel_eval(i,j,1) = valVelY_eval(i,j);
+        divVel_eval(i,j)   = gradVelX_eval(i,j,0) + gradVelY_eval(i,j,1);
+      }
+    }
+    // Negative pressure
+    rst::scale(valPres_eval,static_cast<Real>(-1));
+
+    // Multiply velocity gradients with viscosity.
+    fst::tensorMultiplyDataData(nuGradVelX_eval,nu,gradVelX_eval);
+    fst::tensorMultiplyDataData(nuGradVelY_eval,nu,gradVelY_eval);
+
+    // Compute nonlinear terms in the Navier-Stokes equations.
+    fst::dotMultiplyDataData(valVelDotgradVelX_eval, valVel_eval, gradVelX_eval);
+    fst::dotMultiplyDataData(valVelDotgradVelY_eval, valVel_eval, gradVelY_eval);
+
+    /*** Evaluate weak form of the residual. ***/
+    // X component of velocity equation.
+    fst::integrate(velX_res,nuGradVelX_eval,feVel_->gradNdetJ(),false);
+    fst::integrate(velX_res,valVelDotgradVelX_eval,feVel_->NdetJ(),true);
+    fst::integrate(velX_res,valPres_eval,feVel_->DNDdetJ(0),true);
+    // Y component of velocity equation.
+    fst::integrate(velY_res,nuGradVelY_eval,feVel_->gradNdetJ(),false);
+    fst::integrate(velY_res,valVelDotgradVelY_eval,feVel_->NdetJ(),true);
+    fst::integrate(velY_res,valPres_eval,feVel_->DNDdetJ(1),true);
+    // Pressure equation.
+    fst::integrate(pres_res,divVel_eval,fePrs_->NdetJ(),false);
+    rst::scale(pres_res,static_cast<Real>(-1));
+
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      if ( !useDirichletControl_ ) {
+        const int numCubPerSide = bdryCub_->getNumPoints();
+        for (int i = 0; i < numSideSets; ++i) {
+          if (i==10) {
+            int numLocalSideIds = bdryCellLocIds_[i].size();
+            for (int j = 0; j < numLocalSideIds; ++j) {
+              int numCellsSide = bdryCellLocIds_[i][j].size();
+              if (numCellsSide) {
+                for (int k = 0; k < d; ++k) {
+                  // Get U coefficients on Robin boundary
+                  scalar_view u_coeff_bdry = getBoundaryCoeff(U[k], i, j);
+                  // Evaluate U on FE basis
+                  scalar_view valU_eval_bdry("valU_eval_bdry", numCellsSide, numCubPerSide);
+                  feVelBdry_[j]->evaluateValue(valU_eval_bdry, u_coeff_bdry);
+                  // Get Z coefficients on Robin boundary
+                  scalar_view z_coeff_bdry = getBoundaryCoeff(Z[k], i, j);
+                  // Evaluate Z on FE basis
+                  scalar_view valZ_eval_bdry("valZ_eval_bdry", numCellsSide, numCubPerSide);
+                  if (horizontalControl_) {
+                    if (k==0) // control in horizontal direction only
+                      feVelBdry_[j]->evaluateValue(valZ_eval_bdry, z_coeff_bdry);
+                    // else valZ_eval_bdry=0
+                  }
+                  else // control in both directions
+                    feVelBdry_[j]->evaluateValue(valZ_eval_bdry, z_coeff_bdry);
+                  // Compute Robin residual
+                  scalar_view robinVal("robinVal", numCellsSide, numCubPerSide);
+                  rst::subtract(robinVal,valU_eval_bdry,valZ_eval_bdry);
+                  rst::scale(robinVal,static_cast<Real>(1)/DirichletControlPenalty_);
+                  scalar_view robinRes("robinRes", numCellsSide, fv);
+                  fst::integrate(robinRes,robinVal,feVelBdry_[j]->NdetJ(),false);
+                  // Add Robin residual to volume residual
+                  for (int l = 0; l < numCellsSide; ++l) {
+                    int cidx = bdryCellLocIds_[i][j][l];
+                    for (int m = 0; m < fv; ++m)
+                      (R[k])(cidx,m) += robinRes(l,m);
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+      // APPLY DIRICHLET CONDITIONS
+      computeDirichlet();
+      for (int i = 0; i < numSideSets; ++i) {
+        if ( useDirichletControl_ ) {
+          // Apply Dirichlet control
+          if (i==6) {
+            int numLocalSideIds = bdryCellLocIds_[i].size();
+            for (int j = 0; j < numLocalSideIds; ++j) {
+              int numCellsSide = bdryCellLocIds_[i][j].size();
+              int numBdryDofs = fvidx_[j].size();
+              for (int k = 0; k < numCellsSide; ++k) {
+                int cidx = bdryCellLocIds_[i][j][k];
+                for (int l = 0; l < numBdryDofs; ++l) {
+                  //std::cout << "\n   j=" << j << "  l=" << l << "  " << fidx[j][l];
+                  if ( !horizontalControl_ ) {
+                    for (int m=0; m < d; ++m) {
+                      (R[m])(cidx,fvidx_[j][l]) = (U[m])(cidx,fvidx_[j][l]) - (Z[m])(cidx,fvidx_[j][l]);
+                    }
+                  }
+                  else {
+                    (R[0])(cidx,fvidx_[j][l]) = (U[0])(cidx,fvidx_[j][l]) - (Z[0])(cidx,fvidx_[j][l]);
+                    (R[1])(cidx,fvidx_[j][l]) = (U[1])(cidx,fvidx_[j][l]);
+                  }
+                }
+              }
+            }
+          }
+          // Apply Dirichlet conditions
+          if (i==8) {
+            int numLocalSideIds = bdryCellLocIds_[i].size();
+            for (int j = 0; j < numLocalSideIds; ++j) {
+              int numCellsSide = bdryCellLocIds_[i][j].size();
+              int numBdryDofs = fvidx_[j].size();
+              for (int k = 0; k < numCellsSide; ++k) {
+                int cidx = bdryCellLocIds_[i][j][k];
+                for (int l = 0; l < numBdryDofs; ++l) {
+                  //std::cout << "\n  i=" << i << "   cidx=" << cidx << "   j=" << j << "  l=" << l << "  " << fvidx_[j][l] << " " << (*bdryCellDofValues_[i][j])(k,fvidx_[j][l],0);
+                  //std::cout << "\n  i=" << i << "   cidx=" << cidx << "   j=" << j << "  l=" << l << "  " << fvidx_[j][l] << " " << (*bdryCellDofValues_[i][j])(k,fvidx_[j][l],1);
+                  for (int m=0; m < d; ++m) {
+                    (R[m])(cidx,fvidx_[j][l]) = (U[m])(cidx,fvidx_[j][l]) - (bdryCellDofValues_[i][j])(k,fvidx_[j][l],m);
+                  }
+                }
+              }
+            }
+          }
+        }
+        // Apply Dirichlet conditions
+        if ((i==0) || (i==3) || (i==4) || (i==5)) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numBdryDofs = fvidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                //std::cout << "\n  i=" << i << "   cidx=" << cidx << "   j=" << j << "  l=" << l << "  " << fvidx_[j][l] << " " << (*bdryCellDofValues_[i][j])(k,fvidx_[j][l],0);
+                //std::cout << "\n  i=" << i << "   cidx=" << cidx << "   j=" << j << "  l=" << l << "  " << fvidx_[j][l] << " " << (*bdryCellDofValues_[i][j])(k,fvidx_[j][l],1);
+                for (int m=0; m < d; ++m) {
+                  (R[m])(cidx,fvidx_[j][l]) = (U[m])(cidx,fvidx_[j][l]) - (bdryCellDofValues_[i][j])(k,fvidx_[j][l],m);
+                }
+              }
+            }
+          }
+        }
+        // Step corner
+        if (i==7) {
+          int j = 0, l = 0;
+          if (bdryCellLocIds_[i][0].size() > 0) { 
+            int cidx = bdryCellLocIds_[i][0][0];
+            for (int m=0; m < d; ++m) {
+              (R[m])(cidx,fvidx_[j][l]) = (U[m])(cidx,fvidx_[j][l]) - (bdryCellDofValues_[i][j])(0,fvidx_[j][l],m);
+            }
+          }
+        }
+        // Pressure pinning
+        if (i==9) {
+          int j = 2, l = 0;
+          if (bdryCellLocIds_[i][0].size() > 0) { 
+            int cidx = bdryCellLocIds_[i][0][0];
+            (R[d])(cidx,fpidx_[j][l]) = (U[d])(cidx,fpidx_[j][l]);
+          }
+        }
+      }
+    }
+
+    // Combine the residuals.
+    fieldHelper_->combineFieldCoeff(res, R);
+  }
+
+  void Jacobian_1(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    int c  = u_coeff.extent_int(0);
+    int p  = cellCub_->getNumPoints();
+    int fv = basisPtrVel_->getCardinality();
+    int fp = basisPtrPrs_->getCardinality();
+    int d  = cellCub_->getDimension();
+ 
+    // Initialize residuals.
+    scalar_view velXvelX_jac("velXvelX_jac", c, fv, fv);
+    scalar_view velXvelY_jac("velXvelY_jac", c, fv, fv);
+    scalar_view velXpres_jac("velXpres_jac", c, fv, fp);
+    scalar_view velYvelX_jac("velYvelX_jac", c, fv, fv);
+    scalar_view velYvelY_jac("velYvelY_jac", c, fv, fv);
+    scalar_view velYpres_jac("velYpres_jac", c, fv, fp);
+    scalar_view presvelX_jac("presvelX_jac", c, fp, fv);
+    scalar_view presvelY_jac("presvelY_jac", c, fp, fv);
+    scalar_view prespres_jac("prespres_jac", c, fp, fp);
+    std::vector<std::vector<scalar_view>> J;
+    J.resize(numFields_);
+    J[0].resize(numFields_);
+    J[1].resize(numFields_);
+    J[2].resize(numFields_);
+    J[0][0] = velXvelX_jac; J[0][1] = velXvelY_jac; J[0][2] = velXpres_jac;  
+    J[1][0] = velYvelX_jac; J[1][1] = velYvelY_jac; J[1][2] = velYpres_jac;  
+    J[2][0] = presvelX_jac; J[2][1] = presvelY_jac; J[2][2] = prespres_jac;  
+
+    // Split u_coeff into components.
+    std::vector<scalar_view> U;
+    fieldHelper_->splitFieldCoeff(U, u_coeff);
+
+    // Evaluate/interpolate finite element fields on cells.
+    scalar_view nu("nu", c, p);
+    scalar_view nuGradPhiX_eval("nuGradPhiX", c, fv, p, d);
+    scalar_view nuGradPhiY_eval("nuGradPhiY", c, fv, p, d);
+    scalar_view valVel_eval("valVel_eval", c, p, d);
+    scalar_view valVelX_eval("valVelX_eval", c, p);
+    scalar_view valVelY_eval("valVelY_eval", c, p);
+    scalar_view gradVelX_eval("gradVelX_eval", c, p, d);
+    scalar_view ddxVelX_eval("ddxVelX_eval", c, p);
+    scalar_view ddyVelX_eval("ddyVelX_eval", c, p);
+    scalar_view ddxVelXPhiX_eval("ddxVelXPhiX_eval", c, fv, p);
+    scalar_view ddyVelXPhiY_eval("ddyVelXPhiY_eval", c, fv, p);
+    scalar_view gradVelY_eval("gradVelY_eval", c, p, d);
+    scalar_view ddxVelY_eval("ddxVelY_eval", c, p);
+    scalar_view ddyVelY_eval("ddyVelY_eval", c, p);
+    scalar_view ddyVelYPhiY_eval("ddyVelYPhiY_eval", c, fv, p);
+    scalar_view ddxVelYPhiX_eval("ddxVelYPhiX_eval", c, fv, p);
+    scalar_view valVelDotgradPhiX_eval("valVelDotgradPhiX_eval", c, fv, p);
+    scalar_view valVelDotgradPhiY_eval("valVelDotgradPhiY_eval", c, fv, p);
+    feVel_->evaluateValue(valVelX_eval, U[0]);
+    feVel_->evaluateValue(valVelY_eval, U[1]);
+    feVel_->evaluateGradient(gradVelX_eval, U[0]);
+    feVel_->evaluateGradient(gradVelY_eval, U[1]);
+    computeViscosity(nu);
+
+    // Assemble the velocity vector and its divergence.
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        valVel_eval(i,j,0) = valVelX_eval(i,j);
+        valVel_eval(i,j,1) = valVelY_eval(i,j);
+        ddxVelX_eval(i,j)  = gradVelX_eval(i,j,0);
+        ddyVelX_eval(i,j)  = gradVelX_eval(i,j,1);
+        ddxVelY_eval(i,j)  = gradVelY_eval(i,j,0);
+        ddyVelY_eval(i,j)  = gradVelY_eval(i,j,1);
+      }
+    }
+
+    // Multiply velocity gradients with viscosity.
+    fst::tensorMultiplyDataField(nuGradPhiX_eval,nu,feVel_->gradN());
+    fst::tensorMultiplyDataField(nuGradPhiY_eval,nu,feVel_->gradN());
+
+    // Compute nonlinear terms in the Navier-Stokes equations.
+    fst::dotMultiplyDataField(valVelDotgradPhiX_eval, valVel_eval, feVel_->gradN());
+    fst::dotMultiplyDataField(valVelDotgradPhiY_eval, valVel_eval, feVel_->gradN());
+    fst::scalarMultiplyDataField(ddxVelXPhiX_eval, ddxVelX_eval, feVel_->N());
+    fst::scalarMultiplyDataField(ddyVelXPhiY_eval, ddyVelX_eval, feVel_->N());
+    fst::scalarMultiplyDataField(ddxVelYPhiX_eval, ddxVelY_eval, feVel_->N());
+    fst::scalarMultiplyDataField(ddyVelYPhiY_eval, ddyVelY_eval, feVel_->N());
+
+    // Negative pressure basis.
+    scalar_view negPrsN("negPrsN", c, fp, p);
+    Kokkos::deep_copy(negPrsN,fePrs_->N());
+    rst::scale(negPrsN,static_cast<Real>(-1));
+
+    /*** Evaluate weak form of the Jacobian. ***/
+    // X component of velocity equation.
+    fst::integrate(velXvelX_jac,nuGradPhiX_eval,feVel_->gradNdetJ(),false);
+    fst::integrate(velXvelX_jac,feVel_->NdetJ(),valVelDotgradPhiX_eval,true);
+    fst::integrate(velXvelX_jac,feVel_->NdetJ(),ddxVelXPhiX_eval,true);
+    fst::integrate(velXvelY_jac,feVel_->NdetJ(),ddyVelXPhiY_eval,false);
+    fst::integrate(velXpres_jac,feVel_->DNDdetJ(0),negPrsN,false);
+    // Y component of velocity equation.
+    fst::integrate(velYvelY_jac,nuGradPhiY_eval,feVel_->gradNdetJ(),false);
+    fst::integrate(velYvelY_jac,feVel_->NdetJ(),valVelDotgradPhiY_eval,true);
+    fst::integrate(velYvelY_jac,feVel_->NdetJ(),ddyVelYPhiY_eval,true);
+    fst::integrate(velYvelX_jac,feVel_->NdetJ(),ddxVelYPhiX_eval,false);
+    fst::integrate(velYpres_jac,feVel_->DNDdetJ(1),negPrsN,false);
+    // Pressure equation.
+    fst::integrate(presvelX_jac,fePrs_->NdetJ(),feVel_->DND(0),false);
+    fst::integrate(presvelY_jac,fePrs_->NdetJ(),feVel_->DND(1),false);
+    rst::scale(presvelX_jac,static_cast<Real>(-1));
+    rst::scale(presvelY_jac,static_cast<Real>(-1));
+
+    // APPLY DIRICHLET CONDITIONS
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      if (!useDirichletControl_) {
+        for (int i = 0; i < numSideSets; ++i) {
+          if (i==10) {
+            int numLocalSideIds = bdryCellLocIds_[i].size();
+            const int numCubPerSide = bdryCub_->getNumPoints();
+            for (int j = 0; j < numLocalSideIds; ++j) {
+              int numCellsSide = bdryCellLocIds_[i][j].size();
+              if (numCellsSide) {
+                for (int k = 0; k < d; ++k) {
+                  // Compute Robin residual
+                  scalar_view RobinDerivUN("RobinDerivUN", numCellsSide, fv, numCubPerSide);
+                  rst::scale(RobinDerivUN,feVelBdry_[j]->N(),static_cast<Real>(1)/DirichletControlPenalty_);
+                  scalar_view RobinJac("RobinJac", numCellsSide, fv, fv);
+                  fst::integrate(RobinJac,RobinDerivUN,feVelBdry_[j]->NdetJ(),false);
+                  // Add Stefan-Boltzmann residual to volume residual
+                  for (int l = 0; l < numCellsSide; ++l) {
+                    int cidx = bdryCellLocIds_[i][j][l];
+                    for (int m = 0; m < fv; ++m) { 
+                      for (int n = 0; n < fv; ++n) { 
+                        (J[k][k])(cidx,m,n) += RobinJac(l,m,n);
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+      for (int i = 0; i < numSideSets; ++i) {
+        if ( useDirichletControl_ ) {
+          if ((i==6) || (i==8)) {
+            int numLocalSideIds = bdryCellLocIds_[i].size();
+            for (int j = 0; j < numLocalSideIds; ++j) {
+              int numCellsSide = bdryCellLocIds_[i][j].size();
+              int numBdryDofs = fvidx_[j].size();
+              for (int k = 0; k < numCellsSide; ++k) {
+                int cidx = bdryCellLocIds_[i][j][k];
+                for (int l = 0; l < numBdryDofs; ++l) {
+                  for (int m=0; m < fv; ++m) {
+                    for (int n=0; n < d; ++n) {
+                      for (int q=0; q < d; ++q) {
+                        (J[n][q])(cidx,fvidx_[j][l],m) = static_cast<Real>(0);
+                      }
+                      (J[n][n])(cidx,fvidx_[j][l],fvidx_[j][l]) = static_cast<Real>(1);
+                    }
+                  }
+                  for (int m=0; m < fp; ++m) {
+                    for (int n=0; n < d; ++n) {
+                      (J[n][2])(cidx,fvidx_[j][l],m) = static_cast<Real>(0);
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+        if ((i==0) || (i==3) || (i==4) || (i==5)) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numBdryDofs = fvidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                for (int m=0; m < fv; ++m) {
+                  for (int n=0; n < d; ++n) {
+                    for (int q=0; q < d; ++q) {
+                      (J[n][q])(cidx,fvidx_[j][l],m) = static_cast<Real>(0);
+                    }
+                    (J[n][n])(cidx,fvidx_[j][l],fvidx_[j][l]) = static_cast<Real>(1);
+                  }
+                }
+                for (int m=0; m < fp; ++m) {
+                  for (int n=0; n < d; ++n) {
+                    (J[n][2])(cidx,fvidx_[j][l],m) = static_cast<Real>(0);
+                  }
+                }
+              }
+            }
+          }
+        }
+        // Step corner
+        if (i==7) {
+          int j = 0, l = 0;
+          if (bdryCellLocIds_[i][0].size() > 0) { 
+            int cidx = bdryCellLocIds_[i][0][0];
+            for (int m=0; m < fv; ++m) {
+              for (int n=0; n < d; ++n) {
+                for (int q=0; q < d; ++q) {
+                  (J[n][q])(cidx,fvidx_[j][l],m) = static_cast<Real>(0);
+                }
+                (J[n][n])(cidx,fvidx_[j][l],fvidx_[j][l]) = static_cast<Real>(1);
+              }
+            }
+            for (int m=0; m < fp; ++m) {
+              for (int n=0; n < d; ++n) {
+                (J[n][d])(cidx,fvidx_[j][l],m) = static_cast<Real>(0);
+              }
+            }
+          }
+        }
+        // Pressure pinning
+        if (i==9) {
+          int j = 2, l = 0;
+          if (bdryCellLocIds_[i][0].size() > 0) { 
+            int cidx = bdryCellLocIds_[i][0][0];
+            for (int m = 0; m < fv; ++m) {
+              for (int n = 0; n < d; ++n) {
+                (J[d][n])(cidx,fpidx_[j][l],m) = static_cast<Real>(0);
+              }
+            }
+            for (int m = 0; m < fp; ++m) {
+              (J[d][d])(cidx,fpidx_[j][l],m) = static_cast<Real>(0);
+            }
+            (J[d][d])(cidx,fpidx_[j][l],fpidx_[j][l]) = static_cast<Real>(1);
+          }
+        }
+      }
+    }
+
+    // Combine the jacobians.
+    fieldHelper_->combineFieldCoeff(jac, J);
+
+  }
+
+
+  void Jacobian_2(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    int c  = u_coeff.extent_int(0);
+    int fv = basisPtrVel_->getCardinality();
+    int fp = basisPtrPrs_->getCardinality();
+    int d  = cellCub_->getDimension();
+
+    // Initialize residuals.
+    scalar_view velXvelX_jac("velXvelX_jac", c, fv, fv);
+    scalar_view velXvelY_jac("velXvelY_jac", c, fv, fv);
+    scalar_view velXpres_jac("velXpres_jac", c, fv, fp);
+    scalar_view velYvelX_jac("velYvelX_jac", c, fv, fv);
+    scalar_view velYvelY_jac("velYvelY_jac", c, fv, fv);
+    scalar_view velYpres_jac("velYpres_jac", c, fv, fp);
+    scalar_view presvelX_jac("presvelX_jac", c, fp, fv);
+    scalar_view presvelY_jac("presvelY_jac", c, fp, fv);
+    scalar_view prespres_jac("prespres_jac", c, fp, fp);
+    std::vector<std::vector<scalar_view>> J;
+    J.resize(numFields_);
+    J[0].resize(numFields_);
+    J[1].resize(numFields_);
+    J[2].resize(numFields_);
+    J[0][0] = velXvelX_jac; J[0][1] = velXvelY_jac; J[0][2] = velXpres_jac;
+    J[1][0] = velYvelX_jac; J[1][1] = velYvelY_jac; J[1][2] = velYpres_jac;
+    J[2][0] = presvelX_jac; J[2][1] = presvelY_jac; J[2][2] = prespres_jac;
+
+    // APPLY DIRICHLET CONDITIONS
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      if (!useDirichletControl_) {
+        for (int i = 0; i < numSideSets; ++i) {
+          if (i==10) {
+            int numLocalSideIds = bdryCellLocIds_[i].size();
+            const int numCubPerSide = bdryCub_->getNumPoints();
+            for (int j = 0; j < numLocalSideIds; ++j) {
+              int numCellsSide = bdryCellLocIds_[i][j].size();
+              if (numCellsSide) {
+                for (int k = 0; k < d; ++k) {
+                  // Compute Robin Jacobian
+                  scalar_view RobinDerivZN("RobinDerivZN", numCellsSide, fv, numCubPerSide);
+                  if (horizontalControl_) {
+                    if (k==0) { // control in horizontal direction only
+                      rst::scale(RobinDerivZN,feVelBdry_[j]->N(),static_cast<Real>(-1)/DirichletControlPenalty_);
+                    }
+                    // else RobinDerivZN=0
+                  }
+                  else { // control in both directions
+                    rst::scale(RobinDerivZN,feVelBdry_[j]->N(),static_cast<Real>(-1)/DirichletControlPenalty_);
+                  }
+                  scalar_view RobinJac("RobinJac", numCellsSide, fv, fv);
+                  fst::integrate(RobinJac,RobinDerivZN,feVelBdry_[j]->NdetJ(),false);
+                  // Add Robin Jacobian to volume Jacobian
+                  for (int l = 0; l < numCellsSide; ++l) {
+                    int cidx = bdryCellLocIds_[i][j][l];
+                    for (int m = 0; m < fv; ++m) { 
+                      for (int n = 0; n < fv; ++n) { 
+                        (J[k][k])(cidx,m,n) += RobinJac(l,m,n);
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+      for (int i = 0; i < numSideSets; ++i) {
+        if ( useDirichletControl_ ) {
+          // Apply Dirichlet controls
+          if (i==6) {
+            int numLocalSideIds = bdryCellLocIds_[i].size();
+            for (int j = 0; j < numLocalSideIds; ++j) {
+              int numCellsSide = bdryCellLocIds_[i][j].size();
+              int numBdryDofs = fvidx_[j].size();
+              for (int k = 0; k < numCellsSide; ++k) {
+                int cidx = bdryCellLocIds_[i][j][k];
+                for (int l = 0; l < numBdryDofs; ++l) {
+                  //std::cout << "\n   j=" << j << "  l=" << l << "  " << fidx[j][l];
+                  for (int m=0; m < fv; ++m) {
+                    if ( !horizontalControl_ ) {
+                      for (int n=0; n < d; ++n) {
+                        (J[n][n])(cidx,fvidx_[j][l],m) = static_cast<Real>(0);
+                      }
+                    }
+                    else {
+                      (J[0][0])(cidx,fvidx_[j][l],m) = static_cast<Real>(0);
+                    }
+                  }
+                  if ( !horizontalControl_ ) {
+                    for (int n=0; n < d; ++n) {
+                      (J[n][n])(cidx,fvidx_[j][l],fvidx_[j][l]) = static_cast<Real>(-1);
+                    }
+                  }
+                  else {
+                    (J[0][0])(cidx,fvidx_[j][l],fvidx_[j][l]) = static_cast<Real>(-1);
+                  }
+                }
+              }
+            }
+          }
+          // Apply Dirichlet conditions
+          if (i==8) {
+            int numLocalSideIds = bdryCellLocIds_[i].size();
+            for (int j = 0; j < numLocalSideIds; ++j) {
+              int numCellsSide = bdryCellLocIds_[i][j].size();
+              int numBdryDofs = fvidx_[j].size();
+              for (int k = 0; k < numCellsSide; ++k) {
+                int cidx = bdryCellLocIds_[i][j][k];
+                for (int l = 0; l < numBdryDofs; ++l) {
+                  //std::cout << "\n   j=" << j << "  l=" << l << "  " << fidx[j][l];
+                  for (int m=0; m < fv; ++m) {
+                    for (int n=0; n < d; ++n) {
+                      (J[n][n])(cidx,fvidx_[j][l],m) = static_cast<Real>(0);
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+        // Apply Dirichlet conditions
+        if ((i==0) || (i==3) || (i==4) || (i==5)) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numBdryDofs = fvidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                //std::cout << "\n   j=" << j << "  l=" << l << "  " << fidx[j][l];
+                for (int m=0; m < fv; ++m) {
+                  for (int n=0; n < d; ++n) {
+                    (J[n][n])(cidx,fvidx_[j][l],m) = static_cast<Real>(0);
+                  }
+                }
+              }
+            }
+          }
+        }
+        // Step corner
+        if (i==7) {
+          int j = 0, l = 0;
+          if (bdryCellLocIds_[i][0].size() > 0) { 
+            int cidx = bdryCellLocIds_[i][0][0];
+            for (int m=0; m < fv; ++m) {
+              for (int n=0; n < d; ++n) {
+                for (int p=0; p < d; ++p) {
+                  (J[n][p])(cidx,fvidx_[j][l],m) = static_cast<Real>(0);
+                }
+              }
+            }
+            for (int m=0; m < fp; ++m) {
+              for (int n=0; n < d; ++n) {
+                (J[n][d])(cidx,fvidx_[j][l],m) = static_cast<Real>(0);
+              }
+            }
+          }
+        }
+        // Pressure pinning
+        if (i==9) {
+          int j = 2, l = 0;
+          if (bdryCellLocIds_[i][0].size() > 0) { 
+            int cidx = bdryCellLocIds_[i][0][0];
+            for (int m = 0; m < fv; ++m) {
+              for (int n = 0; n < d; ++n) {
+                (J[d][n])(cidx,fpidx_[j][l],m) = static_cast<Real>(0);
+              }
+            }
+            for (int m = 0; m < fp; ++m) {
+              (J[d][d])(cidx,fpidx_[j][l],m) = static_cast<Real>(0);
+            }
+          }
+        }
+      }
+    }
+
+    // Combine the jacobians.
+    fieldHelper_->combineFieldCoeff(jac, J);
+
+  }
+
+  void Hessian_11(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+//    throw Exception::NotImplemented("");
+    // Retrieve dimensions.
+    int c  = u_coeff.extent_int(0);
+    int p  = cellCub_->getNumPoints();
+    int fv = basisPtrVel_->getCardinality();
+    int fp = basisPtrPrs_->getCardinality();
+    int d  = cellCub_->getDimension();
+ 
+    // Initialize residuals.
+    scalar_view velXvelX_jac("velXvelX_jac", c, fv, fv);
+    scalar_view velXvelY_jac("velXvelY_jac", c, fv, fv);
+    scalar_view velXpres_jac("velXpres_jac", c, fv, fp);
+    scalar_view velYvelX_jac("velYvelX_jac", c, fv, fv);
+    scalar_view velYvelY_jac("velYvelY_jac", c, fv, fv);
+    scalar_view velYpres_jac("velYpres_jac", c, fv, fp);
+    scalar_view presvelX_jac("presvelX_jac", c, fp, fv);
+    scalar_view presvelY_jac("presvelY_jac", c, fp, fv);
+    scalar_view prespres_jac("prespres_jac", c, fp, fp);
+    std::vector<std::vector<scalar_view>> J;
+    J.resize(numFields_);
+    J[0].resize(numFields_);
+    J[1].resize(numFields_);
+    J[2].resize(numFields_);
+    J[0][0] = velXvelX_jac; J[0][1] = velXvelY_jac; J[0][2] = velXpres_jac;  
+    J[1][0] = velYvelX_jac; J[1][1] = velYvelY_jac; J[1][2] = velYpres_jac;  
+    J[2][0] = presvelX_jac; J[2][1] = presvelY_jac; J[2][2] = prespres_jac;  
+
+    // Split u_coeff into components.
+    std::vector<scalar_view> L;
+    fieldHelper_->splitFieldCoeff(L, l_coeff);
+
+    // Apply Dirichlet conditions
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        if ( useDirichletControl_ ) {
+          if ((i==6) || (i==8)) {
+            int numLocalSideIds = bdryCellLocIds_[i].size();
+            for (int j = 0; j < numLocalSideIds; ++j) {
+              int numCellsSide = bdryCellLocIds_[i][j].size();
+              int numBdryDofs = fvidx_[j].size();
+              for (int k = 0; k < numCellsSide; ++k) {
+                int cidx = bdryCellLocIds_[i][j][k];
+                for (int l = 0; l < numBdryDofs; ++l) {
+                  //std::cout << "\n   j=" << j << "  l=" << l << "  " << fidx[j][l];
+                  for (int m=0; m < d; ++m) {
+                    (L[m])(cidx,fvidx_[j][l]) = static_cast<Real>(0);
+                  }
+                }
+              }
+            }
+          }
+        }
+        if ((i==0) || (i==3) || (i==4) || (i==5)) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numBdryDofs = fvidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                //std::cout << "\n   j=" << j << "  l=" << l << "  " << fidx[j][l];
+                for (int m=0; m < d; ++m) {
+                  (L[m])(cidx,fvidx_[j][l]) = static_cast<Real>(0);
+                }
+              }
+            }
+          }
+        }
+        // Step corner
+        if (i==7) {
+          int j = 0, l = 0;
+          if (bdryCellLocIds_[i][0].size() > 0) { 
+            int cidx = bdryCellLocIds_[i][0][0];
+            for (int m=0; m < d; ++m) {
+              (L[m])(cidx,fvidx_[j][l]) = static_cast<Real>(0);
+            }
+          }
+        }
+        // Pressure pinning
+        if (i==9) {
+          int j = 2, l = 0;
+          if (bdryCellLocIds_[i][0].size() > 0) { 
+            int cidx = bdryCellLocIds_[i][0][0];
+            (L[d])(cidx,fpidx_[j][l]) = static_cast<Real>(0);
+          }
+        }
+      }
+    }
+
+    // Evaluate/interpolate finite element fields on cells.
+    scalar_view valVelX_eval("valVelX_eval", c, p);
+    scalar_view valVelY_eval("valVelY_eval", c, p);
+    scalar_view valVelXPhi_eval("valVelXPhi_eval", c, fv, p);
+    scalar_view valVelYPhi_eval("valVelYPhi_eval", c, fv, p);
+    feVel_->evaluateValue(valVelX_eval, L[0]);
+    feVel_->evaluateValue(valVelY_eval, L[1]);
+
+    // Compute nonlinear terms in the Navier-Stokes equations.
+    fst::scalarMultiplyDataField(valVelXPhi_eval, valVelX_eval, feVel_->N());
+    fst::scalarMultiplyDataField(valVelYPhi_eval, valVelY_eval, feVel_->N());
+
+    /*** Evaluate weak form of the Hessian. ***/
+    // X component of velocity equation.
+    fst::integrate(velXvelX_jac,valVelXPhi_eval,feVel_->DNDdetJ(0),false);
+    fst::integrate(velXvelX_jac,feVel_->DNDdetJ(0),valVelXPhi_eval,true);
+    fst::integrate(velXvelY_jac,feVel_->DNDdetJ(1),valVelXPhi_eval,false);
+    fst::integrate(velXvelY_jac,valVelYPhi_eval,feVel_->DNDdetJ(0),true);
+    // Y component of velocity equation.
+    fst::integrate(velYvelY_jac,valVelYPhi_eval,feVel_->DNDdetJ(1),false);
+    fst::integrate(velYvelY_jac,feVel_->DNDdetJ(1),valVelYPhi_eval,true);
+    fst::integrate(velYvelX_jac,feVel_->DNDdetJ(0),valVelYPhi_eval,false);
+    fst::integrate(velYvelX_jac,valVelXPhi_eval,feVel_->DNDdetJ(1),true);
+
+    // Combine the Hessians.
+    fieldHelper_->combineFieldCoeff(hess, J);
+  }
+
+  void Hessian_12(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_NavierStokes::Hessian_12): Hessian is zero.");
+  }
+
+  void Hessian_21(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_NavierStokes::Hessian_21): Hessian is zero.");
+  }
+
+  void Hessian_22(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_NavierStokes::Hessian_22): Hessian is zero.");
+  }
+
+  void RieszMap_1(scalar_view & riesz) override {
+    // Retrieve dimensions.
+    int c  = feVel_->N().extent_int(0);
+    int fv = basisPtrVel_->getCardinality();
+    int fp = basisPtrPrs_->getCardinality();
+ 
+    // Initialize residuals.
+    scalar_view velXvelX_jac("velXvelX_jac", c, fv, fv);
+    scalar_view velXvelY_jac("velXvelY_jac", c, fv, fv);
+    scalar_view velXpres_jac("velXpres_jac", c, fv, fp);
+    scalar_view velYvelX_jac("velYvelX_jac", c, fv, fv);
+    scalar_view velYvelY_jac("velYvelY_jac", c, fv, fv);
+    scalar_view velYpres_jac("velYpres_jac", c, fv, fp);
+    scalar_view presvelX_jac("presvelX_jac", c, fp, fv);
+    scalar_view presvelY_jac("presvelY_jac", c, fp, fv);
+    scalar_view prespres_jac("prespres_jac", c, fp, fp);
+    std::vector<std::vector<scalar_view>> J;
+    J.resize(numFields_);
+    J[0].resize(numFields_);
+    J[1].resize(numFields_);
+    J[2].resize(numFields_);
+    J[0][0] = velXvelX_jac; J[0][1] = velXvelY_jac; J[0][2] = velXpres_jac;  
+    J[1][0] = velYvelX_jac; J[1][1] = velYvelY_jac; J[1][2] = velYpres_jac;  
+    J[2][0] = presvelX_jac; J[2][1] = presvelY_jac; J[2][2] = prespres_jac;  
+
+    Kokkos::deep_copy(J[0][0],feVel_->stiffMat());
+    rst::add(J[0][0],feVel_->massMat());
+    Kokkos::deep_copy(J[1][1],feVel_->stiffMat());
+    rst::add(J[1][1],feVel_->massMat());
+    Kokkos::deep_copy(J[2][2],fePrs_->massMat());
+
+    // Combine the jacobians.
+    fieldHelper_->combineFieldCoeff(riesz, J);
+  }
+
+  void RieszMap_2(scalar_view & riesz) override {
+    // Retrieve dimensions.
+    int c  = feVel_->N().extent_int(0);
+    int fv = basisPtrVel_->getCardinality();
+    int fp = basisPtrPrs_->getCardinality();
+ 
+    // Initialize residuals.
+    scalar_view velXvelX_jac("velXvelX_jac", c, fv, fv);
+    scalar_view velXvelY_jac("velXvelY_jac", c, fv, fv);
+    scalar_view velXpres_jac("velXpres_jac", c, fv, fp);
+    scalar_view velYvelX_jac("velYvelX_jac", c, fv, fv);
+    scalar_view velYvelY_jac("velYvelY_jac", c, fv, fv);
+    scalar_view velYpres_jac("velYpres_jac", c, fv, fp);
+    scalar_view presvelX_jac("presvelX_jac", c, fp, fv);
+    scalar_view presvelY_jac("presvelY_jac", c, fp, fv);
+    scalar_view prespres_jac("prespres_jac", c, fp, fp);
+    std::vector<std::vector<scalar_view>> J;
+    J.resize(numFields_);
+    J[0].resize(numFields_);
+    J[1].resize(numFields_);
+    J[2].resize(numFields_);
+    J[0][0] = velXvelX_jac; J[0][1] = velXvelY_jac; J[0][2] = velXpres_jac;  
+    J[1][0] = velYvelX_jac; J[1][1] = velYvelY_jac; J[1][2] = velYpres_jac;  
+    J[2][0] = presvelX_jac; J[2][1] = presvelY_jac; J[2][2] = prespres_jac;  
+
+    Kokkos::deep_copy(J[0][0],feVel_->stiffMat());
+    rst::add(J[0][0],feVel_->massMat());
+    Kokkos::deep_copy(J[1][1],feVel_->stiffMat());
+    rst::add(J[1][1],feVel_->massMat());
+    Kokkos::deep_copy(J[2][2],fePrs_->massMat());
+
+    // Combine the jacobians.
+    fieldHelper_->combineFieldCoeff(riesz, J);
+  }
+
+  std::vector<basis_ptr> getFields() override {
+    return basisPtrs_;
+  }
+
+  void setCellNodes(const scalar_view &volCellNodes,
+                    const std::vector<std::vector<scalar_view>> &bdryCellNodes,
+                    const std::vector<std::vector<std::vector<int>>> &bdryCellLocIds) override {
+    volCellNodes_ = volCellNodes;
+    bdryCellNodes_ = bdryCellNodes;
+    bdryCellLocIds_ = bdryCellLocIds;
+    // Finite element definition.
+    feVel_ = ROL::makePtr<fe_type>(volCellNodes_,basisPtrVel_,cellCub_);
+    fePrs_ = ROL::makePtr<fe_type>(volCellNodes_,basisPtrPrs_,cellCub_);
+    fvidx_ = feVel_->getBoundaryDofs();
+    fpidx_ = fePrs_->getBoundaryDofs();
+    // Construct control boundary FE
+    int sideset = (useDirichletControl_) ? 6 : 10;
+    int numLocSides = bdryCellNodes[sideset].size();
+    feVelBdry_.resize(numLocSides);
+    for (int j = 0; j < numLocSides; ++j) {
+      if (bdryCellNodes[sideset][j] != scalar_view()) {
+        feVelBdry_[j] = ROL::makePtr<fe_type>(bdryCellNodes[sideset][j],basisPtrVel_,bdryCub_,j);
+      }
+    }
+  }
+
+  void setFieldPattern(const std::vector<std::vector<int>> & fieldPattern) override {
+    fieldPattern_ = fieldPattern;
+    fieldHelper_ = ROL::makePtr<FieldHelper<Real,DeviceType>>(numFields_, numDofs_, numFieldDofs_, fieldPattern_);
+  }
+
+  const ROL::Ptr<fe_type> getVelocityFE(void) const {
+    return feVel_;
+  }
+
+  const ROL::Ptr<fe_type> getPressureFE(void) const {
+    return fePrs_;
+  }
+
+  const std::vector<ROL::Ptr<fe_type>> getVelocityBdryFE(void) const {
+    return feVelBdry_;
+  }
+
+  const std::vector<std::vector<int>> getBdryCellLocIds(const int sideset = -1) const {
+    int side = sideset;
+    if ( sideset < 0 ) {
+      side = (useDirichletControl_) ? 6 : 10;
+    }
+    return bdryCellLocIds_[side];
+  }
+
+  const ROL::Ptr<FieldHelper<Real,DeviceType>> getFieldHelper(void) const {
+    return fieldHelper_;
+  }
+
+}; // PDE_NavierStokes
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/nonlinear-elliptic/CMakeLists.txt b/packages/rol/example/PDE-OPT/nonlinear-elliptic/CMakeLists.txt
index 8ea795e9479d..12ef4fd49628 100644
--- a/packages/rol/example/PDE-OPT/nonlinear-elliptic/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/nonlinear-elliptic/CMakeLists.txt
@@ -13,7 +13,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   # Need ROL_TpetraTeuchosBatchManager.hpp
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
 
-  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  ROL_ADD_EXECUTABLE_AND_TEST(
     example_01
     SOURCES example_01.cpp
     ARGS PrintItAll
@@ -23,7 +23,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  ROL_ADD_EXECUTABLE_AND_TEST(
     example_02
     SOURCES example_02.cpp
     ARGS PrintItAll
@@ -33,7 +33,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     NonlinearEllipticDataCopy
     SOURCE_FILES
       input.xml plotresults.m
@@ -41,5 +41,49 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
   )
 
+ENDIF()
+
+IF(${PROJECT_NAME}_ENABLE_Intrepid2 AND
+   ${PROJECT_NAME}_ENABLE_Ifpack2   AND
+   ${PROJECT_NAME}_ENABLE_MueLu     AND
+   ${PROJECT_NAME}_ENABLE_Amesos2   AND
+   ${PROJECT_NAME}_ENABLE_Tpetra )
+
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/teuchos/src/sol)
+
+  # Need ROL_TpetraMultiVector.hpp
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
+
+  # Need ROL_TpetraTeuchosBatchManager.hpp
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
+
+  ROL_ADD_EXECUTABLE_AND_TEST(
+    example_01_Kokkos
+    SOURCES example_01K.cpp
+    ARGS PrintItAll
+    NUM_MPI_PROCS 4
+    NUM_TOTAL_CORES_USED 4
+    PASS_REGULAR_EXPRESSION "TEST PASSED"
+    ADD_DIR_TO_NAME
+  )
+
+  ROL_ADD_EXECUTABLE_AND_TEST(
+    example_02_Kokkos
+    SOURCES example_02K.cpp
+    ARGS PrintItAll
+    NUM_MPI_PROCS 4
+    NUM_TOTAL_CORES_USED 4
+    PASS_REGULAR_EXPRESSION "TEST PASSED"
+    ADD_DIR_TO_NAME
+  )
+
+  ROL_COPY_FILES_TO_BINARY_DIR(
+    NonlinearEllipticDataCopyK
+    SOURCE_FILES
+      input.xml plotresults.m
+    SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}"
+    DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
+  )
+
 
 ENDIF()
diff --git a/packages/rol/example/PDE-OPT/nonlinear-elliptic/example_01.cpp b/packages/rol/example/PDE-OPT/nonlinear-elliptic/example_01.cpp
index bc64d612d190..b1f0d413cd2a 100644
--- a/packages/rol/example/PDE-OPT/nonlinear-elliptic/example_01.cpp
+++ b/packages/rol/example/PDE-OPT/nonlinear-elliptic/example_01.cpp
@@ -13,8 +13,7 @@
 
 #include "Teuchos_Comm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -46,7 +45,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int> > comm
     = Tpetra::getDefaultComm();
   const int myRank = comm->getRank();
diff --git a/packages/rol/example/PDE-OPT/nonlinear-elliptic/example_01K.cpp b/packages/rol/example/PDE-OPT/nonlinear-elliptic/example_01K.cpp
new file mode 100644
index 000000000000..a6af686099e8
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/nonlinear-elliptic/example_01K.cpp
@@ -0,0 +1,154 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  example_01.cpp
+    \brief Shows how to solve the Poisson-Boltzmann problem.
+*/
+
+#include "Teuchos_Comm.hpp"
+#include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
+
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Version.hpp"
+
+#include <iostream>
+#include <algorithm>
+
+#include "ROL_TpetraMultiVector.hpp"
+#include "ROL_Algorithm.hpp"
+#include "ROL_TrustRegionStep.hpp"
+#include "ROL_StatusTest.hpp"
+#include "ROL_Reduced_Objective_SimOpt.hpp"
+#include "ROL_Bounds.hpp"
+
+#include "../TOOLS/meshmanagerK.hpp"
+#include "../TOOLS/pdeconstraintK.hpp"
+#include "../TOOLS/pdeobjectiveK.hpp"
+#include "../TOOLS/pdevectorK.hpp"
+
+#include "pde_nonlinear_ellipticK.hpp"
+#include "obj_nonlinear_ellipticK.hpp"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+
+int main(int argc, char *argv[]) {
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+
+  /*** Initialize communicator. ***/
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  Kokkos::ScopeGuard kokkosScope (argc, argv);
+  ROL::Ptr<const Teuchos::Comm<int> > comm
+    = Tpetra::getDefaultComm();
+  const int myRank = comm->getRank();
+  if ((iprint > 0) && (myRank == 0))
+    outStream = ROL::makePtrFromRef(std::cout);
+  else
+    outStream = ROL::makePtrFromRef(bhs);
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+
+    /*** Read in XML input ***/
+    std::string filename = "input.xml";
+    auto parlist = ROL::getParametersFromXmlFile(filename);
+
+    RealT controlPenalty = parlist->sublist("Problem").get("Control penalty parameter",static_cast<RealT>(1.e-4));
+
+    /*** Initialize main data structure. ***/
+    auto meshMgr = ROL::makePtr<MeshManager_Rectangle<RealT,DeviceT>>(*parlist);
+    // Initialize PDE describe Poisson's equation
+    auto pde = ROL::makePtr<PDE_Nonlinear_Elliptic<RealT,DeviceT>>(*parlist);
+    auto con = ROL::makePtr<PDE_Constraint<RealT,DeviceT>>(pde,meshMgr,comm,*parlist,*outStream);
+    auto assembler = con->getAssembler();
+
+    // Create state vector and set to zeroes
+    auto u_ptr = assembler->createStateVector(); u_ptr->randomize();
+    auto up    = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(u_ptr,pde,assembler);
+    // Create state vector and set to zeroes
+    auto p_ptr = assembler->createStateVector(); p_ptr->randomize();
+    auto pp    = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(p_ptr,pde,assembler);
+    // Create control vector and set to ones
+    auto z_ptr = assembler->createControlVector(); z_ptr->randomize();
+    auto zp    = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(z_ptr,pde,assembler);
+    // Create residual vector and set to zeros
+    auto r_ptr = assembler->createResidualVector(); r_ptr->putScalar(0.0);
+    auto rp    = ROL::makePtr<PDE_DualSimVector<RealT,DeviceT>>(r_ptr,pde,assembler);
+    // Create state direction vector and set to random
+    auto du_ptr = assembler->createStateVector(); du_ptr->randomize();
+    auto dup    = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(du_ptr,pde,assembler);
+    // Create control direction vector and set to random
+    auto dz_ptr = assembler->createControlVector(); dz_ptr->randomize();
+    auto dzp    = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(dz_ptr,pde,assembler);
+    // Create ROL SimOpt vectors
+    ROL::Vector_SimOpt<RealT> x(up,zp);
+    ROL::Vector_SimOpt<RealT> d(dup,dzp);
+
+    // Initialize bound constraints.
+    auto lo_ptr = assembler->createControlVector(); lo_ptr->putScalar(0.0);
+    auto hi_ptr = assembler->createControlVector(); hi_ptr->putScalar(1.0);
+    auto lop = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(lo_ptr,pde,assembler);
+    auto hip = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(hi_ptr,pde,assembler);
+    auto bnd = ROL::makePtr<ROL::Bounds<RealT>>(lop,hip);
+
+    // Initialize quadratic objective function
+    std::vector<ROL::Ptr<QoI<RealT,DeviceT>>> qoi_vec(2,ROL::nullPtr);
+    qoi_vec[0] = ROL::makePtr<QoI_StateTracking_Nonlinear_Elliptic<RealT,DeviceT>>(pde->getFE());
+    qoi_vec[1] = ROL::makePtr<QoI_ControlPenalty_Nonlinear_Elliptic<RealT,DeviceT>>(pde->getFE());
+    std::vector<RealT> weights = {static_cast<RealT>(1), controlPenalty};
+    auto obj = ROL::makePtr<PDE_Objective<RealT,DeviceT>>(qoi_vec,weights,assembler);
+    auto robj = ROL::makePtr<ROL::Reduced_Objective_SimOpt<RealT>>(obj,con,up,zp,pp,true,false);
+
+    // Run derivative checks
+    obj->checkGradient(x,d,true,*outStream);
+    obj->checkHessVec(x,d,true,*outStream);
+
+    con->checkApplyJacobian(x,d,*rp,true,*outStream);
+    con->checkApplyJacobian_1(*up,*zp,*dup,*rp,true,*outStream);
+    con->checkApplyJacobian_2(*up,*zp,*dzp,*rp,true,*outStream);
+    con->checkApplyAdjointHessian(x,*dup,d,x,true,*outStream);
+    con->checkAdjointConsistencyJacobian(*dup,d,x,true,*outStream);
+    con->checkInverseJacobian_1(*up,*up,*up,*zp,true,*outStream);
+    con->checkInverseAdjointJacobian_1(*up,*up,*up,*zp,true,*outStream);
+
+    auto step = ROL::makePtr<ROL::TrustRegionStep<RealT>>(*parlist);
+    auto status = ROL::makePtr<ROL::StatusTest<RealT>>(*parlist);
+    ROL::Algorithm<RealT> algo(step,status,false);
+    algo.run(*zp,*robj,*bnd,true,*outStream);
+
+    // Output.
+    assembler->printMeshData(*outStream);
+    RealT tol(1.e-8);
+    con->solve(*rp,*up,*zp,tol);
+    con->outputTpetraVector(u_ptr,"state.txt");
+    con->outputTpetraVector(z_ptr,"control.txt");
+
+    Teuchos::Array<RealT> res(1,0);
+    con->value(*rp,*up,*zp,tol);
+    r_ptr->norm2(res.view(0,1));
+    *outStream << "Residual Norm: " << res[0] << std::endl;
+    errorFlag += (res[0] > 1.e-6 ? 1 : 0);
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/nonlinear-elliptic/example_02.cpp b/packages/rol/example/PDE-OPT/nonlinear-elliptic/example_02.cpp
index d7ac05476466..def192a9af09 100644
--- a/packages/rol/example/PDE-OPT/nonlinear-elliptic/example_02.cpp
+++ b/packages/rol/example/PDE-OPT/nonlinear-elliptic/example_02.cpp
@@ -12,8 +12,7 @@
 */
 
 #include "Teuchos_Comm.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -46,7 +45,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int> > comm
     = Tpetra::getDefaultComm();
   ROL::Ptr<const Teuchos::Comm<int> > serial_comm
diff --git a/packages/rol/example/PDE-OPT/nonlinear-elliptic/example_02K.cpp b/packages/rol/example/PDE-OPT/nonlinear-elliptic/example_02K.cpp
new file mode 100644
index 000000000000..c9f3340eba9d
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/nonlinear-elliptic/example_02K.cpp
@@ -0,0 +1,162 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  example_01.cpp
+    \brief Shows how to solve the Poisson-Boltzmann problem.
+*/
+
+#include "Teuchos_Comm.hpp"
+#include "ROL_GlobalMPISession.hpp"
+
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Version.hpp"
+
+#include <iostream>
+#include <algorithm>
+
+#include "ROL_Stream.hpp"
+#include "ROL_OptimizationSolver.hpp"
+#include "ROL_TpetraMultiVector.hpp"
+#include "ROL_Reduced_Objective_SimOpt.hpp"
+#include "ROL_Bounds.hpp"
+#include "ROL_MonteCarloGenerator.hpp"
+#include "ROL_TpetraTeuchosBatchManager.hpp"
+
+#include "../TOOLS/meshmanagerK.hpp"
+#include "../TOOLS/pdeconstraintK.hpp"
+#include "../TOOLS/pdeobjectiveK.hpp"
+#include "../TOOLS/pdevectorK.hpp"
+
+#include "pde_nonlinear_ellipticK.hpp"
+#include "obj_nonlinear_ellipticK.hpp"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+
+int main(int argc, char *argv[]) {
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+
+  /*** Initialize communicator. ***/
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  Kokkos::ScopeGuard kokkosScope (argc, argv);
+  ROL::Ptr<const Teuchos::Comm<int> > comm
+    = Tpetra::getDefaultComm();
+  ROL::Ptr<const Teuchos::Comm<int> > serial_comm
+    = ROL::makePtr<Teuchos::SerialComm<int>>();
+  const int myRank = comm->getRank();
+  if ((iprint > 0) && (myRank == 0))
+    outStream = ROL::makePtrFromRef(std::cout);
+  else
+    outStream = ROL::makePtrFromRef(bhs);
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+
+    /*** Read in XML input ***/
+    std::string filename = "input.xml";
+    auto parlist = ROL::getParametersFromXmlFile(filename);
+
+    RealT controlPenalty = parlist->sublist("Problem").get("Control penalty parameter",static_cast<RealT>(1.e-4));
+
+    /*** Initialize main data structure. ***/
+    auto meshMgr = ROL::makePtr<MeshManager_Rectangle<RealT,DeviceT>>(*parlist);
+    // Initialize PDE describe Poisson's equation
+    auto pde = ROL::makePtr<PDE_Nonlinear_Elliptic<RealT,DeviceT>>(*parlist);
+    auto con = ROL::makePtr<PDE_Constraint<RealT,DeviceT>>(pde,meshMgr,serial_comm,*parlist,*outStream);
+    auto assembler = con->getAssembler();
+    con->setSolveParameters(*parlist);
+
+    // Create state vector and set to zeroes
+    auto u_ptr = assembler->createStateVector(); u_ptr->randomize();
+    auto up    = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(u_ptr,pde,assembler);
+    // Create state vector and set to zeroes
+    auto p_ptr = assembler->createStateVector(); p_ptr->randomize();
+    auto pp    = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(p_ptr,pde,assembler);
+    // Create control vector and set to ones
+    auto z_ptr = assembler->createControlVector(); z_ptr->randomize();
+    auto zp    = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(z_ptr,pde,assembler);
+    // Create residual vector and set to zeros
+    auto r_ptr = assembler->createResidualVector(); r_ptr->putScalar(0.0);
+    auto rp    = ROL::makePtr<PDE_DualSimVector<RealT,DeviceT>>(r_ptr,pde,assembler);
+    // Create state direction vector and set to random
+    auto du_ptr = assembler->createStateVector(); du_ptr->randomize();
+    auto dup    = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(du_ptr,pde,assembler);
+    // Create control direction vector and set to random
+    auto dz_ptr = assembler->createControlVector(); dz_ptr->randomize();
+    auto dzp    = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(dz_ptr,pde,assembler);
+    // Create ROL SimOpt vectors
+    ROL::Vector_SimOpt<RealT> x(up,zp);
+    ROL::Vector_SimOpt<RealT> d(dup,dzp);
+
+    // Initialize bound constraints.
+    auto lo_ptr = assembler->createControlVector(); lo_ptr->putScalar(0.0);
+    auto hi_ptr = assembler->createControlVector(); hi_ptr->putScalar(1.0);
+    auto lop = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(lo_ptr,pde,assembler);
+    auto hip = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(hi_ptr,pde,assembler);
+    auto bnd = ROL::makePtr<ROL::Bounds<RealT>>(lop,hip);
+
+    // Initialize quadratic objective function
+    std::vector<ROL::Ptr<QoI<RealT,DeviceT>>> qoi_vec(2,ROL::nullPtr);
+    qoi_vec[0] = ROL::makePtr<QoI_StateTracking_Nonlinear_Elliptic<RealT,DeviceT>>(pde->getFE());
+    qoi_vec[1] = ROL::makePtr<QoI_ControlPenalty_Nonlinear_Elliptic<RealT,DeviceT>>(pde->getFE());
+    std::vector<RealT> weights = {static_cast<RealT>(1), controlPenalty};
+    auto obj = ROL::makePtr<PDE_Objective<RealT,DeviceT>>(qoi_vec,weights,assembler);
+    auto robj = ROL::makePtr<ROL::Reduced_Objective_SimOpt<RealT>>(obj,con,up,zp,pp,true,false);
+
+    /*************************************************************************/
+    /***************** BUILD SAMPLER *****************************************/
+    /*************************************************************************/
+    int stochDim = 20;
+    int nsamp = parlist->sublist("Problem").get("Number of samples",100);
+    std::vector<RealT> tmp = {static_cast<RealT>(-1),static_cast<RealT>(1)};
+    std::vector<std::vector<RealT>> bounds(stochDim,tmp);
+    auto bman = ROL::makePtr<ROL::TpetraTeuchosBatchManager<RealT>>(comm);
+    auto sampler = ROL::makePtr<ROL::MonteCarloGenerator<RealT>>(nsamp,bounds,bman);
+
+    /*************************************************************************/
+    /***************** BUILD STOCHASTIC PROBLEM ******************************/
+    /*************************************************************************/
+    zp->zero();
+    ROL::OptimizationProblem<RealT> opt(robj,zp,bnd);
+    parlist->sublist("SOL").set("Initial Statistic", static_cast<RealT>(1));
+    opt.setStochasticObjective(*parlist,sampler);
+
+    parlist->sublist("Step").set("Type","Trust Region");
+    ROL::OptimizationSolver<RealT> solver(opt,*parlist);
+    solver.solve(*outStream);
+
+    // Output.
+    assembler->printMeshData(*outStream);
+    RealT tol(1.e-8);
+    con->solve(*rp,*up,*zp,tol);
+    con->outputTpetraVector(u_ptr,"state.txt");
+    con->outputTpetraVector(z_ptr,"control.txt");
+
+    Teuchos::Array<RealT> res(1,0);
+    con->value(*rp,*up,*zp,tol);
+    r_ptr->norm2(res.view(0,1));
+    *outStream << "Residual Norm: " << res[0] << std::endl;
+    errorFlag += (res[0] > 1.e-6 ? 1 : 0);
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/nonlinear-elliptic/obj_nonlinear_ellipticK.hpp b/packages/rol/example/PDE-OPT/nonlinear-elliptic/obj_nonlinear_ellipticK.hpp
new file mode 100644
index 000000000000..b13ba66418eb
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/nonlinear-elliptic/obj_nonlinear_ellipticK.hpp
@@ -0,0 +1,250 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  obj.hpp
+    \brief Provides the interface for local (cell-based) objective function computations.
+*/
+
+#ifndef PDEOPT_QOI_NONLINEAR_ELLIPTICK_HPP
+#define PDEOPT_QOI_NONLINEAR_ELLIPTICK_HPP
+
+#include "../TOOLS/qoiK.hpp"
+#include "pde_nonlinear_ellipticK.hpp"
+
+template <class Real, class DeviceType>
+class QoI_StateTracking_Nonlinear_Elliptic : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  ROL::Ptr<fe_type> fe_;
+  scalar_view target_;
+
+protected:
+  void computeTracking(void) {
+    int c = fe_->cubPts().extent_int(0);
+    int p = fe_->cubPts().extent_int(1);
+    int d = fe_->cubPts().extent_int(2);
+    std::vector<Real> pt(d);
+    target_ = scalar_view("target_", c,p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k)
+          pt[k] = (fe_->cubPts())(i,j,k);
+        target_(i,j) = targetFunc(pt);
+      }
+    }
+  }
+
+public:
+  QoI_StateTracking_Nonlinear_Elliptic(const ROL::Ptr<fe_type> &fe) : fe_(fe) {
+    computeTracking();
+  }
+
+  virtual Real targetFunc(const std::vector<Real> & x) const {
+    const Real zero(0), one(1);
+    Real val(0);
+    for (auto xi : x)
+      val = std::max(val, std::min(xi-zero, one-xi));
+    return val;
+    //const int size = x.size();
+    //bool flag = true;
+    //for (int i = 0; i < size; ++i) {
+    //  flag *= (x[i] < static_cast<Real>(0.6))*(x[i] > static_cast<Real>(0.4));
+    //}
+    //return (flag ? static_cast<Real>(1) : static_cast<Real>(0));
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = u_coeff.extent_int(0);
+    int p = fe_->cubPts().extent_int(1);
+    // Initialize output val
+    val = scalar_view("val", c);
+    // Evaluate state on FE basis
+    scalar_view valU_eval("valU_eval", c, p);
+    fe_->evaluateValue(valU_eval, u_coeff);
+    // Compute difference between state and target
+    rst::subtract(valU_eval,target_);
+    // Compute squared L2-norm of diff
+    fe_->computeIntegral(val,valU_eval,valU_eval);
+    // Scale by one half
+    rst::scale(val,static_cast<Real>(0.5));
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = u_coeff.extent_int(0);
+    int p = fe_->cubPts().extent_int(1);
+    int f = fe_->N().extent_int(1);
+    // Initialize output grad
+    grad = scalar_view("grad", c, f);
+    // Evaluate state on FE basis
+    scalar_view valU_eval("valU_eval", c, p);
+    fe_->evaluateValue(valU_eval, u_coeff);
+    // Compute difference between state and target
+    rst::subtract(valU_eval,target_);
+    // Compute gradient of squared L2-norm of diff
+    fst::integrate(grad,valU_eval,fe_->NdetJ(),false);
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_L2Tracking_Nonlinear_Elliptic::gradient_2 is zero.");
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    int c = v_coeff.extent_int(0);
+    int p = fe_->cubPts().extent_int(1);
+    int f = fe_->N().extent_int(1);
+    hess = scalar_view("hess", c, f);
+    scalar_view valV_eval("valV_eval", c, p);
+    fe_->evaluateValue(valV_eval, v_coeff);
+    fst::integrate(hess,valV_eval,fe_->NdetJ(),false);
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_L2Tracking_Nonlinear_Elliptic::HessVec_12 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_L2Tracking_Nonlinear_Elliptic::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_L2Tracking_Nonlinear_Elliptic::HessVec_22 is zero.");
+  }
+
+}; // QoI_L2Tracking
+
+template <class Real, class DeviceType>
+class QoI_ControlPenalty_Nonlinear_Elliptic : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  ROL::Ptr<fe_type> fe_;
+  scalar_view ones_;
+
+public:
+  QoI_ControlPenalty_Nonlinear_Elliptic(const ROL::Ptr<fe_type> &fe) : fe_(fe) {
+    // Get relevant dimensions
+    int c = fe_->gradN().extent_int(0);
+    int p = fe_->gradN().extent_int(2);
+    // Build field container of ones
+    ones_ = scalar_view("ones_",c,p);
+    Kokkos::deep_copy(ones_,static_cast<Real>(1));
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = fe_->gradN().extent_int(0);
+    int p = fe_->gradN().extent_int(2);
+    // Initialize output val
+    val = scalar_view("val", c);
+    // Build local state tracking term
+    scalar_view valZ_eval("valZ_eval", c, p);
+    fe_->evaluateValue(valZ_eval, z_coeff);
+    fe_->computeIntegral(val,ones_,valZ_eval);
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_L2Penalty_Nonlinear_Elliptic::gradient_1 is zero.");
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    // Initialize output grad
+    grad = scalar_view("grad", c, f);
+    // Build local gradient of state tracking term
+    fst::integrate(grad,ones_,fe_->NdetJ(),false);
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_L2Penalty_Nonlinear_Elliptic::HessVec_11 is zero.");
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_L2Penalty_Nonlinear_Elliptic::HessVec_12 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_L2Penalty_Nonlinear_Elliptic::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_L2Penalty_Nonlinear_Elliptic::HessVec_22 is zero.");
+  }
+
+}; // QoI_L2Penalty
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/nonlinear-elliptic/pde_nonlinear_ellipticK.hpp b/packages/rol/example/PDE-OPT/nonlinear-elliptic/pde_nonlinear_ellipticK.hpp
new file mode 100644
index 000000000000..74a91b92a638
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/nonlinear-elliptic/pde_nonlinear_ellipticK.hpp
@@ -0,0 +1,407 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  pde.hpp
+    \brief Implements the local PDE interface for the Poisson-Boltzmann control problem.
+*/
+
+#ifndef PDE_NONLINEAR_ELLIPTICK_HPP
+#define PDE_NONLINEAR_ELLIPTICK_HPP
+
+#include "../TOOLS/pdeK.hpp"
+#include "../TOOLS/feK.hpp"
+
+#include "Intrepid2_HGRAD_QUAD_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_QUAD_C2_FEM.hpp"
+#include "Intrepid2_DefaultCubatureFactory.hpp"
+#include "Intrepid2_FunctionSpaceTools.hpp"
+#include "Intrepid2_CellTools.hpp"
+
+#include "ROL_Ptr.hpp"
+
+template <class Real, class DeviceType>
+class PDE_Nonlinear_Elliptic : public PDE<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  // Finite element basis information
+  basis_ptr basisPtr_;
+  std::vector<basis_ptr> basisPtrs_;
+  // Cell cubature information
+  ROL::Ptr<Intrepid2::Cubature<DeviceType,Real,Real>> cellCub_;
+  // Cell node information
+  scalar_view volCellNodes_;
+  std::vector<std::vector<scalar_view>> bdryCellNodes_;
+  std::vector<std::vector<std::vector<int>>> bdryCellLocIds_;
+  // Finite element definition
+  ROL::Ptr<fe_type> fe_;
+
+  void computeBetaGradU(scalar_view & BgradU, const scalar_view gradU) const {
+    // GET DIMENSIONS
+    int c = fe_->gradN().extent_int(0);
+    int p = fe_->gradN().extent_int(2);
+    int d = fe_->gradN().extent_int(3);
+    std::vector<Real> U(d), BU(d);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k)
+          U[k] = gradU(i,j,k);
+        beta_value(BU,U);
+        for (int k = 0; k < d; ++k)
+          BgradU(i,j,k) = BU[k];
+      }
+    }
+  }
+
+  void computeBetaJacobianGradU(scalar_view BgradU, const scalar_view gradU) const {
+    // GET DIMENSIONS
+    int c = fe_->gradN().extent_int(0);
+    int p = fe_->gradN().extent_int(2);
+    int d = fe_->gradN().extent_int(3);
+    std::vector<Real> U(d);
+    std::vector<std::vector<Real>> BU(d, U);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k)
+          U[k] = gradU(i,j,k);
+        beta_jacobian(BU,U);
+        for (int k = 0; k < d; ++k) {
+          for (int m = 0; m < d; ++m)
+            BgradU(i,j,k,m) = BU[k][m];
+        }
+      }
+    }
+  }
+
+  void computeBetaHessianGradUGradL(scalar_view & BgradUgradL,
+                                    const scalar_view gradU,
+                                    const scalar_view gradL) const {
+    // GET DIMENSIONS
+    int c = fe_->gradN().extent_int(0);
+    int p = fe_->gradN().extent_int(2);
+    int d = fe_->gradN().extent_int(3);
+    std::vector<Real> U(d), L(d);
+    std::vector<std::vector<Real>> BU(d, U);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k) {
+          U[k] = gradU(i,j,k);
+          L[k] = gradL(i,j,k);
+        }
+        beta_hessian(BU,U,L);
+        for (int k = 0; k < d; ++k) {
+          for (int m = 0; m < d; ++m) {
+            BgradUgradL(i,j,k,m) = BU[k][m];
+          }
+        }
+      }
+    }
+  }
+
+  void computeDiffusivity(scalar_view &kappa) const {
+    // GET DIMENSIONS
+    int c = fe_->gradN().extent_int(0);
+    int p = fe_->gradN().extent_int(2);
+    int d = fe_->gradN().extent_int(3);
+    std::vector<Real> pt(d);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for ( int k = 0; k < d; ++k)
+          pt[k] = (fe_->cubPts())(i,j,k);
+        // Compute diffusivity kappa
+        kappa(i,j) = kappa_value(pt);
+      }
+    }
+  }
+
+  void computeRHS(scalar_view &rhs) const {
+    // GET DIMENSIONS
+    int c = fe_->gradN().extent_int(0);
+    int p = fe_->gradN().extent_int(2);
+    int d = fe_->gradN().extent_int(3);
+    std::vector<Real> pt(d);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for ( int k = 0; k < d; ++k)
+          pt[k] = (fe_->cubPts())(i,j,k);
+        // Compute diffusivity kappa
+        rhs(i,j) = -rhs_value(pt);
+      }
+    }
+  }
+
+public:
+  PDE_Nonlinear_Elliptic(ROL::ParameterList &parlist) {
+    // Finite element fields.
+    int basisOrder = parlist.sublist("Problem").get("Basis Order",1);
+    if (basisOrder == 1)
+      basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C1_FEM<DeviceType,Real,Real>>();
+    else if (basisOrder == 2)
+      basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C2_FEM<DeviceType,Real,Real>>();
+    basisPtrs_.clear(); basisPtrs_.push_back(basisPtr_);
+    // Quadrature rules.
+    shards::CellTopology cellType = basisPtr_->getBaseCellTopology();        // get the cell type from any basis
+    Intrepid2::DefaultCubatureFactory cubFactory;                            // create cubature factory
+    int cubDegree = parlist.sublist("Problem").get("Cubature Degree",2);     // set cubature degree, e.g., 2
+    cellCub_ = cubFactory.create<DeviceType,Real,Real>(cellType, cubDegree); // create default cubature
+  }
+
+  virtual void beta_value(std::vector<Real> &BU,
+                          const std::vector<Real> &U) const {
+    const int size = U.size();
+    Real magU(0);
+    for (int i = 0; i < size; ++i)
+      magU += U[i] * U[i];
+    for (int i = 0; i < size; ++i)
+      BU[i] = magU * U[i];
+  }
+
+  virtual void beta_jacobian(std::vector<std::vector<Real>> &BU,
+                             const std::vector<Real> &U) const {
+    const int size = U.size();
+    Real magU(0);
+    for (int i = 0; i < size; ++i)
+      magU += U[i] * U[i];
+    for (int i = 0; i < size; ++i) {
+      for (int j = 0; j < size; ++j)
+        BU[i][j] = static_cast<Real>(2) * U[j] * U[i];
+      BU[i][i] += magU;
+    }
+  }
+
+  virtual void beta_hessian(std::vector<std::vector<Real>> &BU,
+                            const std::vector<Real> &U,
+                            const std::vector<Real> &L) const {
+    const int size = U.size();
+    Real UL(0);
+    for (int i = 0; i < size; ++i)
+      UL += static_cast<Real>(2) * U[i] * L[i];
+    for (int i = 0; i < size; ++i) {
+      for (int j = 0; j < size; ++j)
+        BU[i][j] = static_cast<Real>(2) * (L[j] * U[i] + L[i] * U[j]);
+      BU[i][i] += UL;
+    }
+  }
+
+  Real kappa_value(const std::vector<Real> &x) const {
+    const std::vector<Real> param = PDE<Real,DeviceType>::getParameter();
+    const int ns = param.size(), d = x.size();
+    if (ns > d+1) { 
+      // random diffusion coefficient from i. babuska, f. nobile, r. tempone 2010.
+      // simplified model for random stratified media.
+      const Real one(1), two(2), three(3), eight(8), sixteen(16), half(0.5);
+      const Real lc = one/sixteen, sqrtpi = std::sqrt(M_PI);
+      const Real xi = std::sqrt(sqrtpi*lc), sqrt3 = std::sqrt(three);
+      Real f(0), phi(0);                                          
+      Real val = one + sqrt3*param[0]*std::sqrt(sqrtpi*lc*half);
+      Real arg = one + sqrt3*std::sqrt(sqrtpi*lc*half);
+      for (int i = 1; i < ns-(d+1); ++i) {
+        f = floor(half*static_cast<Real>(i+1));     
+        phi = ((i+1)%2 ? std::sin(f*M_PI*x[0]) : std::cos(f*M_PI*x[0]));
+        val += xi*std::exp(-std::pow(f*M_PI*lc,two)/eight)*phi*sqrt3*param[i];
+        arg += xi*std::exp(-std::pow(f*M_PI*lc,two)/eight)*std::abs(phi)*sqrt3;
+      }
+      return half + two*std::exp(val)/std::exp(arg);
+    }
+    return static_cast<Real>(1);
+  }
+
+  Real rhs_value(const std::vector<Real> &x) const {
+    const std::vector<Real> param = PDE<Real,DeviceType>::getParameter();
+    const int ns = param.size(), d = x.size();
+    if (ns) {
+      const int nk = ns - (d+1);
+      const Real ten(10), two(2), pi(M_PI);
+      Real val = std::pow(ten,two*param[nk]-two);
+      for (int i = 0; i < d; ++i)
+        val *= std::sin(two*param[nk+i+1]*pi*x[i]);
+      return val;
+    }
+    return static_cast<Real>(0);
+  }
+
+  void residual(scalar_view & res,
+                const scalar_view u_coeff,
+                const scalar_view z_coeff = scalar_view(),
+                const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // GET DIMENSIONS
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    int p = fe_->gradN().extent_int(2);
+    int d = fe_->gradN().extent_int(3);
+    // INITIALIZE RESIDUAL
+    res = scalar_view("res", c, f);
+    // COMPUTE STIFFNESS TERM
+    scalar_view gradU_eval("gradU_eval", c, p, d);
+    fe_->evaluateGradient(gradU_eval, u_coeff);
+    scalar_view kappa("kappa", c, p);
+    computeDiffusivity(kappa);
+    scalar_view KgradU("KgradU", c, p, d);
+    fst::scalarMultiplyDataData(KgradU,kappa,gradU_eval);
+    fst::integrate(res,KgradU,fe_->gradNdetJ(),false);
+    // ADD MASS TERM
+    scalar_view valU_eval("valU_eval", c, p);
+    fe_->evaluateValue(valU_eval, u_coeff);
+    fst::integrate(res,valU_eval,fe_->NdetJ(),true);
+    // ADD NONLINEAR TERM
+    scalar_view BgradU("BgradU", c, p, d);
+    computeBetaGradU(BgradU,gradU_eval);
+    fst::integrate(res,BgradU,fe_->gradNdetJ(),true);
+    // ADD RHS
+    scalar_view rhs("rhs", c, p);
+    computeRHS(rhs);
+    fst::integrate(res,rhs,fe_->NdetJ(),true);
+    // ADD CONTROL TERM
+    if ( z_coeff != scalar_view() ) {
+      scalar_view valZ_eval("valZ_eval", c, p);
+      fe_->evaluateValue(valZ_eval, z_coeff);
+      rst::scale(valZ_eval,static_cast<Real>(-1));
+      fst::integrate(res,valZ_eval,fe_->NdetJ(),true);
+    }
+  }
+
+  void Jacobian_1(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // GET DIMENSIONS
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    int p = fe_->gradN().extent_int(2);
+    int d = fe_->gradN().extent_int(3);
+    // INITIALIZE JACOBIAN
+    jac = scalar_view("jac", c, f, f);
+    // COMPUTE STIFFNESS TERM
+    scalar_view kappa("kappa", c, p);
+    computeDiffusivity(kappa);
+    scalar_view KgradN("KgradN", c, f, p, d);
+    fst::scalarMultiplyDataField(KgradN,kappa,fe_->gradN());
+    fst::integrate(jac,KgradN,fe_->gradNdetJ(),false);
+    // ADD MASS TERM
+    fst::integrate(jac,fe_->N(),fe_->NdetJ(),true);
+    // ADD NONLINEAR TERM
+    scalar_view gradU_eval("gradU_eval", c, p, d);
+    fe_->evaluateGradient(gradU_eval, u_coeff);
+    scalar_view BgradU("BgradU", c, p, d, d);
+    computeBetaJacobianGradU(BgradU,gradU_eval);
+    scalar_view BgradUgradN("BgradUgradN", c, f, p, d);
+    fst::tensorMultiplyDataField(BgradUgradN,BgradU,fe_->gradN());
+    fst::integrate(jac,BgradUgradN,fe_->gradNdetJ(),true);
+  }
+
+  void Jacobian_2(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    if ( z_coeff != scalar_view() ) {
+      // GET DIMENSIONS
+      int c = fe_->N().extent_int(0);
+      int f = fe_->N().extent_int(1);
+      // INITIALIZE JACOBIAN
+      jac = scalar_view("jac", c, f, f);
+      // ADD CONTROL TERM
+      fst::integrate(jac,fe_->N(),fe_->NdetJ(),false);
+      rst::scale(jac,static_cast<Real>(-1));
+    }
+  }
+
+  void Hessian_11(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // GET DIMENSIONS
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    int p = fe_->gradN().extent_int(2);
+    int d = fe_->gradN().extent_int(3);
+    // INITIALIZE JACOBIAN
+    hess = scalar_view("hess", c, f, f);
+    // COMPUTE NONLINEAR TERM
+    scalar_view gradU_eval("gradU_eval", c, p, d);
+    fe_->evaluateGradient(gradU_eval, u_coeff);
+    scalar_view gradL_eval("gradL_eval", c, p, d);
+    fe_->evaluateGradient(gradL_eval, l_coeff);
+    scalar_view BgradUgradL("BgradUgradL", c, p, d, d);
+    computeBetaHessianGradUGradL(BgradUgradL,gradU_eval,gradL_eval);
+    scalar_view BgradUgradLgradN("BgradUgradLgradN", c, f, p, d);
+    fst::tensorMultiplyDataField(BgradUgradLgradN,BgradUgradL,fe_->gradN());
+    fst::integrate(hess,BgradUgradLgradN,fe_->gradNdetJ(),false);
+  }
+
+  void Hessian_12(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Nonlinear_Elliptic:Hessian_12: Hessian is zero.");
+  }
+
+  void Hessian_21(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Nonlinear_Elliptic:Hessian_21: Hessian is zero.");
+  }
+
+  void Hessian_22(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Nonlinear_Elliptic:Hessian_22: Hessian is zero.");
+  }
+
+  void RieszMap_1(scalar_view & riesz) override {
+    // GET DIMENSIONS
+    int c = fe_->N().extent_int(0);
+    int f = fe_->N().extent_int(1);
+    // INITIALIZE RIESZ
+    riesz = scalar_view("riesz2", c, f, f);
+    Kokkos::deep_copy(riesz,fe_->stiffMat());
+    rst::add(riesz,fe_->massMat());
+  }
+
+  void RieszMap_2(scalar_view & riesz) override {
+    // GET DIMENSIONS
+    int c = fe_->N().extent_int(0);
+    int f = fe_->N().extent_int(1);
+    // INITIALIZE RIESZ
+    riesz = scalar_view("riesz2", c, f, f);
+    Kokkos::deep_copy(riesz,fe_->massMat());
+  }
+
+  std::vector<basis_ptr> getFields() override {
+    return basisPtrs_;
+  }
+
+  void setCellNodes(const scalar_view &volCellNodes,
+                    const std::vector<std::vector<scalar_view>> &bdryCellNodes,
+                    const std::vector<std::vector<std::vector<int>>> &bdryCellLocIds) override {
+    volCellNodes_ = volCellNodes;
+    bdryCellNodes_ = bdryCellNodes;
+    bdryCellLocIds_ = bdryCellLocIds;
+    // Finite element definition.
+    fe_ = ROL::makePtr<fe_type>(volCellNodes_,basisPtr_,cellCub_);
+  }
+
+  const ROL::Ptr<fe_type> getFE(void) const {
+    return fe_;
+  }
+
+}; // PDE_Nonlinear_Elliptic
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/obstacle/CMakeLists.txt b/packages/rol/example/PDE-OPT/obstacle/CMakeLists.txt
index 850e2ea7b9db..08e00b6b9bef 100644
--- a/packages/rol/example/PDE-OPT/obstacle/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/obstacle/CMakeLists.txt
@@ -8,7 +8,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   # Need ROL_TpetraMultiVector.hpp
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
 
-  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  ROL_ADD_EXECUTABLE_AND_TEST(
     example_01
     SOURCES example_01.cpp
     ARGS PrintItAll
@@ -18,7 +18,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     ObstacleDataCopy
     SOURCE_FILES
       input.xml plotresults.m
@@ -26,5 +26,33 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
   )
 
+ENDIF()
+
+IF(${PROJECT_NAME}_ENABLE_Intrepid2 AND
+   ${PROJECT_NAME}_ENABLE_Ifpack2   AND
+   ${PROJECT_NAME}_ENABLE_MueLu     AND
+   ${PROJECT_NAME}_ENABLE_Amesos2   AND
+   ${PROJECT_NAME}_ENABLE_Tpetra )
+
+  # Need ROL_TpetraMultiVector.hpp
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
+
+  ROL_ADD_EXECUTABLE_AND_TEST(
+    example_01_Kokkos
+    SOURCES example_01K.cpp
+    ARGS PrintItAll
+    NUM_MPI_PROCS 4
+    NUM_TOTAL_CORES_USED 4
+    PASS_REGULAR_EXPRESSION "TEST PASSED"
+    ADD_DIR_TO_NAME
+  )
+
+  ROL_COPY_FILES_TO_BINARY_DIR(
+    ObstacleDataCopyK
+    SOURCE_FILES
+      input.xml plotresults.m
+    SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}"
+    DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
+  )
 
 ENDIF()
diff --git a/packages/rol/example/PDE-OPT/obstacle/energy_objectiveK.hpp b/packages/rol/example/PDE-OPT/obstacle/energy_objectiveK.hpp
new file mode 100644
index 000000000000..4ffbfe1f652f
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/obstacle/energy_objectiveK.hpp
@@ -0,0 +1,134 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+#ifndef ROL_PDEOPT_ENERGY_OBJECTIVEK_HPP
+#define ROL_PDEOPT_ENERGY_OBJECTIVEK_HPP
+
+#include "ROL_Objective.hpp"
+#include "../TOOLS/assemblerK.hpp"
+
+// Do not instantiate the template in this translation unit.
+extern template class Assembler<double,Kokkos::HostSpace>;
+
+template<class Real, class DeviceType>
+class EnergyObjective : public ROL::Objective<Real> {
+private:
+  const ROL::Ptr<PDE<Real,DeviceType>> pde_;
+  ROL::Ptr<Assembler<Real,DeviceType>> assembler_;
+  bool assembleRHS_, assembleJ1_;
+
+  ROL::Ptr<Tpetra::MultiVector<>> cvec_;
+  ROL::Ptr<Tpetra::MultiVector<>> uvec_;
+
+  ROL::Ptr<Tpetra::MultiVector<>> res_;
+  ROL::Ptr<Tpetra::CrsMatrix<>>   jac_;
+
+  void assemble(void) {
+    // Assemble affine term.
+    if (assembleRHS_)
+      assembler_->assemblePDEResidual(res_,pde_,uvec_);
+    assembleRHS_ = false;
+    // Assemble jacobian_1.
+    if (assembleJ1_)
+      assembler_->assemblePDEJacobian1(jac_,pde_,uvec_);
+    assembleJ1_ = false;
+  }
+
+public:
+  EnergyObjective(const ROL::Ptr<PDE<Real,DeviceType>> &pde,
+                  const ROL::Ptr<MeshManager<Real,DeviceType>> &meshMgr,
+                  const ROL::Ptr<const Teuchos::Comm<int>> &comm,
+                  ROL::ParameterList &parlist,
+                  std::ostream &outStream = std::cout)
+    : pde_(pde), assembleRHS_(true), assembleJ1_(true) {
+    // Construct assembler.
+    assembler_ = ROL::makePtr<Assembler<Real,DeviceType>>(pde_->getFields(),meshMgr,comm,parlist,outStream);
+    assembler_->setCellNodes(*pde_);
+    // Initialize zero vectors.
+    cvec_ = assembler_->createResidualVector();
+    uvec_ = assembler_->createStateVector();
+    uvec_->putScalar(static_cast<Real>(0));
+    assemble();
+  }
+
+  const ROL::Ptr<Assembler<Real,DeviceType>> getAssembler(void) const {
+    return assembler_;
+  }
+
+  Real value(const ROL::Vector<Real> &u, Real &tol) {
+    auto uf = getConstField(u);
+    const Real half(0.5), one(1);
+    jac_->apply(*uf,*cvec_);
+    cvec_->update(one,*res_,half);
+    Teuchos::Array<Real> val(1,0);
+    cvec_->dot(*uf, val.view(0,1));
+    return val[0];
+  }
+
+  void gradient(ROL::Vector<Real> &g, const ROL::Vector<Real> &u, Real &tol) {
+    auto gf = getField(g);
+    auto uf = getConstField(u);
+    const Real one(1);
+    gf->scale(one,*res_);
+    jac_->apply(*uf,*cvec_);
+    gf->update(one,*cvec_,one);
+  }
+
+  void hessVec(ROL::Vector<Real> &hv, const ROL::Vector<Real> &v, const ROL::Vector<Real> &u, Real &tol) {
+    auto hvf = getField(hv);
+    auto vf = getConstField(v);
+    auto uf = getConstField(u);
+    jac_->apply(*vf,*hvf);
+  }
+
+  void precond(ROL::Vector<Real> &Pv, const ROL::Vector<Real> &v, const ROL::Vector<Real> &u, Real &tol) {
+    Pv.set(v.dual());
+  }
+
+private: // Vector accessor functions
+
+  ROL::Ptr<const Tpetra::MultiVector<>> getConstField(const ROL::Vector<Real> &x) const {
+    ROL::Ptr<const Tpetra::MultiVector<>> xp;
+    try {
+      xp = dynamic_cast<const ROL::TpetraMultiVector<Real>&>(x).getVector();
+    }
+    catch (std::exception &e) {
+      ROL::Ptr<const ROL::TpetraMultiVector<Real>> xvec
+        = dynamic_cast<const PDE_OptVector<Real>&>(x).getField();
+      if (xvec == ROL::nullPtr) {
+        xp = ROL::nullPtr;
+      }
+      else {
+        xp = xvec->getVector();
+      }
+    }
+    return xp;
+  }
+
+  ROL::Ptr<Tpetra::MultiVector<> > getField(ROL::Vector<Real> &x) const {
+    ROL::Ptr<Tpetra::MultiVector<>> xp;
+    try {
+      xp = dynamic_cast<ROL::TpetraMultiVector<Real>&>(x).getVector();
+    }
+    catch (std::exception &e) {
+      ROL::Ptr<ROL::TpetraMultiVector<Real>> xvec
+        = dynamic_cast<PDE_OptVector<Real>&>(x).getField();
+      if ( xvec == ROL::nullPtr ) {
+        xp = ROL::nullPtr;
+      }
+      else {
+        xp = xvec->getVector();
+      }
+    }
+    return xp;
+  }
+
+};
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/obstacle/example_01.cpp b/packages/rol/example/PDE-OPT/obstacle/example_01.cpp
index dbc756bf5ce6..8627a3da79b9 100644
--- a/packages/rol/example/PDE-OPT/obstacle/example_01.cpp
+++ b/packages/rol/example/PDE-OPT/obstacle/example_01.cpp
@@ -13,8 +13,7 @@
 
 #include "Teuchos_Comm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -94,7 +93,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int>> comm
     = Tpetra::getDefaultComm();
   const int myRank = comm->getRank();
diff --git a/packages/rol/example/PDE-OPT/obstacle/example_01K.cpp b/packages/rol/example/PDE-OPT/obstacle/example_01K.cpp
new file mode 100644
index 000000000000..25ec069c30e6
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/obstacle/example_01K.cpp
@@ -0,0 +1,177 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  example_01.cpp
+    \brief Shows how to solve the obstacle problem.
+*/
+
+#include "Teuchos_Comm.hpp"
+#include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
+
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Version.hpp"
+
+#include <iostream>
+#include <algorithm>
+
+#include "ROL_Solver.hpp"
+#include "ROL_Bounds.hpp"
+
+#include "../TOOLS/meshmanagerK.hpp"
+#include "../TOOLS/pdeobjectiveK.hpp"
+#include "../TOOLS/pdevectorK.hpp"
+#include "pde_obstacleK.hpp"
+#include "energy_objectiveK.hpp"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+
+template <class Real>
+Real evaluateUpperBound(const std::vector<Real> & coord) {
+  Real distance(0), one(1);
+  if (coord[1] < coord[0]) {
+    if (coord[1] < one - coord[0]) distance = coord[1];
+    else                           distance = one - coord[0];
+  }
+  else {
+    if (coord[1] < one - coord[0]) distance = coord[0];
+    else                           distance = one - coord[1];
+  }
+  return distance;
+}
+
+
+template <class Real,class DeviceType>
+void computeUpperBound(const ROL::Ptr<Tpetra::MultiVector<>> & ubVec,
+                       const ROL::Ptr<const FE<Real,DeviceType>> & fe,
+                       const Kokkos::DynRankView<Real,DeviceType> cellNodes,
+                       const Kokkos::DynRankView<int,DeviceType> cellDofs,
+                       const Teuchos::Array<typename Tpetra::Map<>::global_ordinal_type> & cellIds) {
+  int c = fe->gradN().extent_int(0);
+  int f = fe->gradN().extent_int(1);
+  int d = fe->gradN().extent_int(3);
+  Kokkos::DynRankView<Real,DeviceType> dofPoints("dofPoints",c,f,d);
+  fe->computeDofCoords(dofPoints, cellNodes);
+  
+  std::vector<Real> coord(d);
+  for (int i = 0; i < c; ++i) {
+    for (int j = 0; j < f; ++j) {
+      int fidx = cellDofs(cellIds[i],j);
+      if (ubVec->getMap()->isNodeGlobalElement(fidx)) {
+        for (int k = 0; k < d; ++k)
+          coord[k] = dofPoints(i,j,k);
+        ubVec->replaceGlobalValue(fidx,
+                                  0,
+                                  evaluateUpperBound<Real>(coord));
+      }
+    }
+  }
+} 
+
+
+int main(int argc, char *argv[]) {
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+
+  /*** Initialize communicator. ***/
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  Kokkos::ScopeGuard kokkosScope (argc, argv);
+  ROL::Ptr<const Teuchos::Comm<int>> comm
+    = Tpetra::getDefaultComm();
+  const int myRank = comm->getRank();
+  if ((iprint > 0) && (myRank == 0))
+    outStream = ROL::makePtrFromRef(std::cout); 
+  else
+    outStream = ROL::makePtrFromRef(bhs);
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+
+    /*** Read in XML input ***/
+    std::string filename = "input.xml";
+    auto parlist = ROL::getParametersFromXmlFile(filename);
+
+    /*** Initialize PDE describing the obstacle problem ***/
+    auto meshMgr = ROL::makePtr<MeshManager_Rectangle<RealT,DeviceT>>(*parlist);
+    auto pde = ROL::makePtr<PDE_Obstacle<RealT,DeviceT>>(*parlist);
+    auto obj = ROL::makePtr<EnergyObjective<RealT,DeviceT>>(pde,meshMgr,comm,*parlist,*outStream);
+    auto assembler = obj->getAssembler();
+
+    // Create state vectors
+    auto u_ptr = assembler->createStateVector();
+    auto du_ptr = assembler->createStateVector();
+    u_ptr->randomize(); du_ptr->randomize();
+    auto up  = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(u_ptr,pde,assembler);
+    auto dup = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(du_ptr,pde,assembler);
+
+    // Build bound constraints
+    auto lo_ptr = assembler->createStateVector();
+    auto hi_ptr = assembler->createStateVector();
+    lo_ptr->putScalar(0.0); hi_ptr->putScalar(1.0);
+    computeUpperBound<RealT,DeviceT>(hi_ptr,pde->getFE(),
+                             pde->getCellNodes(),
+                             assembler->getDofManager()->getCellDofs(),
+                             assembler->getCellIds());
+    auto lop = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(lo_ptr,pde,assembler);
+    auto hip = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(hi_ptr,pde,assembler);
+    auto bnd = ROL::makePtr<ROL::Bounds<RealT>>(lop,hip);
+
+    // Run derivative checks
+    obj->checkGradient(*up,*dup,true,*outStream);
+    obj->checkHessVec(*up,*dup,true,*outStream);
+
+    // Build optimization problem
+    du_ptr->putScalar(0.4);
+    auto opt = ROL::makePtr<ROL::Problem<RealT>>(obj,up);
+    opt->addBoundConstraint(bnd);
+    opt->finalize(false,true,*outStream);
+
+    up->set(*dup);
+    parlist->sublist("Step").set("Type","Trust Region");
+    ROL::Solver<RealT> solverTR(opt,*parlist);
+    std::clock_t timerTR = std::clock();
+    solverTR.solve(*outStream);
+    *outStream << "Trust Region Time: "
+               << static_cast<RealT>(std::clock()-timerTR)/static_cast<RealT>(CLOCKS_PER_SEC)
+               << " seconds." << std::endl << std::endl;
+
+    up->set(*dup);
+    parlist->sublist("Step").set("Type","Primal-Dual Active Set");
+    ROL::Solver<RealT> solverPDAS(opt,*parlist);
+    std::clock_t timerPDAS = std::clock();
+    solverPDAS.solve(*outStream);
+    *outStream << "Primal Dual Active Set Time: "
+               << static_cast<RealT>(std::clock()-timerPDAS)/static_cast<RealT>(CLOCKS_PER_SEC)
+               << " seconds." << std::endl << std::endl;
+
+    // Output.
+    assembler->printMeshData(*outStream);
+    assembler->outputTpetraVector(u_ptr,"state.txt");
+    assembler->outputTpetraVector(hi_ptr,"upperBound.txt");
+
+    // Get a summary from the time monitor.
+    Teuchos::TimeMonitor::summarize();
+
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/obstacle/pde_obstacleK.hpp b/packages/rol/example/PDE-OPT/obstacle/pde_obstacleK.hpp
new file mode 100644
index 000000000000..bb684a0eb197
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/obstacle/pde_obstacleK.hpp
@@ -0,0 +1,176 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  pde.hpp
+    \brief Implements the local PDE interface for the obstacle problem.
+*/
+
+#ifndef PDE_POISSON_OBSTACLEK_HPP
+#define PDE_POISSON_OBSTACLEK_HPP
+
+#include "../TOOLS/pdeK.hpp"
+#include "../TOOLS/feK.hpp"
+
+#include "Intrepid2_HGRAD_QUAD_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_QUAD_C2_FEM.hpp"
+#include "Intrepid2_DefaultCubatureFactory.hpp"
+#include "Intrepid2_FunctionSpaceTools.hpp"
+#include "Intrepid2_CellTools.hpp"
+
+#include "ROL_Ptr.hpp"
+
+template <class Real, class DeviceType>
+class PDE_Obstacle : public PDE<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  // Finite element basis information
+  basis_ptr basisPtr_;
+  std::vector<basis_ptr> basisPtrs_;
+  // Cell cubature information
+  ROL::Ptr<Intrepid2::Cubature<DeviceType,Real,Real>> cellCub_;
+  // Cell node information
+  scalar_view volCellNodes_;
+  // Finite element definition
+  ROL::Ptr<fe_type> fe_vol_;
+  // Load term
+  scalar_view load_;
+  Real coeff_;
+  // Use Riesz map for state variables?
+  bool useStateRiesz_;
+
+  Real evaluateLoad(const std::vector<Real> &coord) const {
+    return static_cast<Real>(2)*coeff_;
+  }
+
+public:
+  PDE_Obstacle(ROL::ParameterList &parlist) {
+    // Finite element fields.
+    int basisOrder = parlist.sublist("Problem").get("Basis Order",1);
+    if (basisOrder == 1)
+      basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C1_FEM<DeviceType,Real,Real>>();
+    else if (basisOrder == 2)
+      basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C2_FEM<DeviceType,Real,Real>>();
+    basisPtrs_.clear(); basisPtrs_.push_back(basisPtr_);
+    // Quadrature rules.
+    shards::CellTopology cellType = basisPtr_->getBaseCellTopology();                  // get the cell type from any basis
+    Intrepid2::DefaultCubatureFactory cubFactory;                                      // create cubature factory
+    int cubDegree = parlist.sublist("Problem").get("Cubature Degree", 2);              // set cubature degree, e.g., 2
+    cellCub_ = cubFactory.create<DeviceType,Real,Real>(cellType, cubDegree);           // create default cubature
+    coeff_ = parlist.sublist("Problem").get("Load Magnitude", 1.0);                    // Pointwise load magnitude
+    useStateRiesz_ = parlist.sublist("Problem").get("Use State Riesz Map", true);      // use Riesz map for state variables?
+  }
+
+  void residual(scalar_view & res,
+                const scalar_view u_coeff,
+                const scalar_view z_coeff = scalar_view(),
+                const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // GET DIMENSIONS
+    int c = fe_vol_->gradN().extent_int(0);
+    int f = fe_vol_->gradN().extent_int(1);
+    int p = fe_vol_->gradN().extent_int(2);
+    int d = fe_vol_->gradN().extent_int(3);
+    // INITIALIZE RESIDUAL
+    res = scalar_view("res", c, f);
+    // COMPUTE STIFFNESS TERM
+    scalar_view gradU_eval("gradU_eval", c, p, d);
+    fe_vol_->evaluateGradient(gradU_eval, u_coeff);
+    fst::integrate(res,gradU_eval,fe_vol_->gradNdetJ(),false);
+//    // ADD MASS TERM
+//    scalar_view valU_eval("valU_eval", c, p);
+//    fe_vol_->evaluateValue(valU_eval, u_coeff);
+//    fst::integrate(res,valU_eval,fe_vol_->NdetJ(),true);
+    // ADD LOAD TERM
+    fst::integrate(res,load_,fe_vol_->NdetJ(),true);
+  }
+
+  void Jacobian_1(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // GET DIMENSIONS
+    int c = fe_vol_->gradN().extent_int(0);
+    int f = fe_vol_->gradN().extent_int(1);
+    // INITIALIZE JACOBIAN
+    jac = scalar_view("jac", c, f, f);
+    // COMPUTE STIFFNESS TERM
+    Kokkos::deep_copy(jac,fe_vol_->stiffMat());
+//    // ADD MASS TERM
+//    rst::add(jac,fe_vol_->massMat());
+  }
+
+  void Hessian_11(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Obstacle::Hessian_11): Hessian is zero.");
+  }
+
+  void RieszMap_1(scalar_view & riesz) override {
+    // Optionally disable Riesz map ...
+    if (!useStateRiesz_)
+      throw Exception::NotImplemented(">>> (PDE_Obstacle::RieszMap_1): Not implemented.");
+
+    // ...otherwise ...
+
+    int c = fe_vol_->N().extent_int(0);
+    int f = fe_vol_->N().extent_int(1);
+    // INITIALIZE RIESZ MAP
+    riesz = scalar_view("riesz1", c, f, f);
+    Kokkos::deep_copy(riesz,fe_vol_->stiffMat());
+    rst::add(riesz,fe_vol_->massMat());
+  }
+
+  std::vector<basis_ptr> getFields() override {
+    return basisPtrs_;
+  }
+
+  void setCellNodes(const scalar_view &volCellNodes,
+                    const std::vector<std::vector<scalar_view>> &bdryCellNodes,
+                    const std::vector<std::vector<std::vector<int>>> &bdryCellLocIds) override {
+    volCellNodes_ = volCellNodes;
+    // Finite element definition.
+    fe_vol_ = ROL::makePtr<fe_type>(volCellNodes_,basisPtr_,cellCub_);
+    // Compute load function
+    computeLoad();
+  }
+
+  void computeLoad(void) {
+    int c = fe_vol_->gradN().extent_int(0);
+    int p = fe_vol_->gradN().extent_int(2);
+    int d = fe_vol_->gradN().extent_int(3);
+    std::vector<Real> coord(d);
+    load_ = scalar_view("load_", c,p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k)
+          coord[k] = (fe_vol_->cubPts())(i,j,k);
+        load_(i,j) = -evaluateLoad(coord);
+      }
+    }
+  }
+
+  const ROL::Ptr<fe_type> getFE(void) const {
+    return fe_vol_;
+  }
+
+  const scalar_view getCellNodes(void) const {
+    return volCellNodes_;
+  }
+
+}; // PDE_Obstacle
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/poisson-boltzmann/CMakeLists.txt b/packages/rol/example/PDE-OPT/poisson-boltzmann/CMakeLists.txt
index 080cc58f5c61..e24dd3826fb8 100644
--- a/packages/rol/example/PDE-OPT/poisson-boltzmann/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/poisson-boltzmann/CMakeLists.txt
@@ -1,8 +1,7 @@
-
 IF(${PROJECT_NAME}_ENABLE_Intrepid AND
-   ${PROJECT_NAME}_ENABLE_Ifpack2  AND
-   ${PROJECT_NAME}_ENABLE_MueLu    AND
-   ${PROJECT_NAME}_ENABLE_Amesos2  AND
+   ${PROJECT_NAME}_ENABLE_Ifpack2   AND
+   ${PROJECT_NAME}_ENABLE_MueLu     AND
+   ${PROJECT_NAME}_ENABLE_Amesos2   AND
    ${PROJECT_NAME}_ENABLE_Tpetra )
 
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/teuchos/src/sol)
@@ -13,7 +12,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   # Need ROL_TpetraTeuchosBatchManager.hpp
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
 
-  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  ROL_ADD_EXECUTABLE_AND_TEST(
     example_01
     SOURCES example_01.cpp
     ARGS PrintItAll
@@ -23,19 +22,19 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_02
     SOURCES example_02.cpp
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_03
     SOURCES example_03.cpp
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     PoissonBoltzmannDataCopy
     SOURCE_FILES
       input.xml input_ex02.xml input_ex03.xml plotresults.m
@@ -43,5 +42,51 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
   )
 
+ENDIF()
+
+
+IF(${PROJECT_NAME}_ENABLE_Intrepid2 AND
+   ${PROJECT_NAME}_ENABLE_Ifpack2   AND
+   ${PROJECT_NAME}_ENABLE_MueLu     AND
+   ${PROJECT_NAME}_ENABLE_Amesos2   AND
+   ${PROJECT_NAME}_ENABLE_Tpetra )
+
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/teuchos/src/sol)
+
+  # Need ROL_TpetraMultiVector.hpp
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
+
+  # Need ROL_TpetraTeuchosBatchManager.hpp
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
+
+  ROL_ADD_EXECUTABLE_AND_TEST(
+    example_01_Kokkos
+    SOURCES example_01K.cpp
+    ARGS PrintItAll
+    NUM_MPI_PROCS 4
+    NUM_TOTAL_CORES_USED 4
+    PASS_REGULAR_EXPRESSION "TEST PASSED"
+    ADD_DIR_TO_NAME
+  )
+
+  ROL_ADD_EXECUTABLE(
+    example_02_Kokkos
+    SOURCES example_02K.cpp
+    ADD_DIR_TO_NAME
+  )
+
+  ROL_ADD_EXECUTABLE(
+    example_03_Kokkos
+    SOURCES example_03K.cpp
+    ADD_DIR_TO_NAME
+  )
+
+  ROL_COPY_FILES_TO_BINARY_DIR(
+    PoissonBoltzmannDataCopyK
+    SOURCE_FILES
+      input.xml input_ex02.xml input_ex03.xml plotresults.m
+    SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}"
+    DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
+  )
 
 ENDIF()
diff --git a/packages/rol/example/PDE-OPT/poisson-boltzmann/dopingK.hpp b/packages/rol/example/PDE-OPT/poisson-boltzmann/dopingK.hpp
new file mode 100644
index 000000000000..ba48ef8a50ab
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/poisson-boltzmann/dopingK.hpp
@@ -0,0 +1,171 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+#ifndef DOPINGK_HPP
+#define DOPINGK_HPP
+
+#include "../TOOLS/feK.hpp"
+#include "Teuchos_ParameterList.hpp"
+#include "Tpetra_MultiVector.hpp"
+#include "ROL_Types.hpp"
+
+template <class Real, class DeviceType>
+class Doping {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using int_view = Kokkos::DynRankView<int, DeviceType>;
+  using fe_type = FE<Real, DeviceType>;
+
+private:
+  typedef Tpetra::Map<>::global_ordinal_type GO;
+
+  const ROL::Ptr<fe_type> fe_;
+  scalar_view cellNodes_;
+  int_view cellDofs_;
+  const Teuchos::Array<GO> cellIds_;
+  Real a_, b_, wx_, wy_;
+
+public:
+  Doping(const ROL::Ptr<fe_type> & fe,
+         const scalar_view cellNodes,
+         const int_view cellDofs,
+         const Teuchos::Array<GO> & cellIds,
+         Teuchos::ParameterList &parlist)
+    : fe_(fe), cellNodes_(cellNodes), cellDofs_(cellDofs), cellIds_(cellIds) {
+    // Get parameters from parameter list
+    a_  = parlist.sublist("Problem").get("Desired Lower Doping Value", 0.3);
+    b_  = parlist.sublist("Problem").get("Desired Upper Doping Value", 1.0);
+    wx_ = parlist.sublist("Geometry").get("Width",0.6);
+    wy_ = parlist.sublist("Geometry").get("Height",0.2);
+  }
+
+  Real evaluate(const std::vector<Real> &x) const {
+    Real zero(0), dx(wx_/6.0), dy(wy_/4.0);
+    Real eps = std::sqrt(ROL::ROL_EPSILON<Real>());
+    bool inRegion1 = ((x[0] > zero-eps)   && (x[0] < dx+eps)  && (x[1] > wy_-dy-eps) && (x[1] < wy_+eps));
+    bool inRegion2 = ((x[0] > wx_-dx-eps) && (x[0] < wx_+eps) && (x[1] > wy_-dy-eps) && (x[1] < wy_+eps));
+    return (inRegion1 || inRegion2 ? b_ : a_);
+  }
+
+  void build(const ROL::Ptr<Tpetra::MultiVector<>> &dpVec) const {
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    int d = fe_->gradN().extent_int(3);
+    scalar_view dofPoints("dofPoints", c ,f, d);
+    fe_->computeDofCoords(dofPoints, cellNodes_);
+
+    std::vector<Real> coord(d);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < f; ++j) {
+        int fidx = (cellDofs_)(cellIds_[i], j);
+        if (dpVec->getMap()->isNodeGlobalElement(fidx)) {
+          for (int k = 0; k < d; ++k) {
+            coord[k] = (dofPoints)(i, j, k);
+          }
+          dpVec->replaceGlobalValue(fidx,
+                                    0,
+                                    evaluate(coord));
+        }
+      }
+    }
+  }
+};
+
+template <class Real, class DeviceType>
+class DopingBounds {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using int_view = Kokkos::DynRankView<int, DeviceType>;
+  using fe_type = FE<Real, DeviceType>;
+
+private:
+  typedef Tpetra::Map<>::global_ordinal_type GO;
+
+  const ROL::Ptr<fe_type> fe_;
+  scalar_view cellNodes_;
+  int_view cellDofs_;
+  const Teuchos::Array<GO> cellIds_;
+  Real a_, b_, wx_, wy_, lo_, hi_;
+  bool useConstant_;
+
+public:
+  DopingBounds(const ROL::Ptr<fe_type> &fe,
+               const scalar_view cellNodes,
+               const int_view cellDofs,
+               const Teuchos::Array<GO> & cellIds,
+               Teuchos::ParameterList &parlist)
+    : fe_(fe), cellNodes_(cellNodes), cellDofs_(cellDofs), cellIds_(cellIds) {
+    // Get parameters from parameter list
+    a_  = parlist.sublist("Problem").get("Desired Lower Doping Value", 0.3);
+    b_  = parlist.sublist("Problem").get("Desired Upper Doping Value", 1.0);
+    wx_ = parlist.sublist("Geometry").get("Width",0.6);
+    wy_ = parlist.sublist("Geometry").get("Height",0.2);
+    lo_ = parlist.sublist("Problem").get("Lower Bound Value",  0.0);
+    hi_ = parlist.sublist("Problem").get("Upper Bound Value", 10.0);
+    useConstant_ = parlist.sublist("Problem").get("Use Constant Bounds",true);
+  }
+
+  Real evaluateLower(const std::vector<Real> &x) const {
+    if (!useConstant_) {
+      const Real zero(0), two(2), four(4), dx(wx_/6.0);
+      const Real eps = std::sqrt(ROL::ROL_EPSILON<Real>());
+      const bool inRegion1 = ((x[0] > zero-eps)   && (x[0] < dx+eps)      && (x[1] > wy_-eps) && (x[1] < wy_+eps));
+      const bool inRegion2 = ((x[0] > two*dx-eps) && (x[0] < four*dx+eps) && (x[1] > wy_-eps) && (x[1] < wy_+eps));
+      const bool inRegion3 = ((x[0] > wx_-dx-eps) && (x[0] < wx_+eps)     && (x[1] > wy_-eps) && (x[1] < wy_+eps));
+      return (inRegion1 || inRegion3 ? b_ : (inRegion2 ? a_ : lo_));
+    }
+    return lo_;
+  }
+
+  Real evaluateUpper(const std::vector<Real> &x) const {
+    if (!useConstant_) {
+      const Real zero(0), two(2), four(4), dx(wx_/6.0);
+      const Real eps = std::sqrt(ROL::ROL_EPSILON<Real>());
+      const bool inRegion1 = ((x[0] > zero-eps)   && (x[0] < dx+eps)      && (x[1] > wy_-eps) && (x[1] < wy_+eps));
+      const bool inRegion2 = ((x[0] > two*dx-eps) && (x[0] < four*dx+eps) && (x[1] > wy_-eps) && (x[1] < wy_+eps));
+      const bool inRegion3 = ((x[0] > wx_-dx-eps) && (x[0] < wx_+eps)     && (x[1] > wy_-eps) && (x[1] < wy_+eps));
+      return (inRegion1 || inRegion3 ? b_ : (inRegion2 ? a_ : hi_));
+    }
+    return hi_;
+  }
+
+  void build(const ROL::Ptr<Tpetra::MultiVector<>> & loVec,
+             const ROL::Ptr<Tpetra::MultiVector<>> & hiVec) const {
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    int d = fe_->gradN().extent_int(3);
+    scalar_view dofPoints("dofPoint", c, f, d);
+    fe_->computeDofCoords(dofPoints, cellNodes_);
+
+    std::vector<Real> coord(d);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < f; ++j) {
+        int fidx = (cellDofs_)(cellIds_[i], j);
+        if (loVec->getMap()->isNodeGlobalElement(fidx)) {
+          for (int k = 0; k < d; ++k) {
+            coord[k] = dofPoints(i, j, k);
+          }
+          loVec->replaceGlobalValue(fidx,
+                                    0,
+                                    evaluateLower(coord));
+        }
+        if (hiVec->getMap()->isNodeGlobalElement(fidx)) {
+          for (int k = 0; k < d; ++k) {
+            coord[k] = dofPoints(i,j,k);
+          }
+          hiVec->replaceGlobalValue(fidx,
+                                    0,
+                                    evaluateUpper(coord));
+        }
+      }
+    }
+  }
+};
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/poisson-boltzmann/example_01.cpp b/packages/rol/example/PDE-OPT/poisson-boltzmann/example_01.cpp
index 7c9365d170ec..b7c81dc2720a 100644
--- a/packages/rol/example/PDE-OPT/poisson-boltzmann/example_01.cpp
+++ b/packages/rol/example/PDE-OPT/poisson-boltzmann/example_01.cpp
@@ -13,8 +13,7 @@
 
 #include "Teuchos_Comm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -40,7 +39,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int>> comm
     = Tpetra::getDefaultComm();
   const int myRank = comm->getRank();
@@ -117,8 +116,11 @@ int main(int argc, char *argv[]) {
       problem = ROL::makePtr<ROL::Problem<RealT>>(obj,x);
     problem->addConstraint("PDE",con,pp);
     problem->finalize(false,true,*outStream);
+    Teuchos::Time algoTimer("Algorithm Time", true);
     ROL::Solver<RealT> solver(problem,*parlist);
     solver.solve(*outStream);
+    algoTimer.stop();
+    *outStream << "Total optimization time = " << algoTimer.totalElapsedTime() << " seconds.\n";
 
     // Output.
     con->getAssembler()->printMeshData(*outStream);
diff --git a/packages/rol/example/PDE-OPT/poisson-boltzmann/example_01K.cpp b/packages/rol/example/PDE-OPT/poisson-boltzmann/example_01K.cpp
new file mode 100644
index 000000000000..3f93a1d5fe47
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/poisson-boltzmann/example_01K.cpp
@@ -0,0 +1,143 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  example_01.cpp
+    \brief Shows how to solve the Poisson-Boltzmann problem.
+*/
+
+#include "Teuchos_Comm.hpp"
+#include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
+
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Version.hpp"
+
+#include <iostream>
+#include <algorithm>
+
+#include "../TOOLS/meshmanagerK.hpp"
+#include "../TOOLS/pdeconstraintK.hpp"
+#include "../TOOLS/pdeobjectiveK.hpp"
+#include "../TOOLS/pdevectorK.hpp"
+#include "pde_poisson_boltzmannK.hpp"
+#include "obj_poisson_boltzmannK.hpp"
+
+#include "ROL_Solver.hpp"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+
+int main(int argc, char *argv[]) {
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+
+  /*** Initialize communicator. ***/
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  Kokkos::ScopeGuard kokkosScope (argc, argv);
+  ROL::Ptr<const Teuchos::Comm<int>> comm = Tpetra::getDefaultComm();
+  const int myRank = comm->getRank();
+  if ((iprint > 0) && (myRank == 0)) {
+    outStream = ROL::makePtrFromRef(std::cout);
+  }
+  else {
+    outStream = ROL::makePtrFromRef(bhs);
+  }
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+
+    /*** Read in XML input ***/
+    std::string filename = "input.xml";
+    auto parlist = Teuchos::rcp( new Teuchos::ParameterList() );
+    Teuchos::updateParametersFromXmlFile( filename, parlist.ptr() );
+
+    /*** Initialize main data structure. ***/
+    auto meshMgr = ROL::makePtr<MeshManager_Rectangle<RealT, DeviceT>>(*parlist);
+    // Initialize PDE describe Poisson's equation
+    auto pde = ROL::makePtr<PDE_Poisson_Boltzmann<RealT, DeviceT>>(*parlist);
+    auto con = ROL::makePtr<PDE_Constraint<RealT, DeviceT>>(pde, meshMgr, comm, *parlist, *outStream);
+    // Initialize quadratic objective function
+    std::vector<ROL::Ptr<QoI<RealT, DeviceT>>> qoi_vec(2, ROL::nullPtr);
+    qoi_vec[0] = ROL::makePtr<QoI_L2Tracking_Poisson_Boltzmann<RealT, DeviceT>>(pde->getFE());
+    qoi_vec[1] = ROL::makePtr<QoI_L2Penalty_Poisson_Boltzmann<RealT, DeviceT>>(pde->getFE());
+    auto std_obj = ROL::makePtr<StdObjective_Poisson_Boltzmann<RealT>>(*parlist);
+    auto obj = ROL::makePtr<PDE_Objective<RealT, DeviceT>>(qoi_vec, std_obj, con->getAssembler());
+
+    // Create vectors
+    auto u_ptr = con->getAssembler()->createStateVector();
+    auto z_ptr = con->getAssembler()->createControlVector();
+    auto p_ptr = con->getAssembler()->createStateVector();
+    auto r_ptr = con->getAssembler()->createResidualVector();
+    auto du_ptr = con->getAssembler()->createStateVector();
+    auto dz_ptr = con->getAssembler()->createControlVector();
+    u_ptr->randomize();
+    z_ptr->putScalar(1.0);
+    p_ptr->putScalar(0.0);
+    r_ptr->putScalar(0.0);
+    du_ptr->randomize();
+    dz_ptr->putScalar(0.0);
+    ROL::Ptr<ROL::Vector<RealT>> up, zp, pp, rp, dup, dzp;
+    up  = ROL::makePtr<PDE_PrimalSimVector<RealT, DeviceT>>(u_ptr,pde,con->getAssembler());
+    zp  = ROL::makePtr<PDE_PrimalOptVector<RealT, DeviceT>>(z_ptr,pde,con->getAssembler());
+    pp  = ROL::makePtr<PDE_PrimalSimVector<RealT, DeviceT>>(p_ptr,pde,con->getAssembler());
+    rp  = ROL::makePtr<PDE_DualSimVector<RealT, DeviceT>>(r_ptr,pde,con->getAssembler());
+    dup = ROL::makePtr<PDE_PrimalSimVector<RealT, DeviceT>>(du_ptr,pde,con->getAssembler());
+    dzp = ROL::makePtr<PDE_PrimalOptVector<RealT, DeviceT>>(dz_ptr,pde,con->getAssembler());
+    // Create ROL SimOpt vectors
+    ROL::Ptr<ROL::Vector_SimOpt<RealT>> x, d;
+    x = ROL::makePtr<ROL::Vector_SimOpt<RealT>>(up, zp);
+    d = ROL::makePtr<ROL::Vector_SimOpt<RealT>>(dup, dzp);
+
+    // Run derivative checks
+    obj->checkGradient(*x,*d,true,*outStream);
+    obj->checkHessVec(*x,*d,true,*outStream);
+    con->checkApplyJacobian(*x,*d,*up,true,*outStream);
+    con->checkApplyAdjointHessian(*x,*dup,*d,*x,true,*outStream);
+    con->checkAdjointConsistencyJacobian(*dup,*d,*x,true,*outStream);
+    con->checkInverseJacobian_1(*up,*up,*up,*zp,true,*outStream);
+    con->checkInverseAdjointJacobian_1(*up,*up,*up,*zp,true,*outStream);
+
+    RealT tol(1.e-8);
+    con->solve(*rp, *up, *zp, tol);
+    auto problem = ROL::makePtr<ROL::Problem<RealT>>(obj, x);
+    problem->addConstraint("PDE", con, pp);
+    problem->finalize(false, true, *outStream);
+    Teuchos::Time algoTimer("Algorithm Time", true);
+    ROL::Solver<RealT> solver(problem, *parlist);
+    solver.solve(*outStream);
+    algoTimer.stop();
+    *outStream << "Total optimization time = " << algoTimer.totalElapsedTime() << " seconds.\n";
+
+    // Output.
+    con->getAssembler()->printMeshData(*outStream);
+    con->solve(*rp, *up, *zp, tol);
+    con->outputTpetraVector(u_ptr, "state.txt");
+    con->outputTpetraVector(z_ptr, "control.txt");
+
+    Teuchos::Array<RealT> res(1, 0);
+    con->value(*rp, *up, *zp, tol);
+    r_ptr->norm2(res.view(0, 1));
+    *outStream << "Residual Norm: " << res[0] << std::endl;
+    errorFlag += (res[0] > 1.e-6 ? 1 : 0);
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/poisson-boltzmann/example_02.cpp b/packages/rol/example/PDE-OPT/poisson-boltzmann/example_02.cpp
index 13eabb4084ea..ca5b75f28a0f 100644
--- a/packages/rol/example/PDE-OPT/poisson-boltzmann/example_02.cpp
+++ b/packages/rol/example/PDE-OPT/poisson-boltzmann/example_02.cpp
@@ -13,8 +13,7 @@
 
 #include "Teuchos_Comm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -60,7 +59,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int>> comm
     = Tpetra::getDefaultComm();
   ROL::Ptr<const Teuchos::Comm<int>> serial_comm
diff --git a/packages/rol/example/PDE-OPT/poisson-boltzmann/example_02K.cpp b/packages/rol/example/PDE-OPT/poisson-boltzmann/example_02K.cpp
new file mode 100644
index 000000000000..37d1655339f6
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/poisson-boltzmann/example_02K.cpp
@@ -0,0 +1,378 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  example_02.cpp
+    \brief Shows how to solve the Poisson-Boltzmann problem.
+*/
+
+#include "Teuchos_Comm.hpp"
+#include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
+
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Version.hpp"
+
+#include <iostream>
+#include <algorithm>
+
+#include "ROL_Bounds.hpp"
+#include "ROL_Reduced_Objective_SimOpt.hpp"
+#include "ROL_MonteCarloGenerator.hpp"
+#include "ROL_StochasticProblem.hpp"
+#include "ROL_Solver.hpp"
+#include "ROL_TpetraTeuchosBatchManager.hpp"
+#include "ROL_CompositeConstraint_SimOpt.hpp"
+
+#include "../TOOLS/linearpdeconstraintK.hpp"
+#include "../TOOLS/pdeconstraintK.hpp"
+#include "../TOOLS/pdeobjectiveK.hpp"
+#include "../TOOLS/pdevectorK.hpp"
+#include "../TOOLS/batchmanagerK.hpp"
+#include "pde_poisson_boltzmann_ex02K.hpp"
+#include "mesh_poisson_boltzmann_ex02K.hpp"
+#include "obj_poisson_boltzmann_ex02K.hpp"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+
+template<class Real>
+Real random(const Teuchos::Comm<int> &comm,
+            const Real a = -1, const Real b = 1) {
+  Real val(0), u(0);
+  if ( Teuchos::rank<int>(comm)==0 ) {
+    u   = static_cast<Real>(rand())/static_cast<Real>(RAND_MAX);
+    val = (b-a)*u + a;
+  }
+  Teuchos::broadcast<int,Real>(comm,0,1,&val);
+  return val;
+}
+
+int main(int argc, char *argv[]) {
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+
+  /*** Initialize communicator. ***/
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  Kokkos::ScopeGuard kokkosScope (argc, argv);
+  ROL::Ptr<const Teuchos::Comm<int>> comm = Tpetra::getDefaultComm();
+  ROL::Ptr<const Teuchos::Comm<int>> serial_comm
+    = ROL::makePtr<Teuchos::SerialComm<int>>();
+  const int myRank = comm->getRank();
+  if ((iprint > 0) && (myRank == 0)) {
+    outStream = ROL::makePtrFromRef(std::cout);
+  }
+  else {
+    outStream = ROL::makePtrFromRef(bhs);
+  }
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+
+    /*** Read in XML input ***/
+    std::string filename = "input_ex02.xml";
+    auto parlist = Teuchos::rcp( new Teuchos::ParameterList() );
+    Teuchos::updateParametersFromXmlFile( filename, parlist.ptr() );
+
+    /*** Initialize main data structure. ***/
+    ROL::Ptr<MeshManager<RealT, DeviceT>>
+      meshMgr = ROL::makePtr<MeshManager_Example02<RealT, DeviceT>>(*parlist);
+    // Initialize PDE describe Poisson's equation
+    auto pde = ROL::makePtr<PDE_Poisson_Boltzmann_ex02<RealT, DeviceT>>(*parlist);
+    ROL::Ptr<ROL::Constraint_SimOpt<RealT>>
+      con = ROL::makePtr<PDE_Constraint<RealT, DeviceT>>(pde, meshMgr, serial_comm, *parlist, *outStream);
+    ROL::Ptr<PDE_Constraint<RealT, DeviceT>>
+      pdeCon = ROL::dynamicPtrCast<PDE_Constraint<RealT, DeviceT>>(con);
+    ROL::Ptr<PDE_Doping<RealT, DeviceT>>
+      pdeDoping = ROL::makePtr<PDE_Doping<RealT, DeviceT>>(*parlist);
+    ROL::Ptr<ROL::Constraint_SimOpt<RealT>>
+      conDoping = ROL::makePtr<Linear_PDE_Constraint<RealT, DeviceT>>(pdeDoping, meshMgr, serial_comm, *parlist, *outStream, true);
+    const ROL::Ptr<Assembler<RealT, DeviceT>> assembler = pdeCon->getAssembler();
+    assembler->printMeshData(*outStream);
+    con->setSolveParameters(*parlist);
+
+    /*************************************************************************/
+    /***************** BUILD VECTORS *****************************************/
+    /*************************************************************************/
+    auto u_ptr = assembler->createStateVector();
+    auto p_ptr = assembler->createStateVector();
+    auto r_ptr = assembler->createResidualVector();
+    auto z_ptr = assembler->createControlVector();
+    ROL::Ptr<ROL::Vector<RealT>> up, pp, rp, zp;
+    u_ptr->randomize();  //u_ptr->putScalar(static_cast<RealT>(1));
+    p_ptr->randomize();  //p_ptr->putScalar(static_cast<RealT>(1));
+    r_ptr->randomize();  //r_ptr->putScalar(static_cast<RealT>(1));
+    z_ptr->randomize();  //z_ptr->putScalar(static_cast<RealT>(1));
+    up = ROL::makePtr<PDE_PrimalSimVector<RealT, DeviceT>>(u_ptr,pde,assembler,*parlist);
+    pp = ROL::makePtr<PDE_PrimalSimVector<RealT, DeviceT>>(p_ptr,pde,assembler,*parlist);
+    rp = ROL::makePtr<PDE_DualSimVector<RealT, DeviceT>>(r_ptr,pde,assembler,*parlist);
+    zp = ROL::makePtr<PDE_PrimalOptVector<RealT, DeviceT>>(z_ptr,pde,assembler,*parlist);
+
+    /*************************************************************************/
+    /***************** BUILD SAMPLER *****************************************/
+    /*************************************************************************/
+    int stochDim = 3;
+    std::vector<ROL::Ptr<ROL::Distribution<RealT>>> distVec(stochDim);
+    // Build lambda2 distribution
+    Teuchos::ParameterList LvolList;
+    LvolList.sublist("Distribution").set("Name","Uniform");
+    LvolList.sublist("Distribution").sublist("Uniform").set("Lower Bound", -2.0);
+    LvolList.sublist("Distribution").sublist("Uniform").set("Upper Bound", -1.0);
+    distVec[0] = ROL::DistributionFactory<RealT>(LvolList);
+    // Build delta2 distribution
+    Teuchos::ParameterList DvolList;
+    DvolList.sublist("Distribution").set("Name","Uniform");
+    DvolList.sublist("Distribution").sublist("Uniform").set("Lower Bound", -1.0);
+    DvolList.sublist("Distribution").sublist("Uniform").set("Upper Bound",  0.0);
+    distVec[1] = ROL::DistributionFactory<RealT>(DvolList);
+    // Build doping diffusion distribution
+    Teuchos::ParameterList dopeList;
+    dopeList.sublist("Distribution").set("Name","Uniform");
+    dopeList.sublist("Distribution").sublist("Uniform").set("Lower Bound", 2e-6);
+    dopeList.sublist("Distribution").sublist("Uniform").set("Upper Bound", 2e-4);
+    distVec[2] = ROL::DistributionFactory<RealT>(dopeList);
+    // Build sampler
+    int nsamp = parlist->sublist("Problem").get("Number of Samples",100);
+    ROL::Ptr<ROL::BatchManager<RealT>>
+      bman = ROL::makePtr<ROL::TpetraTeuchosBatchManager<RealT>>(comm);
+    ROL::Ptr<ROL::SampleGenerator<RealT>>
+      sampler = ROL::makePtr<ROL::MonteCarloGenerator<RealT>>(nsamp,distVec,bman);
+    // Print samples
+    std::vector<RealT> sample(stochDim), Lmean(stochDim), Gmean(stochDim);
+    std::stringstream name_samp;
+    name_samp << "samples_" << bman->batchID() << ".txt";
+    std::ofstream file_samp;
+    file_samp.open(name_samp.str());
+    for (int i = 0; i < sampler->numMySamples(); ++i) {
+      sample = sampler->getMyPoint(i);
+      file_samp << std::scientific << std::setprecision(15);
+      for (int j = 0; j < stochDim; ++j) {
+        file_samp << std::setw(25) << std::left << sample[j];
+        Lmean[j] += sampler->getMyWeight(i)*sample[j];
+      }
+      file_samp << std::endl;
+    }
+    file_samp.close();
+    bman->sumAll(&Lmean[0],&Gmean[0],stochDim);
+
+    /*************************************************************************/
+    /***************** BUILD REFERENCE DOPING AND POTENTIAL ******************/
+    /*************************************************************************/
+    auto ru_ptr = assembler->createStateVector();
+    auto rz_ptr = assembler->createControlVector();
+    ROL::Ptr<ROL::Vector<RealT>> rup, rzp;
+    rup = ROL::makePtr<PDE_PrimalSimVector<RealT, DeviceT>>(ru_ptr,pde,assembler,*parlist);
+    rzp = ROL::makePtr<PDE_PrimalOptVector<RealT, DeviceT>>(rz_ptr,pde,assembler,*parlist);
+    ROL::Ptr<Doping<RealT, DeviceT>>
+      dope = ROL::makePtr<Doping<RealT, DeviceT>>(pde->getFE(), pde->getCellNodes(),
+                                                  assembler->getDofManager()->getCellDofs(),
+                                                  assembler->getCellIds(), *parlist);
+    // Initialize "filtered" of "unfiltered" constraint.
+    ROL::Ptr<ROL::Constraint_SimOpt<RealT>>
+      pdeWithDoping = ROL::makePtr<ROL::CompositeConstraint_SimOpt<RealT>>(con,
+                      conDoping,*rp, *rp, *up, *zp, *zp, true, true);
+    pdeWithDoping->setSolveParameters(*parlist);
+    dope->build(rz_ptr);
+    RealT tol(1.e-8);
+    pdeWithDoping->setParameter(Gmean);
+    pdeWithDoping->solve(*rp,*rup,*rzp,tol);
+    pdeCon->outputTpetraVector(ru_ptr,"reference_state.txt");
+    pdeCon->outputTpetraVector(rz_ptr,"reference_control.txt");
+
+    /*************************************************************************/
+    /***************** BUILD COST FUNCTIONAL *********************************/
+    /*************************************************************************/
+    std::vector<ROL::Ptr<QoI<RealT, DeviceT>>> qoi_vec(3,ROL::nullPtr);
+    // Current flow over drain
+    qoi_vec[0] = ROL::makePtr<QoI_State_Cost_1_Poisson_Boltzmann<RealT, DeviceT>>(
+                 pde->getFE(),pde->getBdryFE(),pde->getBdryCellLocIds(),*parlist);
+    ROL::Ptr<IntegralObjective<RealT, DeviceT>>
+      stateObj = ROL::makePtr<IntegralObjective<RealT, DeviceT>>(qoi_vec[0],assembler);
+    // Deviation from reference doping
+    qoi_vec[1] = ROL::makePtr<QoI_Control_Cost_1_Poisson_Boltzmann<RealT, DeviceT>>(pde->getFE(),dope);
+    ROL::Ptr<IntegralObjective<RealT, DeviceT>>
+      ctrlObj1 = ROL::makePtr<IntegralObjective<RealT, DeviceT>>(qoi_vec[1],assembler);
+    // H1-Seminorm of doping
+    qoi_vec[2] = ROL::makePtr<QoI_Control_Cost_2_Poisson_Boltzmann<RealT, DeviceT>>(pde->getFE());
+    ROL::Ptr<IntegralObjective<RealT, DeviceT>>
+      ctrlObj2 = ROL::makePtr<IntegralObjective<RealT, DeviceT>>(qoi_vec[2],assembler);
+    // Build standard vector objective function
+    RealT currentWeight = parlist->sublist("Problem").get("Desired Current Scale",1.5);
+    RealT J = stateObj->value(*rup,*rzp,tol); // Reference current flow over drain
+    J *= currentWeight; // Increase current flow by 50%
+    RealT w1 = parlist->sublist("Problem").get("State Cost Parameter",1e-3);
+    RealT w2 = parlist->sublist("Problem").get("Control Misfit Parameter",1e-2);
+    RealT w3 = parlist->sublist("Problem").get("Control Cost Parameter",1e-8);
+    ROL::Ptr<ROL::StdObjective<RealT>>
+      std_obj = ROL::makePtr<StdObjective_Poisson_Boltzmann<RealT>>(J,w1,w2,w3);
+    // Build full-space objective
+    auto obj = ROL::makePtr<PDE_Objective<RealT, DeviceT>>(qoi_vec,std_obj,assembler);
+    // Build reduced-space objective
+    bool storage = parlist->sublist("Problem").get("Use state storage",true);
+    auto stateStore = ROL::makePtr<ROL::VectorController<RealT>>();
+    auto objReduced = ROL::makePtr<ROL::Reduced_Objective_SimOpt<RealT>>(
+                      obj,pdeWithDoping,stateStore,up,zp,pp,storage);
+
+    /*************************************************************************/
+    /***************** BUILD BOUND CONSTRAINT ********************************/
+    /*************************************************************************/
+    auto lo_ptr = assembler->createControlVector();
+    auto hi_ptr = assembler->createControlVector();
+    ROL::Ptr<DopingBounds<RealT, DeviceT>>
+      dopeBnd = ROL::makePtr<DopingBounds<RealT, DeviceT>>(pde->getFE(), pde->getCellNodes(),
+                                                           assembler->getDofManager()->getCellDofs(),
+                                                           assembler->getCellIds(), *parlist);
+    dopeBnd->build(lo_ptr, hi_ptr);
+    ROL::Ptr<ROL::Vector<RealT>> lop, hip;
+    lop = ROL::makePtr<PDE_PrimalOptVector<RealT, DeviceT>>(lo_ptr, pde, assembler);
+    hip = ROL::makePtr<PDE_PrimalOptVector<RealT, DeviceT>>(hi_ptr, pde, assembler);
+    ROL::Ptr<ROL::BoundConstraint<RealT>>
+      bnd = ROL::makePtr<ROL::Bounds<RealT>>(lop,hip);
+    bool deactivate = parlist->sublist("Problem").get("Deactivate Bound Constraints",false);
+    if (deactivate) bnd->deactivate();
+
+    /*************************************************************************/
+    /***************** BUILD STOCHASTIC PROBLEM ******************************/
+    /*************************************************************************/
+    ROL::Ptr<ROL::StochasticProblem<RealT>>
+      opt = ROL::makePtr<ROL::StochasticProblem<RealT>>(objReduced,zp);
+    opt->addBoundConstraint(bnd);
+    parlist->sublist("SOL").set("Initial Statistic", static_cast<RealT>(1));
+    opt->makeObjectiveStochastic(*parlist,sampler);
+
+    /*************************************************************************/
+    /***************** RUN VECTOR AND DERIVATIVE CHECKS **********************/
+    /*************************************************************************/
+    bool checkDeriv = parlist->sublist("Problem").get("Check Derivatives",false);
+    if ( checkDeriv ) {
+      auto du_ptr = assembler->createStateVector();
+      auto dz_ptr = assembler->createControlVector();
+      du_ptr->randomize(); //du_ptr->putScalar(static_cast<RealT>(0));
+      dz_ptr->randomize(); //dz_ptr->putScalar(static_cast<RealT>(1));
+      ROL::Ptr<ROL::Vector<RealT>> dup, dzp;
+      dup = ROL::makePtr<PDE_PrimalSimVector<RealT, DeviceT>>(du_ptr,pde,assembler,*parlist);
+      dzp = ROL::makePtr<PDE_PrimalOptVector<RealT, DeviceT>>(dz_ptr,pde,assembler,*parlist);
+      // Create ROL SimOpt vectors
+      ROL::Vector_SimOpt<RealT> x(up,zp);
+      ROL::Vector_SimOpt<RealT> d(dup,dzp);
+
+      objReduced->setParameter(Gmean);
+      *outStream << "\n\nCheck Gradient of Full Objective Function\n";
+      obj->checkGradient(x,d,true,*outStream);
+      *outStream << "\n\nCheck Hessian of Full Objective Function\n";
+      obj->checkHessVec(x,d,true,*outStream);
+      *outStream << "\n\nCheck Full Jacobian of PDE Constraint\n";
+      pdeWithDoping->checkApplyJacobian(x,d,*rp,true,*outStream);
+      *outStream << "\n\nCheck Jacobian_1 of PDE Constraint\n";
+      pdeWithDoping->checkApplyJacobian_1(*up,*zp,*dup,*rp,true,*outStream);
+      *outStream << "\n\nCheck Jacobian_2 of PDE Constraint\n";
+      pdeWithDoping->checkApplyJacobian_2(*up,*zp,*dzp,*rp,true,*outStream);
+      *outStream << "\n\nCheck Full Hessian of PDE Constraint\n";
+      pdeWithDoping->checkApplyAdjointHessian(x,*pp,d,x,true,*outStream);
+      *outStream << "\n";
+      pdeWithDoping->checkAdjointConsistencyJacobian(*dup,d,x,true,*outStream);
+      *outStream << "\n";
+      pdeWithDoping->checkInverseJacobian_1(*rp,*dup,*up,*zp,true,*outStream);
+      *outStream << "\n";
+      *outStream << "\n\nCheck Gradient of Reduced Objective Function\n";
+      objReduced->checkGradient(*zp,*dzp,true,*outStream);
+      *outStream << "\n\nCheck Hessian of Reduced Objective Function\n";
+      objReduced->checkHessVec(*zp,*dzp,true,*outStream);
+
+      opt->check(true,*outStream);
+    }
+
+    /*************************************************************************/
+    /***************** SOLVE OPTIMIZATION PROBLEM ****************************/
+    /*************************************************************************/
+    parlist->sublist("Step").set("Type","Trust Region");
+    opt->finalize(false,true,*outStream);
+    ROL::Solver<RealT> solver(opt,*parlist);
+    zp->set(*rzp);
+    std::clock_t timer = std::clock();
+    solver.solve(*outStream);
+    *outStream << "Optimization time: "
+               << static_cast<RealT>(std::clock()-timer)/static_cast<RealT>(CLOCKS_PER_SEC)
+               << " seconds." << std::endl << std::endl;
+
+    /*************************************************************************/
+    /***************** OUTPUT RESULTS ****************************************/
+    /*************************************************************************/
+    std::clock_t timer_print = std::clock();
+    // Output control to file
+    pdeCon->outputTpetraVector(z_ptr,"control.txt");
+    // Output expected state and samples to file
+    auto Lu_ptr = assembler->createStateVector();
+    auto Lv_ptr = assembler->createStateVector();
+    auto Gu_ptr = assembler->createStateVector();
+    auto Gv_ptr = assembler->createStateVector();
+    ROL::Ptr<ROL::Vector<RealT>> Lup, Gup, Lvp, Gvp;
+    Lup = ROL::makePtr<PDE_PrimalSimVector<RealT, DeviceT>>(Lu_ptr,pde,assembler,*parlist);
+    Lvp = ROL::makePtr<PDE_PrimalSimVector<RealT, DeviceT>>(Lv_ptr,pde,assembler,*parlist);
+    Gup = ROL::makePtr<PDE_PrimalSimVector<RealT, DeviceT>>(Gu_ptr,pde,assembler,*parlist);
+    Gvp = ROL::makePtr<PDE_PrimalSimVector<RealT, DeviceT>>(Gv_ptr,pde,assembler,*parlist);
+    ROL::Elementwise::Power<RealT> sqr(2.0);
+    for (int i = 0; i < sampler->numMySamples(); ++i) {
+      sample = sampler->getMyPoint(i);
+      con->setParameter(sample);
+      con->solve(*rp,*Gup,*zp,tol);
+      Gvp->set(*Gup); Gvp->applyUnary(sqr);
+      Lup->axpy(sampler->getMyWeight(i),*Gup);
+      Lvp->axpy(sampler->getMyWeight(i),*Gvp);
+    }
+    bman->sumAll(*Lup,*Gup);
+    pdeCon->outputTpetraVector(Gu_ptr,"mean_state.txt");
+    bman->sumAll(*Lvp,*Gvp);
+    Gup->applyUnary(sqr);
+    Gvp->axpy(-1.0,*Gup);
+    pdeCon->outputTpetraVector(Gv_ptr,"variance_state.txt");
+    // Build objective function distribution
+    RealT val(0), val1(0), val2(0);
+    int nsamp_dist = parlist->sublist("Problem").get("Number of Output Samples",100);
+    ROL::Ptr<ROL::SampleGenerator<RealT>> sampler_dist
+      = ROL::makePtr<ROL::MonteCarloGenerator<RealT>>(nsamp_dist,distVec,bman);
+    std::stringstream name;
+    name << "obj_samples_" << bman->batchID() << ".txt";
+    std::ofstream file;
+    file.open(name.str());
+    for (int i = 0; i < sampler_dist->numMySamples(); ++i) {
+      sample = sampler_dist->getMyPoint(i);
+      objReduced->setParameter(sample);
+      val  = objReduced->value(*zp,tol);
+      val1 = ctrlObj1->value(*up,*zp,tol);
+      val2 = ctrlObj2->value(*up,*zp,tol);
+      file << std::scientific << std::setprecision(15);
+      for (int j = 0; j < stochDim; ++j) {
+        file << std::setw(25) << std::left << sample[j];
+      }
+      file << std::setw(25) << std::left << val;
+      file << std::setw(25) << std::left << (val-w2*val1-w3*val2)/w1;
+      file << std::setw(25) << std::left << val1;
+      file << std::setw(25) << std::left << val2;
+      file << std::endl;
+    }
+    file.close();
+    *outStream << "Output time: "
+               << static_cast<RealT>(std::clock()-timer_print)/static_cast<RealT>(CLOCKS_PER_SEC)
+               << " seconds." << std::endl << std::endl;
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/poisson-boltzmann/example_03.cpp b/packages/rol/example/PDE-OPT/poisson-boltzmann/example_03.cpp
index fe40f0473594..d963a0caab39 100644
--- a/packages/rol/example/PDE-OPT/poisson-boltzmann/example_03.cpp
+++ b/packages/rol/example/PDE-OPT/poisson-boltzmann/example_03.cpp
@@ -13,8 +13,7 @@
 
 #include "Teuchos_Comm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -45,7 +44,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int>> comm
     = Tpetra::getDefaultComm();
   const int myRank = comm->getRank();
diff --git a/packages/rol/example/PDE-OPT/poisson-boltzmann/example_03K.cpp b/packages/rol/example/PDE-OPT/poisson-boltzmann/example_03K.cpp
new file mode 100644
index 000000000000..35d568c1835c
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/poisson-boltzmann/example_03K.cpp
@@ -0,0 +1,237 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  example_03.cpp
+    \brief Shows how to solve the Poisson-Boltzmann problem.
+*/
+
+#include "Teuchos_Comm.hpp"
+#include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
+
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Version.hpp"
+
+#include <iostream>
+#include <algorithm>
+
+#include "ROL_Solver.hpp"
+#include "ROL_UnaryFunctions.hpp"
+#include "ROL_Bounds.hpp"
+#include "ROL_BoundConstraint_SimOpt.hpp"
+#include "ROL_CompositeConstraint_SimOpt.hpp"
+
+#include "../TOOLS/linearpdeconstraintK.hpp"
+#include "../TOOLS/pdeconstraintK.hpp"
+#include "../TOOLS/pdeobjectiveK.hpp"
+#include "../TOOLS/pdevectorK.hpp"
+#include "pde_poisson_boltzmann_ex02K.hpp"
+#include "mesh_poisson_boltzmann_ex02K.hpp"
+#include "obj_poisson_boltzmann_ex02K.hpp"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+
+int main(int argc, char *argv[]) {
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+
+  /*** Initialize communicator. ***/
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  Kokkos::ScopeGuard kokkosScope (argc, argv);
+  ROL::Ptr<const Teuchos::Comm<int>> comm = Tpetra::getDefaultComm();
+  const int myRank = comm->getRank();
+  if ((iprint > 0) && (myRank == 0)) {
+    outStream = ROL::makePtrFromRef(std::cout);
+  }
+  else {
+    outStream = ROL::makePtrFromRef(bhs);
+  }
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+
+    /*** Read in XML input ***/
+    std::string filename = "input_ex03.xml";
+    auto parlist = Teuchos::rcp( new Teuchos::ParameterList() );
+    Teuchos::updateParametersFromXmlFile( filename, parlist.ptr() );
+
+    /*** Initialize main data structure. ***/
+    ROL::Ptr<MeshManager<RealT, DeviceT>>
+      meshMgr = ROL::makePtr<MeshManager_Example02<RealT, DeviceT>>(*parlist);
+    // Initialize PDE describe Poisson's equation
+    auto pde = ROL::makePtr<PDE_Poisson_Boltzmann_ex02<RealT, DeviceT>>(*parlist);
+    ROL::Ptr<ROL::Constraint_SimOpt<RealT>>
+      con = ROL::makePtr<PDE_Constraint<RealT, DeviceT>>(pde, meshMgr, comm, *parlist, *outStream);
+    ROL::Ptr<PDE_Constraint<RealT, DeviceT>>
+      pdeCon = ROL::dynamicPtrCast<PDE_Constraint<RealT, DeviceT>>(con);
+    ROL::Ptr<PDE_Doping<RealT, DeviceT>>
+      pdeDoping = ROL::makePtr<PDE_Doping<RealT, DeviceT>>(*parlist);
+    ROL::Ptr<ROL::Constraint_SimOpt<RealT>>
+      conDoping = ROL::makePtr<Linear_PDE_Constraint<RealT, DeviceT>>(pdeDoping, meshMgr, comm, *parlist, *outStream, true);
+    const ROL::Ptr<Assembler<RealT, DeviceT>> assembler = pdeCon->getAssembler();
+    assembler->printMeshData(*outStream);
+    con->setSolveParameters(*parlist);
+
+    /*************************************************************************/
+    /***************** BUILD VECTORS *****************************************/
+    /*************************************************************************/
+    auto u_ptr = assembler->createStateVector();
+    auto p_ptr = assembler->createStateVector();
+    auto r_ptr = assembler->createResidualVector();
+    auto z_ptr = assembler->createControlVector();
+    ROL::Ptr<ROL::Vector<RealT>> up, pp, rp, zp;
+    u_ptr->randomize();  //u_ptr->putScalar(static_cast<RealT>(1));
+    p_ptr->randomize();  //p_ptr->putScalar(static_cast<RealT>(1));
+    r_ptr->randomize();  //r_ptr->putScalar(static_cast<RealT>(1));
+    z_ptr->randomize();  //z_ptr->putScalar(static_cast<RealT>(1));
+    up = ROL::makePtr<PDE_PrimalSimVector<RealT, DeviceT>>(u_ptr, pde, assembler, *parlist);
+    pp = ROL::makePtr<PDE_PrimalSimVector<RealT, DeviceT>>(p_ptr, pde, assembler, *parlist);
+    rp = ROL::makePtr<PDE_DualSimVector<RealT, DeviceT>>(r_ptr, pde, assembler, *parlist);
+    zp = ROL::makePtr<PDE_PrimalOptVector<RealT, DeviceT>>(z_ptr, pde, assembler, *parlist);
+    ROL::Ptr<ROL::Vector<RealT>> xp
+      = ROL::makePtr<ROL::Vector_SimOpt<RealT>>(up,zp);
+
+    /*************************************************************************/
+    /***************** BUILD REFERENCE DOPING AND POTENTIAL ******************/
+    /*************************************************************************/
+    auto ru_ptr = assembler->createStateVector();
+    auto rz_ptr = assembler->createControlVector();
+    ROL::Ptr<ROL::Vector<RealT>> rup, rzp;
+    rup = ROL::makePtr<PDE_PrimalSimVector<RealT, DeviceT>>(ru_ptr,pde,assembler,*parlist);
+    rzp = ROL::makePtr<PDE_PrimalOptVector<RealT, DeviceT>>(rz_ptr,pde,assembler,*parlist);
+    ROL::Ptr<Doping<RealT, DeviceT>>
+      dope = ROL::makePtr<Doping<RealT, DeviceT>>(pde->getFE(), pde->getCellNodes(),
+                                                  assembler->getDofManager()->getCellDofs(),
+                                                  assembler->getCellIds(), *parlist);
+    // Initialize "filtered" of "unfiltered" constraint.
+    ROL::Ptr<ROL::Constraint_SimOpt<RealT>>
+      pdeWithDoping = ROL::makePtr<ROL::CompositeConstraint_SimOpt<RealT>>(
+                      con, conDoping, *rp, *rp, *up, *zp, *zp, true, true);
+    pdeWithDoping->setSolveParameters(*parlist);
+    dope->build(rz_ptr);
+    RealT tol(1.e-8);
+    pdeWithDoping->solve(*rp,*rup,*rzp,tol);
+    pdeCon->outputTpetraVector(ru_ptr,"reference_state.txt");
+    pdeCon->outputTpetraVector(rz_ptr,"reference_control.txt");
+
+    /*************************************************************************/
+    /***************** BUILD COST FUNCTIONAL *********************************/
+    /*************************************************************************/
+    std::vector<ROL::Ptr<QoI<RealT, DeviceT>>> qoi_vec(3,ROL::nullPtr);
+    // Current flow over drain
+    qoi_vec[0] = ROL::makePtr<QoI_State_Cost_1_Poisson_Boltzmann<RealT, DeviceT>>(pde->getFE(),
+                                    pde->getBdryFE(), pde->getBdryCellLocIds(), *parlist);
+    ROL::Ptr<IntegralObjective<RealT, DeviceT>>
+      stateObj = ROL::makePtr<IntegralObjective<RealT, DeviceT>>(qoi_vec[0], assembler);
+    // Deviation from reference doping
+    qoi_vec[1] = ROL::makePtr<QoI_Control_Cost_1_Poisson_Boltzmann<RealT, DeviceT>>(pde->getFE(), dope);
+    ROL::Ptr<IntegralObjective<RealT, DeviceT>>
+      ctrlObj1 = ROL::makePtr<IntegralObjective<RealT, DeviceT>>(qoi_vec[1], assembler);
+    // H1-Seminorm of doping
+    qoi_vec[2] = ROL::makePtr<QoI_Control_Cost_2_Poisson_Boltzmann<RealT, DeviceT>>(pde->getFE());
+    ROL::Ptr<IntegralObjective<RealT, DeviceT>>
+      ctrlObj2 = ROL::makePtr<IntegralObjective<RealT, DeviceT>>(qoi_vec[2], assembler);
+    // Build standard vector objective function
+    RealT currentWeight = parlist->sublist("Problem").get("Desired Current Scale",1.5);
+    RealT J = stateObj->value(*rup,*rzp,tol); // Reference current flow over drain
+    J *= currentWeight; // Increase current flow by 50%
+    RealT w1 = parlist->sublist("Problem").get("State Cost Parameter",1e-3);
+    RealT w2 = parlist->sublist("Problem").get("Control Misfit Parameter",1e-2);
+    RealT w3 = parlist->sublist("Problem").get("Control Cost Parameter",1e-8);
+    ROL::Ptr<ROL::StdObjective<RealT>>
+      std_obj = ROL::makePtr<StdObjective_Poisson_Boltzmann<RealT>>(J,w1,w2,w3);
+    // Build full-space objective
+    ROL::Ptr<PDE_Objective<RealT, DeviceT>>
+      obj = ROL::makePtr<PDE_Objective<RealT, DeviceT>>(qoi_vec, std_obj, assembler);
+ 
+    /*************************************************************************/
+    /***************** BUILD BOUND CONSTRAINT ********************************/
+    /*************************************************************************/
+    auto zlo_ptr = assembler->createControlVector();
+    auto zhi_ptr = assembler->createControlVector();
+    ROL::Ptr<DopingBounds<RealT, DeviceT>>
+      dopeBnd = ROL::makePtr<DopingBounds<RealT, DeviceT>>(pde->getFE(), pde->getCellNodes(),
+                                                           assembler->getDofManager()->getCellDofs(),
+                                                           assembler->getCellIds(), *parlist);
+    dopeBnd->build(zlo_ptr,zhi_ptr);
+    ROL::Ptr<ROL::Vector<RealT>> zlop, zhip;
+    zlop = ROL::makePtr<PDE_PrimalOptVector<RealT, DeviceT>>(zlo_ptr, pde, assembler);
+    zhip = ROL::makePtr<PDE_PrimalOptVector<RealT, DeviceT>>(zhi_ptr, pde, assembler);
+    ROL::Ptr<ROL::BoundConstraint<RealT>>
+      zbnd = ROL::makePtr<ROL::Bounds<RealT>>(zlop,zhip);
+    // State bounds
+    auto ulo_ptr = assembler->createStateVector();
+    auto uhi_ptr = assembler->createStateVector();
+    ulo_ptr->putScalar(ROL::ROL_NINF<RealT>()); uhi_ptr->putScalar(ROL::ROL_INF<RealT>());
+    ROL::Ptr<ROL::Vector<RealT>> ulop, uhip;
+    ulop = ROL::makePtr<PDE_PrimalSimVector<RealT, DeviceT>>(ulo_ptr, pde, assembler);
+    uhip = ROL::makePtr<PDE_PrimalSimVector<RealT, DeviceT>>(uhi_ptr, pde, assembler);
+    ROL::Ptr<ROL::BoundConstraint<RealT>>
+      ubnd = ROL::makePtr<ROL::Bounds<RealT>>(ulop,uhip);
+    ubnd->deactivate();
+    // SimOpt bounds
+    ROL::Ptr<ROL::BoundConstraint<RealT>>
+      bnd = ROL::makePtr<ROL::BoundConstraint_SimOpt<RealT>>(ubnd, zbnd);
+    bool deactivate = parlist->sublist("Problem").get("Deactivate Bound Constraints", false);
+    if (deactivate) bnd->deactivate();
+
+    /*************************************************************************/
+    /***************** BUILD OPTIMIZATION PROBLEM ****************************/
+    /*************************************************************************/
+    ROL::Ptr<ROL::Problem<RealT>>
+      opt = ROL::makePtr<ROL::Problem<RealT>>(obj, xp);
+    opt->addBoundConstraint(bnd);
+    opt->addConstraint("PDE", pdeWithDoping, pp);
+    opt->finalize(false, true, *outStream);
+    bool checkDeriv = parlist->sublist("Problem").get("Check Derivatives", false);
+    if ( checkDeriv ) opt->check(true, *outStream);
+
+    /*************************************************************************/
+    /***************** SOLVE OPTIMIZATION PROBLEM ****************************/
+    /*************************************************************************/
+    ROL::Solver<RealT> solver(opt, *parlist);
+    zp->set(*rzp);
+
+    bool initSolve = parlist->sublist("Problem").get("Solve state for full space", true);
+    if( initSolve ) pdeWithDoping->solve(*rp, *up, *zp, tol);
+
+    std::clock_t timer = std::clock();
+    solver.solve(*outStream);
+    *outStream << "Optimization time: "
+               << static_cast<RealT>(std::clock()-timer)/static_cast<RealT>(CLOCKS_PER_SEC)
+               << " seconds." << std::endl << std::endl;
+
+    /*************************************************************************/
+    /***************** OUTPUT RESULTS ****************************************/
+    /*************************************************************************/
+    std::clock_t timer_print = std::clock();
+    // Output control to file
+    pdeCon->outputTpetraVector(z_ptr, "control.txt");
+    // Output expected state and samples to file
+    pdeCon->outputTpetraVector(u_ptr, "state.txt");
+    *outStream << "Output time: "
+               << static_cast<RealT>(std::clock()-timer_print)/static_cast<RealT>(CLOCKS_PER_SEC)
+               << " seconds." << std::endl << std::endl;
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/poisson-boltzmann/mesh_poisson_boltzmann_ex02K.hpp b/packages/rol/example/PDE-OPT/poisson-boltzmann/mesh_poisson_boltzmann_ex02K.hpp
new file mode 100644
index 000000000000..f13e95e689b6
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/poisson-boltzmann/mesh_poisson_boltzmann_ex02K.hpp
@@ -0,0 +1,107 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+#include "../TOOLS/meshmanagerK.hpp"
+
+template <class Real, class DeviceType>
+class MeshManager_Example02 : public MeshManager_Rectangle<Real, DeviceType> {
+
+private:
+
+  int nx_;
+  int ny_;
+  Real width_;
+  ROL::Ptr<std::vector<std::vector<std::vector<int>>>>  meshSideSets_;
+
+public:
+
+  MeshManager_Example02(Teuchos::ParameterList &parlist) : MeshManager_Rectangle<Real, DeviceType>(parlist)
+  {
+    nx_ = parlist.sublist("Geometry").get("NX", 3);
+    ny_ = parlist.sublist("Geometry").get("NY", 3);
+    width_ = parlist.sublist("Geometry").get("Width", 1.0);
+    computeSideSets();
+  }
+
+
+  void computeSideSets() {
+
+    int numSideSets = 4;
+    meshSideSets_ = ROL::makePtr<std::vector<std::vector<std::vector<int>>>>(numSideSets);
+
+    Real patchFrac = static_cast<Real>(1)/static_cast<Real>(6);
+    int np1 = static_cast<int>(patchFrac * static_cast<Real>(nx_));   // Source
+    int np2 = static_cast<int>(patchFrac * static_cast<Real>(nx_));
+    int np3 = 2*static_cast<int>(patchFrac * static_cast<Real>(nx_)); // Gate
+    int np4 = static_cast<int>(patchFrac * static_cast<Real>(nx_));
+    int np5 = nx_-(np1+np2+np3+np4);                                  // Drain
+
+    // Neumann
+    (*meshSideSets_)[0].resize(4);
+    (*meshSideSets_)[0][0].resize(nx_);
+    (*meshSideSets_)[0][1].resize(ny_);
+    (*meshSideSets_)[0][2].resize(np2+np4);
+    (*meshSideSets_)[0][3].resize(ny_);
+    // Source
+    (*meshSideSets_)[1].resize(4);
+    (*meshSideSets_)[1][0].resize(0);
+    (*meshSideSets_)[1][1].resize(0);
+    (*meshSideSets_)[1][2].resize(np1);
+    (*meshSideSets_)[1][3].resize(0);
+    // Gate
+    (*meshSideSets_)[2].resize(4);
+    (*meshSideSets_)[2][0].resize(0);
+    (*meshSideSets_)[2][1].resize(0);
+    (*meshSideSets_)[2][2].resize(np3);
+    (*meshSideSets_)[2][3].resize(0);
+    // Drain
+    (*meshSideSets_)[3].resize(4);
+    (*meshSideSets_)[3][0].resize(0);
+    (*meshSideSets_)[3][1].resize(0);
+    (*meshSideSets_)[3][2].resize(np5);
+    (*meshSideSets_)[3][3].resize(0);
+
+    for (int i=0; i<nx_; ++i) {
+      (*meshSideSets_)[0][0][i] = i;
+    }
+    for (int i=0; i<ny_; ++i) {
+      (*meshSideSets_)[0][1][i] = (i+1)*nx_-1;
+    }
+    for (int i=0; i<ny_; ++i) {
+      (*meshSideSets_)[0][3][i] = i*nx_;
+    }
+    for (int i=0; i<np1; ++i) {
+      (*meshSideSets_)[1][2][i] = i + nx_*(ny_-1);
+    }
+    for (int i=0; i<np2; ++i) {
+      (*meshSideSets_)[0][2][i] = i + np1 + nx_*(ny_-1);
+    }
+    for (int i=0; i<np3; ++i) {
+      (*meshSideSets_)[2][2][i] = i + (np1+np2) + nx_*(ny_-1);
+    }
+    for (int i=0; i<np4; ++i) {
+      (*meshSideSets_)[0][2][i] = i + (np1+np2+np3) + nx_*(ny_-1);
+    }
+    for (int i=0; i<np5; ++i) {
+      (*meshSideSets_)[3][2][i] = i + (np1+np2+np3+np4) + nx_*(ny_-1);
+    }
+
+  } // computeSideSets
+
+  ROL::Ptr<std::vector<std::vector<std::vector<int>>>> getSideSets(
+              const bool verbose = false,
+              std::ostream & outStream = std::cout) const {
+    if ( verbose ) {
+      outStream << "Mesh_Example02: getSideSets called" << std::endl;
+      outStream << "Mesh_Example02: numSideSets = " << meshSideSets_->size() << std::endl;
+    }
+    return meshSideSets_;
+  }
+
+}; // MeshManager_Example02
diff --git a/packages/rol/example/PDE-OPT/poisson-boltzmann/obj_poisson_boltzmannK.hpp b/packages/rol/example/PDE-OPT/poisson-boltzmann/obj_poisson_boltzmannK.hpp
new file mode 100644
index 000000000000..188aa7a181ec
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/poisson-boltzmann/obj_poisson_boltzmannK.hpp
@@ -0,0 +1,268 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  obj.hpp
+    \brief Provides the interface for local (cell-based) objective function computations.
+*/
+
+#ifndef PDEOPT_QOI_L2TRACKING_POISSON_BOLTZMANNK_HPP
+#define PDEOPT_QOI_L2TRACKING_POISSON_BOLTZMANNK_HPP
+
+#include "../TOOLS/qoiK.hpp"
+#include "pde_poisson_boltzmannK.hpp"
+
+template <class Real, class DeviceType>
+class QoI_L2Tracking_Poisson_Boltzmann : public QoI<Real, DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using fe_type = FE<Real, DeviceType>;
+  using fst = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using rst = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  ROL::Ptr<fe_type> fe_;
+
+  scalar_view target_;
+
+  Real targetFunc(const std::vector<Real> & x) const {
+    int size = x.size();
+    Real val(0);
+    for (int i = 0; i < size; ++i) {
+      val += x[i]*x[i];
+    }
+    return val;
+  }
+
+public:
+  QoI_L2Tracking_Poisson_Boltzmann(const ROL::Ptr<fe_type> & fe) : fe_(fe) {
+    int c = fe_->cubPts().extent_int(0);
+    int p = fe_->cubPts().extent_int(1);
+    int d = fe_->cubPts().extent_int(2);
+    std::vector<Real> pt(d);
+    target_ = scalar_view("target_", c, p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k) {
+          pt[k] = (fe_->cubPts())(i, j, k);
+        }
+        (target_)(i,j) = targetFunc(pt);
+      }
+    }
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    // Get relevant dimensions
+    int c = u_coeff.extent_int(0);
+    int p = fe_->cubPts().extent_int(1);
+    // Initialize output val
+    val = scalar_view("val", c);
+    // Evaluate state on FE basis
+    scalar_view valU_eval("valU", c, p);
+    fe_->evaluateValue(valU_eval, u_coeff);
+    // Compute difference between state and target
+    rst::subtract(valU_eval,target_);
+    // Compute squared L2-norm of diff
+    fe_->computeIntegral(val,valU_eval,valU_eval);
+    // Scale by one half
+    rst::scale(val, static_cast<Real>(0.5));
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    // Get relevant dimensions
+    int c = u_coeff.extent_int(0);
+    int p = fe_->cubPts().extent_int(1);
+    int f = fe_->N().extent_int(1);
+    // Initialize output grad
+    grad = scalar_view("gradient_1", c, f);
+    // Evaluate state on FE basis
+    scalar_view valU_eval("valU_eval", c, p);
+    fe_->evaluateValue(valU_eval, u_coeff);
+    // Compute difference between state and target
+    rst::subtract(valU_eval, target_);
+    // Compute gradient of squared L2-norm of diff
+    fst::integrate(grad, valU_eval, fe_->NdetJ(), false);
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    throw Exception::Zero(">>> QoI_L2Tracking_Poisson_Boltzmann::gradient_2 is zero.");
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    int c = v_coeff.extent_int(0);
+    int p = fe_->cubPts().extent_int(1);
+    int f = fe_->N().extent_int(1);
+    scalar_view valV_eval("valV_eval", c, p);
+    hess = scalar_view("hess11", c, f);
+    fe_->evaluateValue(valV_eval, v_coeff);
+    fst::integrate(hess, valV_eval, fe_->NdetJ(), false);
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    throw Exception::Zero(">>> QoI_L2Tracking_Poisson_Boltzmann::HessVec_12 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    throw Exception::Zero(">>> QoI_L2Tracking_Poisson_Boltzmann::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    throw Exception::Zero(">>> QoI_L2Tracking_Poisson_Boltzmann::HessVec_22 is zero.");
+  }
+
+}; // QoI_L2Tracking
+
+template <class Real, class DeviceType>
+class QoI_L2Penalty_Poisson_Boltzmann : public QoI<Real, DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using fe_type = FE<Real, DeviceType>;
+  using fst = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using rst = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  ROL::Ptr<fe_type> fe_;
+
+public:
+  QoI_L2Penalty_Poisson_Boltzmann(const ROL::Ptr<fe_type> & fe) : fe_(fe) {}
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    // Get relevant dimensions
+    int c = z_coeff.extent_int(0);
+    int p = fe_->cubPts().extent_int(1);
+    // Initialize output val
+    val = scalar_view("val", c);
+    // Build local state tracking term
+    scalar_view valZ_eval("valZ_eval", c, p);
+    fe_->evaluateValue(valZ_eval, z_coeff);
+    fe_->computeIntegral(val, valZ_eval, valZ_eval);
+    rst::scale(val, static_cast<Real>(0.5));
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    throw Exception::Zero(">>> QoI_L2Penalty_Poisson_Boltzmann::gradient_1 is zero.");
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    // Get relevant dimensions
+    int c = z_coeff.extent_int(0);
+    int p = fe_->cubPts().extent_int(1);
+    int f = fe_->N().extent_int(1);
+    // Initialize output grad
+    grad = scalar_view("grad", c, f);
+    // Build local gradient of state tracking term
+    scalar_view valZ_eval("valZ_eval", c, p);
+    fe_->evaluateValue(valZ_eval, z_coeff);
+    fst::integrate(grad, valZ_eval, fe_->NdetJ(), false);
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    throw Exception::Zero(">>> QoI_L2Penalty_Poisson_Boltzmann::HessVec_11 is zero.");
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    throw Exception::Zero(">>> QoI_L2Penalty_Poisson_Boltzmann::HessVec_12 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    throw Exception::Zero(">>> QoI_L2Penalty_Poisson_Boltzmann::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) {
+    int c = v_coeff.extent_int(0);
+    int p = fe_->cubPts().extent_int(1);
+    int f = fe_->N().extent_int(1);
+    scalar_view valV_eval("valV_eval", c, p);
+    hess = scalar_view("hess22", c, f);
+    fe_->evaluateValue(valV_eval, v_coeff);
+    fst::integrate(hess, valV_eval, fe_->NdetJ(), false);
+  }
+
+}; // QoI_L2Penalty
+
+template <class Real>
+class StdObjective_Poisson_Boltzmann : public ROL::StdObjective<Real> {
+private:
+  Real alpha_;
+
+public:
+  StdObjective_Poisson_Boltzmann(Teuchos::ParameterList &parlist) {
+    alpha_ = parlist.sublist("Poisson Boltzmann Objective Function").get("Control Penalty",1.e-4);
+  }
+
+  Real value(const std::vector<Real> &x, Real &tol) {
+    return x[0] + alpha_*x[1];
+  }
+
+  void gradient(std::vector<Real> &g, const std::vector<Real> &x, Real &tol) {
+    const Real one(1);
+    g[0] = one;
+    g[1] = alpha_;
+  }
+
+  void hessVec(std::vector<Real> &hv, const std::vector<Real> &v, const std::vector<Real> &x, Real &tol) {
+    const Real zero(0);
+    hv[0] = zero;
+    hv[1] = zero;
+  }
+
+}; // OBJ_SCALAR
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/poisson-boltzmann/obj_poisson_boltzmann_ex02K.hpp b/packages/rol/example/PDE-OPT/poisson-boltzmann/obj_poisson_boltzmann_ex02K.hpp
new file mode 100644
index 000000000000..a814fbe39049
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/poisson-boltzmann/obj_poisson_boltzmann_ex02K.hpp
@@ -0,0 +1,558 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  obj.hpp
+    \brief Provides the interface for local (cell-based) objective function computations.
+*/
+
+#ifndef PDEOPT_QOI_L2TRACKING_POISSON_BOLTZMANN_EX02K_HPP
+#define PDEOPT_QOI_L2TRACKING_POISSON_BOLTZMANN_EX02K_HPP
+
+#include "../TOOLS/qoiK.hpp"
+#include "dopingK.hpp"
+#include "pde_poisson_boltzmann_ex02K.hpp"
+
+template <class Real, class DeviceType>
+class QoI_State_Cost_1_Poisson_Boltzmann : public QoI<Real, DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using fe_type = FE<Real, DeviceType>;
+  using fst = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using rst = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type> fe_vol_;
+  const std::vector<std::vector<ROL::Ptr<fe_type>>> fe_bdry_;
+  const std::vector<std::vector<std::vector<int>>> bdryCellLocIds_;
+
+  scalar_view getBoundaryCoeff(
+      const scalar_view cell_coeff,
+      const int sideSet, const int locSideId) const {
+    std::vector<int> bdryCellLocId = bdryCellLocIds_[sideSet][locSideId];
+    const int numCellsSide = bdryCellLocId.size();
+    const int f = fe_vol_->N().extent_int(1);
+
+    scalar_view bdry_coeff("bdry_coeff", numCellsSide, f);
+    for (int i = 0; i < numCellsSide; ++i) {
+      for (int j = 0; j < f; ++j) {
+        bdry_coeff(i, j) = cell_coeff(bdryCellLocId[i], j);
+      }
+    }
+    return bdry_coeff;
+  }
+
+public:
+  QoI_State_Cost_1_Poisson_Boltzmann(const ROL::Ptr<fe_type> & fe_vol,
+                                     const std::vector<std::vector<ROL::Ptr<fe_type>>> & fe_bdry,
+                                     const std::vector<std::vector<std::vector<int>>> & bdryCellLocIds,
+                                     Teuchos::ParameterList &parlist)
+    : fe_vol_(fe_vol), fe_bdry_(fe_bdry), bdryCellLocIds_(bdryCellLocIds) {}
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    const int c = fe_vol_->gradN().extent_int(0);
+    const int d = fe_vol_->gradN().extent_int(3);
+    // Initialize output val
+    val = scalar_view("val", c);
+    // Compute cost integral
+    const int numSideSets = bdryCellLocIds_.size();
+    for (int i = 0; i < numSideSets; ++i) {
+      if ( i == 3 ) {
+        const int numLocSides = bdryCellLocIds_[i].size();
+        for (int j = 0; j < numLocSides; ++j) {
+          const int numCellsSide  = bdryCellLocIds_[i][j].size();
+          if ( numCellsSide ) {
+            const int numCubPerSide = fe_bdry_[i][j]->cubPts().extent_int(1);
+            // Evaluate control on FE basis
+            scalar_view u_coeff_bdry = getBoundaryCoeff(u_coeff, i, j);
+            scalar_view gradU_eval("gradU_eval", numCellsSide, numCubPerSide, d);
+            fe_bdry_[i][j]->evaluateGradient(gradU_eval, u_coeff_bdry);
+            // Compute cost
+            scalar_view costU("costU", numCellsSide, numCubPerSide);
+            scalar_view weightU("weightU", numCellsSide, numCubPerSide);
+            for (int k = 0; k < numCellsSide; ++k) {
+              for (int l = 0; l < numCubPerSide; ++l) {
+                costU(k,l) = gradU_eval(k, l, 1);
+                weightU(k,l) = static_cast<Real>(1);
+              }
+            }
+            // Integrate cell cost
+            scalar_view intVal("intVal", numCellsSide);
+            fe_bdry_[i][j]->computeIntegral(intVal, costU, weightU);
+            // Add to integral value
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              val(cidx) += intVal(k);
+            }
+          }
+        }
+      }
+    }
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    const int c = fe_vol_->gradN().extent_int(0);
+    const int f = fe_vol_->gradN().extent_int(1);
+    // Initialize output val
+    grad = scalar_view("grad", c, f);
+    // Compute cost integral
+    const int numSideSets = bdryCellLocIds_.size();
+    for (int i = 0; i < numSideSets; ++i) {
+      if ( i == 3 ) {
+        const int numLocSides = bdryCellLocIds_[i].size();
+        for (int j = 0; j < numLocSides; ++j) {
+          const int numCellsSide  = bdryCellLocIds_[i][j].size();
+          if ( numCellsSide ) {
+            const int numCubPerSide = fe_bdry_[i][j]->cubPts().extent_int(1);
+            // Compute cost
+            scalar_view weightU("weightU", numCellsSide, numCubPerSide);
+            for (int k = 0; k < numCellsSide; ++k) {
+              for (int l = 0; l < numCubPerSide; ++l) {
+                weightU(k,l) = static_cast<Real>(1);
+              }
+            }
+            // Compute gradient of squared L2-norm of diff
+            scalar_view intGrad("intGrad", numCellsSide, f);
+            fst::integrate(intGrad, weightU, (fe_bdry_[i][j]->DNDdetJ(1)), false);
+            // Add to integral value
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < f; ++l) {
+                grad(cidx, l) += intGrad(k,l);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::Zero(">>> QoI_ControlCost::gradient_2 is zero.");
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::Zero(">>> QoI_ControlCost::HessVec_11 is zero.");
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::Zero(">>> QoI_ControlCost::HessVec_12 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::Zero(">>> QoI_ControlCost::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::Zero(">>> QoI_ControlCost::HessVec_22 is zero.");
+  }
+
+}; // QoI_State_Cost_2_Poisson_Boltzmann
+
+template <class Real, class DeviceType>
+class QoI_State_Cost_2_Poisson_Boltzmann : public QoI<Real, DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using fe_type = FE<Real, DeviceType>;
+  using fst = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using rst = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  ROL::Ptr<fe_type> fe_;
+
+  scalar_view target_;
+
+  Real targetFunc(const std::vector<Real> & x) const {
+    return static_cast<Real>(1);
+//    int size = x.size();
+//    Real val(0);
+//    for (int i = 0; i < size; ++i) {
+//      val += x[i]*x[i];
+//    }
+//    return val;
+  }
+
+public:
+  QoI_State_Cost_2_Poisson_Boltzmann(const ROL::Ptr<fe_type> & fe) : fe_(fe) {
+    int c = fe_->cubPts().extent_int(0);
+    int p = fe_->cubPts().extent_int(1);
+    int d = fe_->cubPts().extent_int(2);
+    std::vector<Real> pt(d);
+    target_ = scalar_view("target_", c, p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k) {
+          pt[k] = (fe_->cubPts())(i,j,k);
+        }
+        target_(i,j) = targetFunc(pt);
+      }
+    }
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    // Get relevant dimensions
+    int c = u_coeff.extent_int(0);
+    int p = fe_->cubPts().extent_int(1);
+    // Initialize output val
+    val = scalar_view("val", c);
+    // Evaluate state on FE basis
+    scalar_view valU_eval("valU_eval", c, p);
+    fe_->evaluateValue(valU_eval, u_coeff);
+    // Compute difference between state and target
+    rst::subtract(valU_eval, target_);
+    // Compute squared L2-norm of diff
+    fe_->computeIntegral(val, valU_eval, valU_eval);
+    // Scale by one half
+    rst::scale(val, static_cast<Real>(0.5));
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    // Get relevant dimensions
+    int c = u_coeff.extent_int(0);
+    int p = fe_->cubPts().extent_int(1);
+    int f = fe_->N().extent_int(1);
+    // Initialize output grad
+    grad = scalar_view("grad1", c, f);
+    // Evaluate state on FE basis
+    scalar_view valU_eval("valU_eval", c, p);
+    fe_->evaluateValue(valU_eval, u_coeff);
+    // Compute difference between state and target
+    rst::subtract(valU_eval, target_);
+    // Compute gradient of squared L2-norm of diff
+    fst::integrate(grad, valU_eval, fe_->NdetJ(), false);
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::Zero(">>> QoI_State_Cost_Poisson_Boltzmann::gradient_2 is zero.");
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    int c = v_coeff.extent_int(0);
+    int p = fe_->cubPts().extent_int(1);
+    int f = fe_->N().extent_int(1);
+    scalar_view valV_eval("valV_eval", c, p);
+    hess = scalar_view("hess11", c, f);
+    fe_->evaluateValue(valV_eval, v_coeff);
+    fst::integrate(hess, valV_eval, fe_->NdetJ(), false);
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::Zero(">>> QoI_State_Cost_Poisson_Boltzmann::HessVec_12 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::Zero(">>> QoI_State_Cost_Poisson_Boltzmann::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::Zero(">>> QoI_State_Cost_Poisson_Boltzmann::HessVec_22 is zero.");
+  }
+
+}; // QoI_State_Cost
+
+template <class Real, class DeviceType>
+class QoI_Control_Cost_1_Poisson_Boltzmann : public QoI<Real, DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using fe_type = FE<Real, DeviceType>;
+  using fst = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using rst = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type> fe_;
+  scalar_view target_;
+
+public:
+  QoI_Control_Cost_1_Poisson_Boltzmann(const ROL::Ptr<fe_type> & fe,
+                                       const ROL::Ptr<Doping<Real, DeviceType>> & dope)
+    : fe_(fe) {
+    // Compute reference doping profile
+    const int c = fe_->gradN().extent_int(0);
+    const int p = fe_->gradN().extent_int(2);
+    const int d = fe_->gradN().extent_int(3);
+    std::vector<Real> pt(d);
+    target_ = scalar_view("target_", c, p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k) {
+          pt[k] = (fe_->cubPts())(i,j,k);
+        }
+        target_(i,j) = dope->evaluate(pt);
+      }
+    }
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    // Get relevant dimensions
+    const int c = fe_->gradN().extent_int(0);
+    const int p = fe_->gradN().extent_int(2);
+    // Initialize output val
+    val = scalar_view("val", c);
+    // Evaluate state on FE basis
+    scalar_view valZ_eval("valZ_eval", c, p);
+    fe_->evaluateValue(valZ_eval, z_coeff);
+    // Compute difference between control and target
+    rst::subtract(valZ_eval, target_);
+    // Compute squared L2-norm of diff
+    fe_->computeIntegral(val, valZ_eval, valZ_eval);
+    // Scale by one half
+    rst::scale(val, static_cast<Real>(0.5));
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::Zero(">>> QoI_Control_Cost_1_Poisson_Boltzmann::gradient_1 is zero.");
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    // Get relevant dimensions
+    const int c = fe_->gradN().extent_int(0);
+    const int f = fe_->gradN().extent_int(1);
+    const int p = fe_->gradN().extent_int(2);
+    // Initialize output grad
+    grad = scalar_view("grad2", c, f);
+    // Evaluate state on FE basis
+    scalar_view valZ_eval("valZ_eval", c, p);
+    fe_->evaluateValue(valZ_eval, z_coeff);
+    // Compute difference between control and target
+    rst::subtract(valZ_eval, target_);
+    // Compute gradient of squared L2-norm of diff
+    fst::integrate(grad, valZ_eval, fe_->NdetJ(), false);
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::Zero(">>> QoI_Control_Cost_1_Poisson_Boltzmann::HessVec_11 is zero.");
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::Zero(">>> QoI_Control_Cost_1_Poisson_Boltzmann::HessVec_12 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::Zero(">>> QoI_Control_Cost_1_Poisson_Boltzmann::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    const int c = fe_->gradN().extent_int(0);
+    const int f = fe_->gradN().extent_int(1);
+    const int p = fe_->gradN().extent_int(2);
+    scalar_view valV_eval("valV_eval", c, p);
+    hess = scalar_view("hess22", c, f);
+    fe_->evaluateValue(valV_eval, v_coeff);
+    fst::integrate(hess, valV_eval, fe_->NdetJ(), false);
+  }
+
+}; // QoI_Control_Cost_1
+
+template <class Real, class DeviceType>
+class QoI_Control_Cost_2_Poisson_Boltzmann : public QoI<Real, DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using fe_type = FE<Real, DeviceType>;
+  using fst = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using rst = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  ROL::Ptr<fe_type> fe_;
+
+public:
+  QoI_Control_Cost_2_Poisson_Boltzmann(const ROL::Ptr<fe_type> & fe) : fe_(fe) {}
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    // Get relevant dimensions
+    const int c = fe_->gradN().extent_int(0);
+    const int p = fe_->gradN().extent_int(2);
+    const int d = fe_->gradN().extent_int(3);
+    // Initialize output val
+    val = scalar_view("val", c);
+    // Evaluate state on FE basis
+    scalar_view gradZ_eval("gradZ_eval", c, p, d);
+    fe_->evaluateGradient(gradZ_eval, z_coeff);
+    // Compute squared L2-norm of grad
+    fe_->computeIntegral(val, gradZ_eval, gradZ_eval);
+    // Scale by one half
+    rst::scale(val, static_cast<Real>(0.5));
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::Zero(">>> QoI_Control_Cost_2_Poisson_Boltzmann::gradient_1 is zero.");
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    // Get relevant dimensions
+    const int c = fe_->gradN().extent_int(0);
+    const int f = fe_->gradN().extent_int(1);
+    const int p = fe_->gradN().extent_int(2);
+    const int d = fe_->gradN().extent_int(3);
+    // Initialize output grad
+    grad = scalar_view("grad", c, f);
+    // Evaluate state on FE basis
+    scalar_view gradZ_eval("gradZ_eval", c, p, d);
+    fe_->evaluateGradient(gradZ_eval, z_coeff);
+    // Compute gradient of squared L2-norm of gradient
+    fst::integrate(grad, gradZ_eval, fe_->gradNdetJ(), false);
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::Zero(">>> QoI_Control_Cost_2_Poisson_Boltzmann::HessVec_11 is zero.");
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::Zero(">>> QoI_Control_Cost_2_Poisson_Boltzmann::HessVec_12 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    throw Exception::Zero(">>> QoI_Control_Cost_2_Poisson_Boltzmann::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) {
+    const int c = fe_->gradN().extent_int(0);
+    const int f = fe_->gradN().extent_int(1);
+    const int p = fe_->gradN().extent_int(2);
+    const int d = fe_->gradN().extent_int(3);
+    scalar_view gradV_eval("gradV_eval", c, p, d);
+    hess = scalar_view("hess22", c, f);
+    fe_->evaluateGradient(gradV_eval, v_coeff);
+    fst::integrate(hess, gradV_eval, fe_->gradNdetJ(), false);
+  }
+
+}; // QoI_Control_Cost_2
+
+template <class Real>
+class StdObjective_Poisson_Boltzmann : public ROL::StdObjective<Real> {
+private:
+  const Real J_, w1_, w2_, w3_;
+
+public:
+  StdObjective_Poisson_Boltzmann(const Real J, const Real w1, const Real w2, const Real w3)
+    : J_(J), w1_(w1), w2_(w2), w3_(w3) {}
+
+  Real value(const std::vector<Real> &x, Real &tol) {
+    const Real half(0.5);
+    return half*w1_*std::pow(x[0]-J_,2) + w2_*x[1] + w3_*x[2];
+  }
+
+  void gradient(std::vector<Real> &g, const std::vector<Real> &x, Real &tol) {
+    g[0] = w1_*(x[0]-J_);
+    g[1] = w2_;
+    g[2] = w3_;
+  }
+
+  void hessVec(std::vector<Real> &hv, const std::vector<Real> &v, const std::vector<Real> &x, Real &tol) {
+    const Real zero(0);
+    hv[0] = w1_*v[0];
+    hv[1] = zero;
+    hv[2] = zero;
+  }
+
+}; // OBJ_SCALAR
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/poisson-boltzmann/pde_poisson_boltzmannK.hpp b/packages/rol/example/PDE-OPT/poisson-boltzmann/pde_poisson_boltzmannK.hpp
new file mode 100644
index 000000000000..15dc250492ae
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/poisson-boltzmann/pde_poisson_boltzmannK.hpp
@@ -0,0 +1,232 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  pde.hpp
+    \brief Implements the local PDE interface for the Poisson-Boltzmann control problem.
+*/
+
+#ifndef PDE_POISSON_BOLTZMANNK_HPP
+#define PDE_POISSON_BOLTZMANNK_HPP
+
+#include "../TOOLS/pdeK.hpp"
+#include "../TOOLS/feK.hpp"
+
+#include "Intrepid2_HGRAD_QUAD_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_QUAD_C2_FEM.hpp"
+#include "Intrepid2_DefaultCubatureFactory.hpp"
+#include "Intrepid2_FunctionSpaceTools.hpp"
+#include "Intrepid2_CellTools.hpp"
+
+#include "ROL_Ptr.hpp"
+
+template <class Real, class DeviceType>
+class PDE_Poisson_Boltzmann : public PDE<Real, DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type = FE<Real, DeviceType>;
+  using fst = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct = Intrepid2::CellTools<DeviceType>;
+  using rst = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  // Finite element basis information
+  basis_ptr basisPtr_;
+  std::vector<basis_ptr> basisPtrs_;
+  // Cell cubature information
+  ROL::Ptr<Intrepid2::Cubature<DeviceType, Real, Real>> cellCub_;
+  // Cell node information
+  scalar_view volCellNodes_;
+  std::vector<std::vector<scalar_view>> bdryCellNodes_;
+  std::vector<std::vector<std::vector<int>>> bdryCellLocIds_;
+  // Finite element definition
+  ROL::Ptr<fe_type> fe_vol_;
+
+public:
+  PDE_Poisson_Boltzmann(Teuchos::ParameterList &parlist) {
+    // Finite element fields.
+    int basisOrder = parlist.sublist("Problem").get("Order of FE discretization",1);
+    if (basisOrder == 1) {
+      basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C1_FEM<DeviceType, Real, Real>>();
+    }
+    else if (basisOrder == 2) {
+      basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C2_FEM<DeviceType, Real, Real>>();
+    }
+    basisPtrs_.clear(); basisPtrs_.push_back(basisPtr_);
+    // Quadrature rules.
+    shards::CellTopology cellType = basisPtr_->getBaseCellTopology();                  // get the cell type from any basis
+    Intrepid2::DefaultCubatureFactory cubFactory;                                      // create cubature factory
+    int cubDegree = parlist.sublist("PDE Poisson Boltzmann").get("Cubature Degree",2); // set cubature degree, e.g., 2
+    cellCub_ = cubFactory.create<DeviceType, Real, Real>(cellType, cubDegree);         // create default cubature
+  }
+
+  void residual(scalar_view & res,
+                const scalar_view u_coeff,
+                const scalar_view z_coeff = scalar_view(),
+                const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // GET DIMENSIONS
+    int c = fe_vol_->N().extent_int(0);
+    int f = fe_vol_->N().extent_int(1);
+    int p = fe_vol_->N().extent_int(2);
+    int d = cellCub_->getDimension();
+    // INITIALIZE RESIDUAL
+    res = scalar_view("res", c, f);
+    // COMPUTE STIFFNESS TERM
+    scalar_view gradU_eval("gradU_eval", c, p, d);
+    fe_vol_->evaluateGradient(gradU_eval, u_coeff);
+    fst::integrate(res, gradU_eval, fe_vol_->gradNdetJ(), false);
+    // ADD NONLINEAR TERM
+    scalar_view valU_eval("valU_eval", c, p);
+    fe_vol_->evaluateValue(valU_eval, u_coeff);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        (valU_eval)(i,j) = std::exp(-(valU_eval)(i,j));
+      }
+    }
+    fst::integrate(res, valU_eval, fe_vol_->NdetJ(), true);
+    // ADD CONTROL TERM
+    if ( z_coeff != scalar_view() ) {
+      scalar_view valZ_eval("valZ_eval", c, p);
+      fe_vol_->evaluateValue(valZ_eval, z_coeff);
+      rst::scale(valZ_eval, static_cast<Real>(-1));
+      fst::integrate(res, valZ_eval, fe_vol_->NdetJ(), true);
+    }
+  }
+
+  void Jacobian_1(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) override {
+    // GET DIMENSIONS
+    int c = fe_vol_->N().extent_int(0);
+    int f = fe_vol_->N().extent_int(1);
+    int p = fe_vol_->N().extent_int(2);
+    // INITIALIZE JACOBIAN
+    jac = scalar_view("jac1", c, f, f);
+    // COMPUTE STIFFNESS TERM
+    fst::integrate(jac, fe_vol_->gradN(), fe_vol_->gradNdetJ(), false);
+    // ADD NONLINEAR TERM
+    scalar_view valU_eval("valU_eval", c, p);
+    fe_vol_->evaluateValue(valU_eval, u_coeff);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        (valU_eval)(i,j) = -std::exp(-(valU_eval)(i,j));
+      }
+    }
+    scalar_view NexpU("NexpU", c, f, p);
+    fst::scalarMultiplyDataField(NexpU, valU_eval, fe_vol_->N());
+    fst::integrate(jac, NexpU, fe_vol_->NdetJ(), true);
+  }
+
+  void Jacobian_2(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) override {
+    if ( z_coeff != scalar_view() ) {
+      // GET DIMENSIONS
+      int c = fe_vol_->N().extent_int(0);
+      int f = fe_vol_->N().extent_int(1);
+      // INITIALIZE JACOBIAN
+      jac = scalar_view("jac2", c, f, f);
+      // ADD CONTROL TERM
+      fst::integrate(jac, fe_vol_->N(), fe_vol_->NdetJ(), false);
+      rst::scale(jac, static_cast<Real>(-1));
+    }
+  }
+
+  void Hessian_11(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) override {
+    // GET DIMENSIONS
+    int c = fe_vol_->N().extent_int(0);
+    int f = fe_vol_->N().extent_int(1);
+    int p = fe_vol_->N().extent_int(2);
+    // INITIALIZE HESSIAN
+    hess = scalar_view("hess11", c, f, f);
+    // COMPUTE NONLINEAR TERM
+    scalar_view valU_eval("valU_eval", c, p);
+    fe_vol_->evaluateValue(valU_eval, u_coeff);
+    scalar_view valL_eval("valL_eval", c, p);
+    fe_vol_->evaluateValue(valL_eval, l_coeff);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        (valU_eval)(i,j) = (valL_eval)(i,j)*std::exp(-(valU_eval)(i,j));
+      }
+    }
+    scalar_view NLexpU("NLexpU", c, f, p);
+    fst::scalarMultiplyDataField(NLexpU, valU_eval, fe_vol_->N());
+    fst::integrate(hess, NLexpU, fe_vol_->NdetJ(), false);
+  }
+
+  void Hessian_12(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Poisson_Boltzmann:Hessian_12: Hessian is zero.");
+  }
+
+  void Hessian_21(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Poisson_Boltzmann:Hessian_21: Hessian is zero.");
+  }
+
+  void Hessian_22(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Poisson_Boltzmann:Hessian_22: Hessian is zero.");
+  }
+
+  void RieszMap_1(scalar_view & riesz) override {
+    // GET DIMENSIONS
+    int c = fe_vol_->N().extent_int(0);
+    int f = fe_vol_->N().extent_int(1);
+    // INITIALIZE RIESZ
+    riesz = scalar_view("riesz1", c, f, f);
+    Kokkos::deep_copy(riesz, fe_vol_->stiffMat());
+    rst::add(riesz,fe_vol_->massMat());
+  }
+
+  void RieszMap_2(scalar_view & riesz) override {
+    // GET DIMENSIONS
+    int c = fe_vol_->N().extent_int(0);
+    int f = fe_vol_->N().extent_int(1);
+    // INITIALIZE RIESZ
+    riesz = scalar_view("riesz2", c, f, f);
+    Kokkos::deep_copy(riesz, fe_vol_->massMat());
+  }
+
+  std::vector<basis_ptr> getFields() override {
+    return basisPtrs_;
+  }
+
+  void setCellNodes(const scalar_view & volCellNodes,
+                    const std::vector<std::vector<scalar_view>> & bdryCellNodes,
+                    const std::vector<std::vector<std::vector<int>>> & bdryCellLocIds) override {
+    volCellNodes_ = volCellNodes;
+    bdryCellNodes_ = bdryCellNodes;
+    bdryCellLocIds_ = bdryCellLocIds;
+    // Finite element definition.
+    fe_vol_ = ROL::makePtr<fe_type>(volCellNodes_, basisPtr_, cellCub_);
+  }
+
+  const ROL::Ptr<fe_type> getFE(void) const {
+    return fe_vol_;
+  }
+
+}; // PDE_Poisson_Boltzmann
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/poisson-boltzmann/pde_poisson_boltzmann_ex02K.hpp b/packages/rol/example/PDE-OPT/poisson-boltzmann/pde_poisson_boltzmann_ex02K.hpp
new file mode 100644
index 000000000000..3d6871c91bd8
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/poisson-boltzmann/pde_poisson_boltzmann_ex02K.hpp
@@ -0,0 +1,961 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  pde.hpp
+    \brief Implements the local PDE interface for the Poisson-Boltzmann control problem.
+*/
+
+#ifndef PDE_POISSON_BOLTZMANN_EX02K_HPP
+#define PDE_POISSON_BOLTZMANN_EX02K_HPP
+
+#include "../TOOLS/pdeK.hpp"
+#include "../TOOLS/feK.hpp"
+
+#include "Intrepid2_HGRAD_QUAD_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_QUAD_C2_FEM.hpp"
+#include "Intrepid2_DefaultCubatureFactory.hpp"
+#include "Intrepid2_FunctionSpaceTools.hpp"
+#include "Intrepid2_CellTools.hpp"
+
+#include "ROL_Ptr.hpp"
+
+template <class Real, class DeviceType>
+class PDE_Poisson_Boltzmann_ex02 : public PDE<Real, DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type = FE<Real, DeviceType>;
+  using fst = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct = Intrepid2::CellTools<DeviceType>;
+  using rst = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  // Finite element basis information
+  basis_ptr basisPtr_;
+  std::vector<basis_ptr> basisPtrs_;
+  // Cell cubature information
+  ROL::Ptr<Intrepid2::Cubature<DeviceType, Real, Real> > cellCub_;
+  ROL::Ptr<Intrepid2::Cubature<DeviceType, Real, Real> > bdryCub_;
+  // Cell node information
+  scalar_view volCellNodes_;
+  std::vector<std::vector<scalar_view>> bdryCellNodes_;
+  std::vector<std::vector<std::vector<int>>> bdryCellLocIds_;
+  // Finite element definition
+  ROL::Ptr<fe_type> fe_vol_;
+  std::vector<std::vector<ROL::Ptr<fe_type>>> fe_bdry_;
+  // Local degrees of freedom on boundary, for each side of the reference cell (first index).
+  std::vector<std::vector<int>> fidx_;
+
+  Real wx_, wy_, dx_;
+  Real uScale_, vScale_;
+  bool useRobin_;
+  Real robinCoeff_;
+
+public:
+  PDE_Poisson_Boltzmann_ex02(Teuchos::ParameterList &parlist) {
+    // Finite element fields.
+    int basisOrder = parlist.sublist("Problem").get("Order of FE discretization",1);
+    if (basisOrder == 1) {
+      basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C1_FEM<DeviceType, Real, Real>>();
+    }
+    else if (basisOrder == 2) {
+      basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C2_FEM<DeviceType, Real, Real>>();
+    }
+    basisPtrs_.clear(); basisPtrs_.push_back(basisPtr_);
+    // Quadrature rules.
+    shards::CellTopology cellType = basisPtr_->getBaseCellTopology();                  // get the cell type from any basis
+    Intrepid2::DefaultCubatureFactory cubFactory;                                      // create cubature factory
+    int cubDegree = parlist.sublist("PDE Poisson Boltzmann").get("Cubature Degree",2); // set cubature degree, e.g., 2
+    cellCub_ = cubFactory.create<DeviceType, Real, Real>(cellType, cubDegree);         // create default cubature
+
+    int d = cellType.getDimension();
+    shards::CellTopology bdryCellType = cellType.getCellTopologyData(d-1, 0);
+    int bdryCubDegree = parlist.sublist("Problem").get("Boundary Cubature Degree",2); // set cubature degree, e.g., 2
+    bdryCub_ = cubFactory.create<DeviceType, Real, Real>(bdryCellType, bdryCubDegree);
+
+    wx_      = parlist.sublist("Geometry").get("Width",0.6);
+    wy_      = parlist.sublist("Geometry").get("Height",0.2);
+    dx_      = wx_/static_cast<Real>(6);
+    uScale_  = parlist.sublist("Problem").get("Electron Density",1.0);
+    vScale_  = parlist.sublist("Problem").get("Hole Density",1.0);
+
+    useRobin_ = parlist.sublist("Problem").get("Use Robin Conditions",false);
+    robinCoeff_ = parlist.sublist("Problem").get("Robin Coefficient",1e2);
+  }
+
+  void residual(scalar_view & res,
+                const scalar_view u_coeff,
+                const scalar_view z_coeff = scalar_view(),
+                const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // GET DIMENSIONS
+    const int c = fe_vol_->gradN().extent_int(0);
+    const int f = fe_vol_->gradN().extent_int(1);
+    const int p = fe_vol_->gradN().extent_int(2);
+    const int d = fe_vol_->gradN().extent_int(3);
+    // INITIALIZE RESIDUAL
+    res = scalar_view("", c, f);
+    // COMPUTE PDE COEFFICIENTS
+    scalar_view lambda2("lambda2", c, p), delta2("delta2", c, p), scale("scale", c, p);
+    computeCoefficients(lambda2,delta2,scale);
+    // ADD STIFFNESS TERM TO RESIDUAL
+    scalar_view gradU_eval("gradU_eval", c, p, d);
+    fe_vol_->evaluateGradient(gradU_eval, u_coeff);
+    scalar_view lambda2_gradU_eval("lambda2_gradU_eval", c, p, d);
+    fst::tensorMultiplyDataData(lambda2_gradU_eval, lambda2, gradU_eval);
+    fst::integrate(res, lambda2_gradU_eval, fe_vol_->gradNdetJ(), false);
+    // ADD NONLINEAR TERM TO RESIDUAL
+    scalar_view valU_eval("valU_eval", c, p);
+    fe_vol_->evaluateValue(valU_eval, u_coeff);
+    scalar_view phi_valU_eval("phi_valU_eval", c, p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        phi_valU_eval(i,j) = delta2(i,j)*(uScale_*std::exp((valU_eval)(i,j)) - vScale_*std::exp(-(valU_eval)(i,j)));
+      }
+    }
+    fst::integrate(res, phi_valU_eval, fe_vol_->NdetJ(), true);
+    // ADD CONTROL TERM TO RESIDUAL
+    scalar_view valZ_eval("valZ_eval", c, p);
+    fe_vol_->evaluateValue(valZ_eval, z_coeff);
+    scalar_view valZ_scal("valZ_scal", c, p);
+    fst::scalarMultiplyDataData(valZ_scal, scale, valZ_eval);
+    fst::integrate(res, valZ_scal, fe_vol_->NdetJ(), true);
+    // APPLY DIRICHLET CONDITIONS
+    std::vector<std::vector<scalar_view>> bdryCellDofValues;
+    computeDirichlet(bdryCellDofValues);
+    const int numSideSets = bdryCellLocIds_.size();
+    for (int i = 0; i < numSideSets; ++i) {
+      if ( i==1 || i==2 || i==3 ) {
+        int numLocalSideIds = bdryCellLocIds_[i].size();
+        for (int j = 0; j < numLocalSideIds; ++j) {
+          int numCellsSide = bdryCellLocIds_[i][j].size();
+          if (!useRobin_) {
+            int numBdryDofs = fidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                res(cidx,fidx_[j][l])
+                  = u_coeff(cidx,fidx_[j][l]) - (bdryCellDofValues[i][j])(k,l);
+              }
+            }
+          }
+          else {
+            const int numCubPerSide = bdryCub_->getNumPoints();
+            if (numCellsSide) {
+              // Get U and Z coefficients on Robin boundary
+              scalar_view u_coeff_bdry, z_coeff_bdry;
+              u_coeff_bdry = getBoundaryCoeff(u_coeff, i, j);
+              z_coeff_bdry = getBoundaryCoeff(z_coeff, i, j);
+              // Evaluate U and Z on FE basis
+              scalar_view valU_eval_bdry, valZ_eval_bdry;
+              valU_eval_bdry = scalar_view("valU_eval_bdry", numCellsSide, numCubPerSide);
+              valZ_eval_bdry = scalar_view("valZ_eval_bdry", numCellsSide, numCubPerSide);
+              fe_bdry_[i][j]->evaluateValue(valU_eval_bdry, u_coeff_bdry);
+              fe_bdry_[i][j]->evaluateValue(valZ_eval_bdry, z_coeff_bdry);
+              // Compute Robin residual
+              scalar_view robinRes("robinRes", numCellsSide, f);
+              scalar_view robinVal("robinVal", numCellsSide, numCubPerSide);
+              computeRobin(robinVal,valU_eval_bdry,valZ_eval_bdry,i,j,0);
+              fst::integrate(robinRes, robinVal, (fe_bdry_[i][j])->NdetJ(), false);
+              // Add Stefan-Boltzmann residual to volume residual
+              for (int k = 0; k < numCellsSide; ++k) {
+                int cidx = bdryCellLocIds_[i][j][k];
+                for (int l = 0; l < f; ++l) {
+                  res(cidx,l) += robinRes(k,l);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  void Jacobian_1(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // GET DIMENSIONS
+    const int c = fe_vol_->gradN().extent_int(0);
+    const int f = fe_vol_->gradN().extent_int(1);
+    const int p = fe_vol_->gradN().extent_int(2);
+    const int d = fe_vol_->gradN().extent_int(3);
+    // INITIALIZE JACOBIAN
+    jac = scalar_view("jac1", c, f, f);
+    // COMPUTE PDE COEFFICIENTS
+    scalar_view lambda2("lambda2", c, p), delta2("delta2", c, p), scale("scale", c, p);
+    computeCoefficients(lambda2, delta2, scale);
+    // ADD STIFFNESS TERM TO JACOBIAN
+    scalar_view lambda2_gradN_eval("lambda2_gradN_eval", c, f, p, d);
+    fst::tensorMultiplyDataField(lambda2_gradN_eval, lambda2, fe_vol_->gradN());
+    // COMPUTE STIFFNESS TERM
+    fst::integrate(jac, lambda2_gradN_eval, fe_vol_->gradNdetJ(), false);
+    // ADD NONLINEAR TERM
+    scalar_view valU_eval("valU_eval", c, p);
+    fe_vol_->evaluateValue(valU_eval, u_coeff);
+    scalar_view dphi_valU_eval("dphi_valU_eval", c, p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        dphi_valU_eval(i,j) = delta2(i,j)*(uScale_*std::exp((valU_eval)(i,j)) + vScale_*std::exp(-(valU_eval)(i,j)));
+      }
+    }
+    scalar_view NexpU("NexpU", c, f, p);
+    fst::scalarMultiplyDataField(NexpU, dphi_valU_eval, fe_vol_->N());
+    fst::integrate(jac, NexpU, fe_vol_->NdetJ(), true);
+    // APPLY DIRICHLET CONDITIONS
+    const int numSideSets = bdryCellLocIds_.size();
+    for (int i = 0; i < numSideSets; ++i) {
+      if ( i==1 || i==2 || i==3 ) {
+        int numLocalSideIds = bdryCellLocIds_[i].size();
+        for (int j = 0; j < numLocalSideIds; ++j) {
+          int numCellsSide = bdryCellLocIds_[i][j].size();
+          if (!useRobin_) {
+            int numBdryDofs = fidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                //std::cout << "\n   j=" << j << "  l=" << l << "  " << fidx[j][l];
+                for (int m = 0; m < f; ++m) {
+                  jac(cidx, fidx_[j][l], m) = static_cast<Real>(0);
+                }
+                jac(cidx, fidx_[j][l], fidx_[j][l]) = static_cast<Real>(1);
+              }
+            }
+          }
+          else {
+            const int numCubPerSide = bdryCub_->getNumPoints();
+            if (numCellsSide) {
+              // Get U and Z coefficients on Robin boundary
+              scalar_view u_coeff_bdry, z_coeff_bdry;
+              u_coeff_bdry = getBoundaryCoeff(u_coeff, i, j);
+              z_coeff_bdry = getBoundaryCoeff(z_coeff, i, j);
+              // Evaluate U and Z on FE basis
+              scalar_view valU_eval_bdry, valZ_eval_bdry;
+              valU_eval_bdry = scalar_view("valU_eval_bdry", numCellsSide, numCubPerSide);
+              valZ_eval_bdry = scalar_view("valZ_eval_bdry", numCellsSide, numCubPerSide);
+              fe_bdry_[i][j]->evaluateValue(valU_eval_bdry, u_coeff_bdry);
+              fe_bdry_[i][j]->evaluateValue(valZ_eval_bdry, z_coeff_bdry);
+              // Compute Robin residual
+              scalar_view robinVal_N("robinVal_N", numCellsSide, f, numCubPerSide);
+              scalar_view robinJac("robinJac", numCellsSide, f, f);
+              scalar_view robinVal("robinVal", numCellsSide, numCubPerSide);
+              computeRobin(robinVal, valU_eval_bdry, valZ_eval_bdry, i, j, 1, 1);
+              fst::scalarMultiplyDataField(robinVal_N, robinVal, (fe_bdry_[i][j])->N());
+              fst::integrate(robinJac, robinVal_N, (fe_bdry_[i][j])->NdetJ(), false);
+              // Add Stefan-Boltzmann residual to volume residual
+              for (int k = 0; k < numCellsSide; ++k) {
+                int cidx = bdryCellLocIds_[i][j][k];
+                for (int l = 0; l < f; ++l) {
+                  for (int m = 0; m < f; ++m) {
+                    jac(cidx, l, m) += robinJac(k, l, m);
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  void Jacobian_2(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // GET DIMENSIONS
+    const int c = fe_vol_->gradN().extent_int(0);
+    const int f = fe_vol_->gradN().extent_int(1);
+    const int p = fe_vol_->gradN().extent_int(2);
+    // INITIALIZE JACOBIAN
+    jac = scalar_view("jac2", c, f, f);
+    // COMPUTE PDE COEFFICIENTS
+    scalar_view lambda2("lambda2", c, p), delta2("delta2", c, p), scale("scale", c, p);
+    computeCoefficients(lambda2,delta2,scale);
+    // ADD CONTROL TERM
+    scalar_view valN_scal("valN_scal", c, f, p);
+    fst::scalarMultiplyDataField(valN_scal, scale, fe_vol_->N());
+    fst::integrate(jac, valN_scal, fe_vol_->NdetJ(), false);
+    // APPLY DIRICHLET CONDITIONS
+    const int numSideSets = bdryCellLocIds_.size();
+    for (int i = 0; i < numSideSets; ++i) {
+      if ( i==1 || i==2 || i==3 ) {
+        int numLocalSideIds = bdryCellLocIds_[i].size();
+        for (int j = 0; j < numLocalSideIds; ++j) {
+          int numCellsSide = bdryCellLocIds_[i][j].size();
+          if (!useRobin_) {
+            int numBdryDofs = fidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                //std::cout << "\n   j=" << j << "  l=" << l << "  " << fidx[j][l];
+                for (int m = 0; m < f; ++m) {
+                  jac(cidx, fidx_[j][l], m) = static_cast<Real>(0);
+                }
+              }
+            }
+          }
+          else {
+            const int numCubPerSide = bdryCub_->getNumPoints();
+            if (numCellsSide) {
+              // Get U and Z coefficients on Robin boundary
+              scalar_view u_coeff_bdry, z_coeff_bdry;
+              u_coeff_bdry = getBoundaryCoeff(u_coeff, i, j);
+              z_coeff_bdry = getBoundaryCoeff(z_coeff, i, j);
+              // Evaluate U and Z on FE basis
+              scalar_view valU_eval_bdry, valZ_eval_bdry;
+              valU_eval_bdry = scalar_view("valU_eval_bdry", numCellsSide, numCubPerSide);
+              valZ_eval_bdry = scalar_view("valZ_eval_bdry", numCellsSide, numCubPerSide);
+              (fe_bdry_[i][j])->evaluateValue(valU_eval_bdry, u_coeff_bdry);
+              (fe_bdry_[i][j])->evaluateValue(valZ_eval_bdry, z_coeff_bdry);
+              // Compute Robin residual
+              scalar_view robinVal_N("robinVal_N", numCellsSide, f, numCubPerSide);
+              scalar_view robinJac("robinJac", numCellsSide, f, f);
+              scalar_view robinVal("robinVal", numCellsSide, numCubPerSide);
+              computeRobin(robinVal, valU_eval_bdry, valZ_eval_bdry, i, j, 1, 2);
+              fst::scalarMultiplyDataField(robinVal_N, robinVal, (fe_bdry_[i][j])->N());
+              fst::integrate(robinJac, robinVal_N, (fe_bdry_[i][j])->NdetJ(), false);
+              // Add Stefan-Boltzmann residual to volume residual
+              for (int k = 0; k < numCellsSide; ++k) {
+                int cidx = bdryCellLocIds_[i][j][k];
+                for (int l = 0; l < f; ++l) {
+                  for (int m = 0; m < f; ++m) {
+                    jac(cidx, l, m) += robinJac(k, l, m);
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  void Hessian_11(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // GET DIMENSIONS
+    const int c = fe_vol_->gradN().extent_int(0);
+    const int f = fe_vol_->gradN().extent_int(1);
+    const int p = fe_vol_->gradN().extent_int(2);
+    // INITIALIZE HESSIAN
+    hess = scalar_view("hess11", c, f, f);
+    // COMPUTE PDE COEFFICIENTS
+    scalar_view lambda2("lambda2", c, p), delta2("delta2", c, p), scale("scale", c, p);
+    computeCoefficients(lambda2,delta2,scale);
+    // APPLY DIRICHLET CONDITIONS
+    scalar_view l_dbc("l_dbc", c, p);
+    Kokkos::deep_copy(l_dbc, l_coeff);
+    const int numSideSets = bdryCellLocIds_.size();
+    for (int i = 0; i < numSideSets; ++i) {
+      if ( i==1 || i==2 || i==3 ) {
+        int numLocalSideIds = bdryCellLocIds_[i].size();
+        for (int j = 0; j < numLocalSideIds; ++j) {
+          int numCellsSide = bdryCellLocIds_[i][j].size();
+          int numBdryDofs = fidx_[j].size();
+          for (int k = 0; k < numCellsSide; ++k) {
+            int cidx = bdryCellLocIds_[i][j][k];
+            for (int l = 0; l < numBdryDofs; ++l) {
+              l_dbc(cidx, fidx_[j][l]) = static_cast<Real>(0);
+            }
+          }
+        }
+      }
+    }
+    // COMPUTE NONLINEAR TERM
+    scalar_view valU_eval("valU_eval", c, p);
+    fe_vol_->evaluateValue(valU_eval, u_coeff);
+    scalar_view valL_eval("valL_eval", c, p);
+    fe_vol_->evaluateValue(valL_eval, l_dbc);
+    scalar_view d2phi_valU_eval("d2phi_valU_eval", c, p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        d2phi_valU_eval(i,j) = valL_eval(i,j)*delta2(i,j)
+                              *(uScale_*std::exp(valU_eval(i,j))-vScale_*std::exp(-valU_eval(i,j)));
+      }
+    }
+    scalar_view NLexpU("NLexpU", c, f, p);
+    fst::scalarMultiplyDataField(NLexpU, d2phi_valU_eval, fe_vol_->N());
+    fst::integrate(hess, NLexpU, fe_vol_->NdetJ(), false);
+  }
+
+  void Hessian_12(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Poisson_Boltzmann_ex02:Hessian_12: Hessian is zero.");
+  }
+
+  void Hessian_21(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Poisson_Boltzmann_ex02:Hessian_21: Hessian is zero.");
+  }
+
+  void Hessian_22(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Poisson_Boltzmann_ex02:Hessian_22: Hessian is zero.");
+  }
+
+  void RieszMap_1(scalar_view & riesz) override {
+    // GET DIMENSIONS
+    const int c = fe_vol_->N().extent_int(0);
+    const int f = fe_vol_->N().extent_int(1);
+    // INITIALIZE RIESZ
+    riesz = scalar_view("riesz1", c, f, f);
+    Kokkos::deep_copy(riesz, fe_vol_->stiffMat());
+    rst::add(riesz, fe_vol_->massMat());
+  }
+
+  void RieszMap_2(scalar_view & riesz) override {
+    // GET DIMENSIONS
+    const int c = fe_vol_->N().extent_int(0);
+    const int f = fe_vol_->N().extent_int(1);
+    // INITIALIZE RIESZ
+    riesz = scalar_view("riesz2", c, f, f);
+    Kokkos::deep_copy(riesz, fe_vol_->stiffMat());
+    rst::add(riesz, fe_vol_->massMat());
+  }
+
+  std::vector<basis_ptr> getFields() override {
+    return basisPtrs_;
+  }
+
+  void setCellNodes(const scalar_view & volCellNodes,
+                    const std::vector<std::vector<scalar_view>> & bdryCellNodes,
+                    const std::vector<std::vector<std::vector<int>>> & bdryCellLocIds) override {
+    volCellNodes_ = volCellNodes;
+    bdryCellNodes_ = bdryCellNodes;
+    bdryCellLocIds_ = bdryCellLocIds;
+    // Finite element definition.
+    fe_vol_ = ROL::makePtr<fe_type>(volCellNodes_, basisPtr_, cellCub_);
+    // Set local boundary DOFs.
+    fidx_ = fe_vol_->getBoundaryDofs();
+    // Construct boundary FEs
+    const int numSidesets = bdryCellNodes.size();
+    fe_bdry_.resize(numSidesets);
+    for(int i = 0; i < numSidesets; ++i) {
+      int numLocSides = bdryCellNodes[i].size();
+      fe_bdry_[i].resize(numLocSides);
+      for (int j = 0; j < numLocSides; ++j) {
+        if (bdryCellNodes_[i][j] != scalar_view()) {
+          fe_bdry_[i][j] = ROL::makePtr<fe_type>(bdryCellNodes_[i][j], basisPtr_, bdryCub_, j);
+        }
+      }
+    }
+  }
+
+  const ROL::Ptr<fe_type> getFE(void) const {
+    return fe_vol_;
+  }
+
+  const std::vector<std::vector<ROL::Ptr<fe_type>>> getBdryFE(void) const {
+    return fe_bdry_;
+  }
+
+  const scalar_view getCellNodes(void) const {
+    return volCellNodes_;
+  }
+
+  const std::vector<std::vector<std::vector<int>>> getBdryCellLocIds(void) const {
+    return bdryCellLocIds_;
+  }
+
+private:
+
+  Real evaluateLambda2(const std::vector<Real> &x) const {
+    const std::vector<Real> param = PDE<Real, DeviceType>::getParameter();
+    Real p0 = (param.size()>0 ? param[0] : static_cast<Real>(-1.5));
+    return static_cast<Real>(2.5)*std::pow(static_cast<Real>(10), p0);
+  }
+
+  Real evaluateDelta2(const std::vector<Real> &x) const {
+    const std::vector<Real> param = PDE<Real, DeviceType>::getParameter();
+    Real p1 = (param.size()>1 ? param[1] : static_cast<Real>(-0.5));
+    return static_cast<Real>(1.45)*std::pow(static_cast<Real>(10), p1);
+  }
+
+  Real evaluateScale(const std::vector<Real> &x) const {
+    return 1;
+//    const std::vector<Real> param = PDE<Real>::getParameter();
+//    const Real Y1 = static_cast<Real>(1) + static_cast<Real>(5)*(wy_-x[1])*param[2];
+//    const Real Y2 = static_cast<Real>(1) + static_cast<Real>(5)*(wy_-x[1])*param[3];
+//    const Real X1 = phi(x[0]);
+//    const Real X2 = static_cast<Real>(1)-X1;
+//    return Y1*X1 + Y2*X2;
+  }
+
+  Real phi(const Real x) const {
+    const Real zero(0), one(1), two(2);
+    const Real eps = std::sqrt(ROL::ROL_EPSILON<Real>());
+    const Real m = one/(wx_-two*dx_);
+    const Real v = (x < dx_+eps ? zero
+                   : (x > wx_-dx_-eps ? one
+                     : m*(x-dx_)));
+    return v;
+  }
+
+  void computeCoefficients(scalar_view & lambda2,
+                           scalar_view & delta2,
+                           scalar_view & scale) const {
+    // GET DIMENSIONS
+    const int c = fe_vol_->gradN().extent_int(0);
+    const int p = fe_vol_->gradN().extent_int(2);
+    const int d = fe_vol_->gradN().extent_int(3);
+    std::vector<Real> pt(d);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for ( int k = 0; k < d; ++k) {
+          pt[k] = (fe_vol_->cubPts())(i,j,k);
+        }
+        // Compute scaled Debye length
+        lambda2(i,j) = evaluateLambda2(pt);
+        delta2(i,j)  = evaluateDelta2(pt);
+        scale(i,j)   = -evaluateScale(pt);
+      }
+    }
+  }
+
+  Real evaluateRobin(const Real u, const Real z, const std::vector<Real> &x,
+                     const int sideset, const int locSideId,
+                     const int deriv = 0, const int component = 1) const {
+    const Real four(4), two(2), one(1);
+    Real C(0);
+    if (sideset==1 || sideset==3) {
+      C = static_cast<Real>(1);
+    }
+    if (sideset==2) {
+      C = static_cast<Real>(0.3);
+    }
+    Real f0 = std::log((C+std::sqrt(C*C+four))/two);
+    Real f1 = one/std::sqrt(C*C+four);
+    if ( deriv == 1 ) {
+      return (component==1) ? robinCoeff_ : -robinCoeff_ * f1;
+    }
+    if ( deriv > 1 ) {
+      return static_cast<Real>(0);
+    }
+    return robinCoeff_ * (u - (f0 + f1*(z-C)));
+  }
+
+  void computeRobin(scalar_view & robin,
+                    const scalar_view u,
+                    const scalar_view z,
+                    const int sideset,
+                    const int locSideId,
+                    const int deriv = 0,
+                    const int component = 1) const {
+    const int c = u.extent_int(0);
+    const int p = u.extent_int(1);
+    const int d = fe_vol_->gradN().extent_int(3);
+    std::vector<Real> pt(d);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k) {
+          pt[k] = (fe_bdry_[sideset][locSideId]->cubPts())(i, j, k);
+        }
+        robin(i,j) = evaluateRobin(u(i,j), z(i,j), pt, sideset, locSideId, deriv, component);
+      }
+    }
+  }
+
+  Real evaluateDirichlet(const std::vector<Real> & coords, int sideset, int locSideId) const {
+    Real val(0);
+    if (sideset==1) {
+      val = static_cast<Real>(0.436);
+    }
+    if (sideset==2) {
+      val = static_cast<Real>(0.407);
+    }
+    if (sideset==3) {
+      val = static_cast<Real>(0.436);
+    }
+    return val;
+  }
+
+  void computeDirichlet(std::vector<std::vector<scalar_view>> & bdryCellDofValues) const {
+    // Compute Dirichlet values at DOFs.
+    int d = basisPtr_->getBaseCellTopology().getDimension();
+    int numSidesets = bdryCellLocIds_.size();
+    bdryCellDofValues.resize(numSidesets);
+    for (int i=0; i<numSidesets; ++i) {
+      int numLocSides = bdryCellLocIds_[i].size();
+      bdryCellDofValues[i].resize(numLocSides);
+      for (int j=0; j<numLocSides; ++j) {
+        int c = bdryCellLocIds_[i][j].size();
+        int f = basisPtr_->getCardinality();
+        bdryCellDofValues[i][j] = scalar_view("bdryCellDofValues", c, f);
+        scalar_view coords("coords", c, f, d);
+        if (c > 0) {
+          fe_vol_->computeDofCoords(coords, bdryCellNodes_[i][j]);
+        }
+        for (int k=0; k<c; ++k) {
+          for (int l=0; l<f; ++l) {
+            std::vector<Real> dofpoint(d);
+            for (int m=0; m<d; ++m) {
+              dofpoint[m] = coords(k, l, m);
+            }
+            (bdryCellDofValues[i][j])(k, l) = evaluateDirichlet(dofpoint, i, j);
+          }
+        }
+      }
+    }
+  }
+
+  scalar_view getBoundaryCoeff(
+      const scalar_view cell_coeff,
+      int sideSet, int cell) const {
+    std::vector<int> bdryCellLocId = bdryCellLocIds_[sideSet][cell];
+    const int numCellsSide = bdryCellLocId.size();
+    const int f = basisPtr_->getCardinality();
+
+    scalar_view bdry_coeff("bdry_coeff", numCellsSide, f);
+    for (int i = 0; i < numCellsSide; ++i) {
+      for (int j = 0; j < f; ++j) {
+        bdry_coeff(i, j) = cell_coeff(bdryCellLocId[i], j);
+      }
+    }
+    return bdry_coeff;
+  }
+
+}; // PDE_Poisson_Boltzmann
+
+template <class Real, class DeviceType>
+class PDE_Doping : public PDE<Real, DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type = FE<Real, DeviceType>;
+  using fst = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct = Intrepid2::CellTools<DeviceType>;
+  using rst = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  // Finite element basis information
+  basis_ptr basisPtr_;
+  std::vector<basis_ptr> basisPtrs_;
+  // Cell cubature information
+  ROL::Ptr<Intrepid2::Cubature<DeviceType, Real, Real>> cellCub_;
+  ROL::Ptr<Intrepid2::Cubature<DeviceType, Real, Real>> bdryCub_;
+  // Cell node information
+  scalar_view volCellNodes_;
+  std::vector<std::vector<scalar_view>> bdryCellNodes_;
+  std::vector<std::vector<std::vector<int>>> bdryCellLocIds_;
+  // Finite element definition
+  ROL::Ptr<fe_type> fe_vol_;
+  std::vector<std::vector<ROL::Ptr<fe_type>>> fe_bdry_;
+  // Local degrees of freedom on boundary, for each side of the reference cell (first index).
+  std::vector<std::vector<int>> fidx_;
+  Real a_, b_;
+
+public:
+  PDE_Doping(Teuchos::ParameterList &parlist) {
+    // Finite element fields.
+    int basisOrder = parlist.sublist("Problem").get("Order of FE discretization",1);
+    if (basisOrder == 1) {
+      basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C1_FEM<DeviceType, Real, Real>>();
+    }
+    else if (basisOrder == 2) {
+      basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C2_FEM<DeviceType, Real, Real>>();
+    }
+    basisPtrs_.clear(); basisPtrs_.push_back(basisPtr_);
+    // Quadrature rules.
+    shards::CellTopology cellType = basisPtr_->getBaseCellTopology();                  // get the cell type from any basis
+    Intrepid2::DefaultCubatureFactory cubFactory;                                      // create cubature factory
+    int cubDegree = parlist.sublist("PDE Poisson Boltzmann").get("Cubature Degree",2); // set cubature degree, e.g., 2
+    cellCub_ = cubFactory.create<DeviceType, Real, Real>(cellType, cubDegree);         // create default cubature
+
+    int d = cellType.getDimension();
+    shards::CellTopology bdryCellType = cellType.getCellTopologyData(d-1, 0);
+    int bdryCubDegree = parlist.sublist("Problem").get("Boundary Cubature Degree",2); // set cubature degree, e.g., 2
+    bdryCub_ = cubFactory.create<DeviceType, Real, Real>(bdryCellType, bdryCubDegree);
+    a_  = parlist.sublist("Problem").get("Desired Lower Doping Value", 0.3);
+    b_  = parlist.sublist("Problem").get("Desired Upper Doping Value", 1.0);
+  }
+
+  void residual(scalar_view & res,
+                const scalar_view u_coeff,
+                const scalar_view z_coeff = scalar_view(),
+                const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    const std::vector<Real> param = PDE<Real, DeviceType>::getParameter();
+    Real cond = (param.size()>2 ? param[2] : static_cast<Real>(2e-5));
+    // GET DIMENSIONS
+    const int c = fe_vol_->gradN().extent_int(0);
+    const int f = fe_vol_->gradN().extent_int(1);
+    const int p = fe_vol_->gradN().extent_int(2);
+    const int d = fe_vol_->gradN().extent_int(3);
+    // INITIALIZE RESIDUAL
+    res = scalar_view("res", c, f);
+    // EVALUATE STATE AND CONTROL OF FEM BASIS
+    scalar_view valU_eval, valZ_eval, gradU_eval;
+    valU_eval  = scalar_view("valU_eval", c, p);
+    valZ_eval  = scalar_view("valZ_eval", c, p);
+    gradU_eval = scalar_view("gradU_eval", c, p, d);
+    fe_vol_->evaluateValue(valU_eval, u_coeff);
+    fe_vol_->evaluateValue(valZ_eval, z_coeff);
+    fe_vol_->evaluateGradient(gradU_eval, u_coeff);
+    // ADD STIFFNESS TERM TO RESIDUAL
+    scalar_view lambda2_gradU_eval("lambda2_gradU_eval", c, p, d);
+    rst::scale(lambda2_gradU_eval, gradU_eval, cond);
+    fst::integrate(res, lambda2_gradU_eval, fe_vol_->gradNdetJ(), false);
+    // ADD REACTION TERM TO RESIDUAL
+    fst::integrate(res, valU_eval, fe_vol_->NdetJ(), true);
+    // ADD CONTROL TERM TO RESIDUAL
+    scalar_view valZ_scal("valZ_scal", c, p);
+    rst::scale(valZ_scal, valZ_eval, static_cast<Real>(-1));
+    fst::integrate(res, valZ_scal, fe_vol_->NdetJ(), true);
+    // APPLY DIRICHLET CONDITIONS
+    std::vector<std::vector<scalar_view>> bdryCellDofValues;
+    computeDirichlet(bdryCellDofValues);
+    const int numSideSets = bdryCellLocIds_.size();
+    for (int i = 0; i < numSideSets; ++i) {
+      if ( i==1 || i==2 || i==3 ) {
+        int numLocalSideIds = bdryCellLocIds_[i].size();
+        for (int j = 0; j < numLocalSideIds; ++j) {
+          int numCellsSide = bdryCellLocIds_[i][j].size();
+          int numBdryDofs = fidx_[j].size();
+          for (int k = 0; k < numCellsSide; ++k) {
+            int cidx = bdryCellLocIds_[i][j][k];
+            for (int l = 0; l < numBdryDofs; ++l) {
+              res(cidx, fidx_[j][l]) = u_coeff(cidx, fidx_[j][l]) - (bdryCellDofValues[i][j])(k,l);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  void Jacobian_1(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    const std::vector<Real> param = PDE<Real, DeviceType>::getParameter();
+    Real cond = (param.size()>2 ? param[2] : static_cast<Real>(2e-5));
+    // GET DIMENSIONS
+    const int c = fe_vol_->gradN().extent_int(0);
+    const int f = fe_vol_->gradN().extent_int(1);
+    const int p = fe_vol_->gradN().extent_int(2);
+    const int d = fe_vol_->gradN().extent_int(3);
+    // INITIALIZE JACOBIAN
+    jac = scalar_view("jac1", c, f, f);
+    // ADD STIFFNESS TERM TO JACOBIAN
+    scalar_view lambda2_gradN_eval("lambda2_gradN_eval", c, f, p, d);
+    rst::scale(lambda2_gradN_eval, fe_vol_->gradN(), cond);
+    fst::integrate(jac, lambda2_gradN_eval, fe_vol_->gradNdetJ(), false);
+    // ADD REACTION TERM TO JACOBIAN
+    fst::integrate(jac, fe_vol_->N(), fe_vol_->NdetJ(), true);
+    // APPLY DIRICHLET CONDITIONS
+    const int numSideSets = bdryCellLocIds_.size();
+    for (int i = 0; i < numSideSets; ++i) {
+      if ( i==1 || i==2 || i==3 ) {
+        int numLocalSideIds = bdryCellLocIds_[i].size();
+        for (int j = 0; j < numLocalSideIds; ++j) {
+          int numCellsSide = bdryCellLocIds_[i][j].size();
+          int numBdryDofs = fidx_[j].size();
+          for (int k = 0; k < numCellsSide; ++k) {
+            int cidx = bdryCellLocIds_[i][j][k];
+            for (int l = 0; l < numBdryDofs; ++l) {
+              //std::cout << "\n   j=" << j << "  l=" << l << "  " << fidx[j][l];
+              for (int m = 0; m < f; ++m) {
+                jac(cidx, fidx_[j][l], m) = static_cast<Real>(0);
+              }
+              jac(cidx, fidx_[j][l], fidx_[j][l]) = static_cast<Real>(1);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  void Jacobian_2(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // GET DIMENSIONS
+    const int c = fe_vol_->gradN().extent_int(0);
+    const int f = fe_vol_->gradN().extent_int(1);
+    const int p = fe_vol_->gradN().extent_int(2);
+    // INITIALIZE JACOBIAN
+    jac = scalar_view("jac2", c, f, f);
+    // ADD CONTROL TERM
+    scalar_view valN_scal("valN_scal", c, f, p);
+    rst::scale(valN_scal, fe_vol_->N(), static_cast<Real>(-1));
+    fst::integrate(jac, valN_scal, fe_vol_->NdetJ(), false);
+    // APPLY DIRICHLET CONDITIONS
+    const int numSideSets = bdryCellLocIds_.size();
+    for (int i = 0; i < numSideSets; ++i) {
+      if ( i==1 || i==2 || i==3 ) {
+        int numLocalSideIds = bdryCellLocIds_[i].size();
+        for (int j = 0; j < numLocalSideIds; ++j) {
+          int numCellsSide = bdryCellLocIds_[i][j].size();
+          int numBdryDofs = fidx_[j].size();
+          for (int k = 0; k < numCellsSide; ++k) {
+            int cidx = bdryCellLocIds_[i][j][k];
+            for (int l = 0; l < numBdryDofs; ++l) {
+              //std::cout << "\n   j=" << j << "  l=" << l << "  " << fidx[j][l];
+              for (int m = 0; m < f; ++m) {
+                jac(cidx, fidx_[j][l], m) = static_cast<Real>(0);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  void Hessian_11(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Poisson_Boltzmann_ex02:Hessian_11: Hessian is zero.");
+  }
+
+  void Hessian_12(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Poisson_Boltzmann_ex02:Hessian_12: Hessian is zero.");
+  }
+
+  void Hessian_21(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Poisson_Boltzmann_ex02:Hessian_21: Hessian is zero.");
+  }
+
+  void Hessian_22(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Poisson_Boltzmann_ex02:Hessian_22: Hessian is zero.");
+  }
+
+  void RieszMap_1(scalar_view & riesz) override {
+    // GET DIMENSIONS
+    const int c = fe_vol_->N().extent_int(0);
+    const int f = fe_vol_->N().extent_int(1);
+    // INITIALIZE RIESZ
+    riesz = scalar_view("riesz1", c, f, f);
+    Kokkos::deep_copy(riesz, fe_vol_->stiffMat());
+    rst::add(riesz, fe_vol_->massMat());
+  }
+
+  void RieszMap_2(scalar_view & riesz) override {
+    // GET DIMENSIONS
+    const int c = fe_vol_->N().extent_int(0);
+    const int f = fe_vol_->N().extent_int(1);
+    // INITIALIZE RIESZ
+    riesz = scalar_view("riesz2", c, f, f);
+    Kokkos::deep_copy(riesz, fe_vol_->stiffMat());
+    rst::add(riesz, fe_vol_->massMat());
+  }
+
+  std::vector<basis_ptr> getFields() override {
+    return basisPtrs_;
+  }
+
+  void setCellNodes(const scalar_view & volCellNodes,
+                    const std::vector<std::vector<scalar_view>> & bdryCellNodes,
+                    const std::vector<std::vector<std::vector<int>>> & bdryCellLocIds) override {
+    volCellNodes_ = volCellNodes;
+    bdryCellNodes_ = bdryCellNodes;
+    bdryCellLocIds_ = bdryCellLocIds;
+    // Finite element definition.
+    fe_vol_ = ROL::makePtr<fe_type>(volCellNodes_, basisPtr_, cellCub_);
+    // Set local boundary DOFs.
+    fidx_ = fe_vol_->getBoundaryDofs();
+    // Construct boundary FEs
+    const int numSidesets = bdryCellNodes.size();
+    fe_bdry_.resize(numSidesets);
+    for(int i = 0; i < numSidesets; ++i) {
+      int numLocSides = bdryCellNodes[i].size();
+      fe_bdry_[i].resize(numLocSides);
+      for (int j = 0; j < numLocSides; ++j) {
+        if (bdryCellNodes_[i][j] != scalar_view()) {
+          fe_bdry_[i][j] = ROL::makePtr<fe_type>(bdryCellNodes_[i][j], basisPtr_, bdryCub_, j);
+        }
+      }
+    }
+  }
+
+  const ROL::Ptr<fe_type> getFE(void) const {
+    return fe_vol_;
+  }
+
+  const std::vector<std::vector<fe_type>> getBdryFE(void) const {
+    return fe_bdry_;
+  }
+
+  const scalar_view getCellNodes(void) const {
+    return volCellNodes_;
+  }
+
+  const std::vector<std::vector<std::vector<int>>> getBdryCellLocIds(void) const {
+    return bdryCellLocIds_;
+  }
+
+private:
+
+  Real evaluateDirichlet(const std::vector<Real> & coords, int sideset, int locSideId) const {
+    Real val(0);
+    if (sideset==1) {
+      val = a_;
+    }
+    if (sideset==2) {
+      val = b_;
+    }
+    if (sideset==3) {
+      val = a_;
+    }
+    return val;
+  }
+
+  void computeDirichlet(std::vector<std::vector<scalar_view>> & bdryCellDofValues) const {
+    // Compute Dirichlet values at DOFs.
+    int d = basisPtr_->getBaseCellTopology().getDimension();
+    int numSidesets = bdryCellLocIds_.size();
+    bdryCellDofValues.resize(numSidesets);
+    for (int i=0; i<numSidesets; ++i) {
+      int numLocSides = bdryCellLocIds_[i].size();
+      bdryCellDofValues[i].resize(numLocSides);
+      for (int j=0; j<numLocSides; ++j) {
+        int c = bdryCellLocIds_[i][j].size();
+        int f = basisPtr_->getCardinality();
+        bdryCellDofValues[i][j] = scalar_view("bdryCellDofValues", c, f);
+        scalar_view coords("coords", c, f, d);
+        if (c > 0) {
+          fe_vol_->computeDofCoords(coords, bdryCellNodes_[i][j]);
+        }
+        for (int k=0; k<c; ++k) {
+          for (int l=0; l<f; ++l) {
+            std::vector<Real> dofpoint(d);
+            for (int m=0; m<d; ++m) {
+              dofpoint[m] = coords(k, l, m);
+            }
+            (bdryCellDofValues[i][j])(k, l) = evaluateDirichlet(dofpoint, i, j);
+          }
+        }
+      }
+    }
+  }
+
+}; // PDE_Poisson_Boltzmann
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/poisson/CMakeLists.txt b/packages/rol/example/PDE-OPT/poisson/CMakeLists.txt
index 3389c3d90e37..c694553c5221 100644
--- a/packages/rol/example/PDE-OPT/poisson/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/poisson/CMakeLists.txt
@@ -8,7 +8,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   # Need ROL_TpetraMultiVector.hpp
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
 
-  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  ROL_ADD_EXECUTABLE_AND_TEST(
     example_01
     SOURCES example_01.cpp
     ARGS PrintItAll
@@ -18,7 +18,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     PoissonDataCopy
     SOURCE_FILES
       input.xml plotresults.m p-cube-4x4x4.txt p-cube-4x4x4.e p-cube-8x8x8.txt p-cube-8x8x8.e
@@ -28,3 +28,33 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
 
 
 ENDIF()
+
+IF(${PROJECT_NAME}_ENABLE_Intrepid2 AND
+   ${PROJECT_NAME}_ENABLE_Ifpack2  AND
+   ${PROJECT_NAME}_ENABLE_MueLu    AND
+   ${PROJECT_NAME}_ENABLE_Amesos2  AND
+   ${PROJECT_NAME}_ENABLE_Tpetra )
+
+  # Need ROL_TpetraMultiVector.hpp
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
+
+  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+    example_01_Kokkos
+    SOURCES example_01_K.cpp
+    ARGS PrintItAll
+    NUM_MPI_PROCS 4
+    NUM_TOTAL_CORES_USED 4
+    PASS_REGULAR_EXPRESSION "TEST PASSED"
+    ADD_DIR_TO_NAME
+  )
+
+  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+    PoissonDataCopyK
+    SOURCE_FILES
+      input.xml plotresults.m p-cube-4x4x4.txt p-cube-4x4x4.e p-cube-8x8x8.txt p-cube-8x8x8.e
+    SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}"
+    DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
+  )
+
+
+ENDIF()
diff --git a/packages/rol/example/PDE-OPT/poisson/example_01.cpp b/packages/rol/example/PDE-OPT/poisson/example_01.cpp
index 5f9bd379caa7..4164e19703b2 100644
--- a/packages/rol/example/PDE-OPT/poisson/example_01.cpp
+++ b/packages/rol/example/PDE-OPT/poisson/example_01.cpp
@@ -13,8 +13,7 @@
 
 #include "Teuchos_Comm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -74,7 +73,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int>> comm
     = Tpetra::getDefaultComm();
   const int myRank = comm->getRank();
diff --git a/packages/rol/example/PDE-OPT/poisson/example_01_K.cpp b/packages/rol/example/PDE-OPT/poisson/example_01_K.cpp
new file mode 100644
index 000000000000..92a05095f733
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/poisson/example_01_K.cpp
@@ -0,0 +1,195 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  example_01.cpp
+    \brief Shows how to solve the Poisson control problem.
+*/
+
+#include "Teuchos_Comm.hpp"
+#include "ROL_Stream.hpp"
+#include "Teuchos_GlobalMPISession.hpp"
+#include "Teuchos_XMLParameterListHelpers.hpp"
+
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Version.hpp"
+
+#include <iostream>
+#include <algorithm>
+
+#include "ROL_Reduced_Objective_SimOpt.hpp"
+#include "ROL_Solver.hpp"
+
+#include "../TOOLS/meshreaderK.hpp"
+#include "../TOOLS/linearpdeconstraintK.hpp"
+#include "../TOOLS/pdeobjectiveK.hpp"
+#include "../TOOLS/pdevectorK.hpp"
+#include "pde_poissonK.hpp"
+#include "obj_poissonK.hpp"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+
+template<class Real>
+class stateSolution : public Solution<Real> {
+public:
+  stateSolution(void) {}
+  Real evaluate(const std::vector<Real> &x, const int fieldNumber) const {
+    const Real pi(M_PI);
+    Real u(1);
+    int dim = x.size();
+    for (int i=0; i<dim; ++i) {
+      u *= std::sin(pi*x[i]);
+    }
+    return u;
+  }
+};
+
+template<class Real>
+class controlSolution : public Solution<Real> {
+private:
+  const Real alpha_;
+public:
+  controlSolution(const Real alpha) : alpha_(alpha) {}
+  Real evaluate(const std::vector<Real> &x, const int fieldNumber) const {
+    const Real eight(8), pi(M_PI);
+    Real z(1);
+    int dim = x.size();
+    for (int i=0; i<dim; ++i) {
+      z *= std::sin(eight*pi*x[i]);
+    }
+    Real coeff1(64), coeff2(dim);
+    return -z/(alpha_*coeff1*coeff2*pi*pi);
+  }
+};
+
+int main(int argc, char *argv[]) {
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+
+  /*** Initialize communicator. ***/
+  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  Kokkos::ScopeGuard kokkosScope (argc, argv);
+  
+  ROL::Ptr<const Teuchos::Comm<int>> comm
+    = Tpetra::getDefaultComm();
+  const int myRank = comm->getRank();
+  if ((iprint > 0) && (myRank == 0)) {
+    outStream = ROL::makePtrFromRef(std::cout);
+  }
+  else {
+    outStream = ROL::makePtrFromRef(bhs);
+  }
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+
+    /*** Read in XML input ***/
+    std::string filename = "input.xml";
+    auto parlist = ROL::getParametersFromXmlFile(filename);
+
+    /*** Initialize main data structure. ***/
+    int probDim = parlist->sublist("Problem").get("Problem Dimension",2);
+    ROL::Ptr<MeshManager<RealT,DeviceT>> meshMgr;
+    if (probDim == 1) {
+      meshMgr = ROL::makePtr<MeshManager_Interval<RealT,DeviceT>>(*parlist);
+    } else if (probDim == 2) {
+      meshMgr = ROL::makePtr<MeshManager_Rectangle<RealT,DeviceT>>(*parlist);
+    } else if (probDim == 3) {
+      meshMgr = ROL::makePtr<MeshReader<RealT,DeviceT>>(*parlist);
+    }
+    else {
+      TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument,
+        ">>> PDE-OPT/poisson/example_01.cpp: Problem dim is not 1, 2 or 3!");
+    }
+    // Initialize PDE describe Poisson's equation
+    ROL::Ptr<PDE_Poisson<RealT,DeviceT>>
+      pde = ROL::makePtr<PDE_Poisson<RealT,DeviceT>>(*parlist);
+    ROL::Ptr<ROL::Constraint_SimOpt<RealT>>
+      con = ROL::makePtr<Linear_PDE_Constraint<RealT,DeviceT>>(pde,meshMgr,comm,*parlist,*outStream);
+    // Cast the constraint and get the assembler.
+    ROL::Ptr<Linear_PDE_Constraint<RealT,DeviceT>>
+      pdecon = ROL::dynamicPtrCast<Linear_PDE_Constraint<RealT,DeviceT>>(con);
+    ROL::Ptr<Assembler<RealT,DeviceT>> assembler = pdecon->getAssembler();
+    // Initialize quadratic objective function
+    std::vector<ROL::Ptr<QoI<RealT,DeviceT>>> qoi_vec(2,ROL::nullPtr);
+    qoi_vec[0] = ROL::makePtr<QoI_L2Tracking_Poisson<RealT,DeviceT>>(pde->getFE());
+    qoi_vec[1] = ROL::makePtr<QoI_L2Penalty_Poisson<RealT,DeviceT>>(pde->getFE());
+    RealT alpha = parlist->sublist("Problem").get("Control penalty parameter",1e-2);
+    std::vector<RealT> wt(2); wt[0] = static_cast<RealT>(1); wt[1] = alpha;
+    ROL::Ptr<ROL::Objective_SimOpt<RealT>>
+      obj = ROL::makePtr<PDE_Objective<RealT,DeviceT>>(qoi_vec,wt,assembler);
+
+    // Create state vector and set to zeroes
+    ROL::Ptr<Tpetra::MultiVector<>> u_ptr, z_ptr, p_ptr, r_ptr;
+    ROL::Ptr<ROL::Vector<RealT>> up, zp, pp, rp;
+    u_ptr  = assembler->createStateVector();   u_ptr->putScalar(0.0);
+    z_ptr  = assembler->createControlVector(); z_ptr->putScalar(0.0);
+    p_ptr  = assembler->createStateVector();   p_ptr->putScalar(0.0);
+    r_ptr  = assembler->createStateVector();   r_ptr->putScalar(0.0);
+    up  = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(u_ptr,pde,assembler);
+    zp  = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(z_ptr,pde,assembler);
+    pp  = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(p_ptr,pde,assembler);
+    rp  = ROL::makePtr<PDE_DualSimVector<RealT,DeviceT>>(r_ptr,pde,assembler);
+
+    // Initialize reduced objective function
+    ROL::Ptr<ROL::Reduced_Objective_SimOpt<RealT>>
+      robj = ROL::makePtr<ROL::Reduced_Objective_SimOpt<RealT>>(obj, con, up, zp, pp);
+
+    // Build optimization problem and check derivatives
+    ROL::Ptr<ROL::Problem<RealT>>
+      optProb = ROL::makePtr<ROL::Problem<RealT>>(robj,zp);
+    optProb->check(true,*outStream);
+
+    // Build optimization solver and solve
+    zp->zero(); up->zero(); pp->zero();
+    ROL::Solver<RealT> optSolver(optProb,*parlist);
+    std::clock_t timerTR = std::clock();
+    optSolver.solve(*outStream);
+    *outStream << "Trust Region Time: "
+               << static_cast<RealT>(std::clock()-timerTR)/static_cast<RealT>(CLOCKS_PER_SEC)
+               << " seconds." << std::endl << std::endl;
+
+    // Compute solution error
+    RealT tol(1.e-8);
+    con->solve(*rp,*up,*zp,tol);
+    ROL::Ptr<Solution<RealT>>
+      usol = ROL::makePtr<stateSolution<RealT>>();
+    RealT uerr = assembler->computeStateError(u_ptr,usol);
+    *outStream << "State Error: " << uerr << std::endl;
+    ROL::Ptr<Solution<RealT>>
+      zsol = ROL::makePtr<controlSolution<RealT>>(alpha);
+    RealT zerr = assembler->computeControlError(z_ptr,zsol);
+    *outStream << "Control Error: " << zerr << std::endl;
+
+    // Output.
+    pdecon->outputTpetraVector(u_ptr,"state.txt");
+    pdecon->outputTpetraVector(z_ptr,"control.txt");
+    pdecon->outputTpetraData();
+    assembler->printMeshData(*outStream);
+
+    errorFlag += (uerr > 10. ? 1 : 0);
+
+    // Get a summary from the time monitor.
+    Teuchos::TimeMonitor::summarize();
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/poisson/obj_poissonK.hpp b/packages/rol/example/PDE-OPT/poisson/obj_poissonK.hpp
new file mode 100644
index 000000000000..c520b118d0b0
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/poisson/obj_poissonK.hpp
@@ -0,0 +1,240 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  obj.hpp
+    \brief Provides the interface for local (cell-based) objective function computations.
+*/
+
+#ifndef PDEOPT_QOI_L2TRACKING_POISSONK_HPP
+#define PDEOPT_QOI_L2TRACKING_POISSONK_HPP
+
+#include "../TOOLS/qoiK.hpp"
+#include "pde_poissonK.hpp"
+
+template <class Real,class DeviceType>
+class QoI_L2Tracking_Poisson : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using fe_type = FE<Real,DeviceType>;
+  using fst = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using rst = Intrepid2::RealSpaceTools<DeviceType>;
+private:
+  ROL::Ptr<fe_type> fe_;
+
+  scalar_view target_;
+
+  Real targetFunc(const std::vector<Real> & x) const {
+    int size = x.size();
+    const Real eight(8), pi(M_PI);
+    Real s1(1), s2(1);
+    for (int i = 0; i < size; ++i) {
+      s1 *= std::sin(eight*pi*x[i]);
+      s2 *= std::sin(pi*x[i]);
+    }
+    return -s1 + s2;
+  }
+
+public:
+  QoI_L2Tracking_Poisson(const ROL::Ptr<fe_type> &fe) : fe_(fe) {
+    int c = fe_->cubPts().extent_int(0);
+    int p = fe_->cubPts().extent_int(1);
+    int d = fe_->cubPts().extent_int(2);
+    std::vector<Real> pt(d);
+    target_ = scalar_view("target_",c,p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k) {
+          pt[k] = fe_->cubPts()(i,j,k);
+        }
+        target_(i,j) = targetFunc(pt);
+      }
+    }
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = u_coeff.extent_int(0);
+    int p = fe_->cubPts().extent_int(1);
+    // Initialize output val
+    val = scalar_view("val",c);
+    // Evaluate state on FE basis
+    scalar_view valU_eval("valU_eval", c, p);
+    fe_->evaluateValue(valU_eval, u_coeff);
+    // Compute difference between state and target
+    rst::subtract(valU_eval,target_);
+    // Compute squared L2-norm of diff
+    fe_->computeIntegral(val,valU_eval,valU_eval);
+    // Scale by one half
+    rst::scale(val,static_cast<Real>(0.5));
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr)  override {
+    // Get relevant dimensions
+    int c = u_coeff.extent_int(0);
+    int p = fe_->cubPts().extent_int(1);
+    int f = fe_->N().extent_int(1);
+    // Initialize output grad
+    grad = scalar_view("grad", c, f);
+    // Evaluate state on FE basis
+    scalar_view valU_eval("valU_eval", c, p);
+    fe_->evaluateValue(valU_eval, u_coeff);
+    // Compute difference between state and target
+    rst::subtract(valU_eval,target_);
+    // Compute gradient of squared L2-norm of diff
+    fst::integrate(grad,valU_eval,fe_->NdetJ(), false);
+  }
+
+  void gradient_2(scalar_view & grad,
+                  scalar_view u_coeff,
+                  scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_L2Tracking_Poisson::gradient_2 is zero.");
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    int c = v_coeff.extent_int(0);
+    int p = fe_->cubPts().extent_int(1);
+    int f = fe_->N().extent_int(1);
+    scalar_view valV_eval("valV_eval",c, p);
+    hess = scalar_view("hess", c, f);
+    fe_->evaluateValue(valV_eval, v_coeff);
+    fst::integrate(hess,valV_eval,fe_->NdetJ(), false);
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_L2Tracking_Poisson::HessVec_12 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_L2Tracking_Poisson::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_L2Tracking_Poisson::HessVec_22 is zero.");
+  }
+
+}; // QoI_L2Tracking
+
+template <class Real, class DeviceType>
+class QoI_L2Penalty_Poisson : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using fe_type = FE<Real,DeviceType>;
+  using fst = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using rst = Intrepid2::RealSpaceTools<DeviceType>;
+private:
+  ROL::Ptr<fe_type> fe_;
+
+public:
+  QoI_L2Penalty_Poisson(const ROL::Ptr<fe_type> &fe) : fe_(fe) {}
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = z_coeff.extent_int(0);
+    int p = fe_->cubPts().extent_int(1);
+    // Initialize output val
+    val = scalar_view("val",c);
+    // Build local state tracking term
+    scalar_view valZ_eval("valZ", c, p);
+    fe_->evaluateValue(valZ_eval, z_coeff);
+    fe_->computeIntegral(val,valZ_eval,valZ_eval);
+    rst::scale(val,static_cast<Real>(0.5));
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_L2Tracking_Poisson::gradient_1 is zero.");
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = z_coeff.extent_int(0);
+    int p = fe_->cubPts().extent_int(1);
+    int f = fe_->N().extent_int(1);
+    // Initialize output grad
+    grad = scalar_view("grad", c, f);
+    // Build local gradient of state tracking term
+    scalar_view valZ_eval("valZ_eval", c, p);
+    fe_->evaluateValue(valZ_eval, z_coeff);
+    fst::integrate(grad, valZ_eval, fe_->NdetJ(), false);
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_L2Penalty_Poisson::HessVec_11 is zero.");
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_L2Penalty_Poisson::HessVec_12 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_L2Penalty_Poisson::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    int c = v_coeff.extent_int(0);
+    int p = fe_->cubPts().extent_int(1);
+    int f = fe_->N().extent_int(1);
+    scalar_view valV_eval("valV", c, p);
+    hess = scalar_view("hess", c, f);
+    fe_->evaluateValue(valV_eval, v_coeff);
+    fst::integrate(hess, valV_eval, fe_->NdetJ(), false);
+  }
+
+}; // QoI_L2Penalty
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/poisson/pde_poissonK.hpp b/packages/rol/example/PDE-OPT/poisson/pde_poissonK.hpp
new file mode 100644
index 000000000000..cb51276fd5cf
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/poisson/pde_poissonK.hpp
@@ -0,0 +1,367 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  pde.hpp
+    \brief Implements the local PDE interface for the Poisson control problem.
+*/
+
+#ifndef PDE_POISSONK_HPP
+#define PDE_POISSONK_HPP
+
+#include "../TOOLS/pdeK.hpp"
+#include "../TOOLS/feK.hpp"
+
+#include "Intrepid2_HGRAD_LINE_Cn_FEM.hpp"
+#include "Intrepid2_HGRAD_QUAD_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_QUAD_C2_FEM.hpp"
+#include "Intrepid2_HGRAD_HEX_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_HEX_C2_FEM.hpp"
+#include "Intrepid2_DefaultCubatureFactory.hpp"
+#include "Intrepid2_FunctionSpaceTools.hpp"
+#include "Intrepid2_CellTools.hpp"
+
+#include "ROL_Ptr.hpp"
+
+template <class Real, class DeviceType>
+class PDE_Poisson : public PDE<Real, DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type = FE<Real,DeviceType>;
+  using fst = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct = Intrepid2::CellTools<DeviceType>;
+  using rst = Intrepid2::RealSpaceTools<DeviceType>;
+private:
+  // Finite element basis information
+  basis_ptr basisPtr_;
+  std::vector<basis_ptr> basisPtrs_;
+  // Cell cubature information
+  ROL::Ptr<Intrepid2::Cubature<DeviceType, Real, Real>> cellCub_;
+  // Cell node information
+  scalar_view volCellNodes_;
+  std::vector<std::vector<scalar_view>> bdryCellNodes_;
+  std::vector<std::vector<std::vector<int>>> bdryCellLocIds_;
+  // Finite element definition
+  ROL::Ptr<fe_type> fe_vol_;
+  // Local degrees of freedom on boundary, for each side of the reference cell (first index).
+  std::vector<std::vector<int>> fidx_;
+  // Coordinates of degrees freedom on boundary cells.
+  // Indexing:  [sideset number][local side id](cell number, value at dof)
+  std::vector<std::vector<scalar_view>> bdryCellDofValues_;
+
+  bool useStateRiesz_;
+  bool useControlRiesz_;
+  Real alpha_;
+
+  Real DirichletFunc(const std::vector<Real> & coords, int sideset, int locSideId) const {
+    return 0;
+  }
+
+  Real evaluateRHS(const std::vector<Real> &x) const {
+    const Real pi(M_PI), eight(8);
+    Real s1(1), s2(1);
+    int dim = x.size();
+    for (int i=0; i<dim; ++i) {
+      s1 *= std::sin(eight*pi*x[i]);
+      s2 *= std::sin(pi*x[i]);
+    }
+    Real coeff1(64), coeff2(dim);
+    return s1/(alpha_*coeff1*coeff2*pi*pi) + coeff2*pi*pi*s2;
+  }
+
+  void computeRHS(scalar_view rhs) const {
+    // GET DIMENSIONS
+    int c = fe_vol_->gradN().extent_int(0);
+    int p = fe_vol_->gradN().extent_int(2);
+    int d = fe_vol_->gradN().extent_int(3);
+    std::vector<Real> pt(d);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for ( int k = 0; k < d; ++k) {
+          pt[k] = fe_vol_->cubPts()(i,j,k);
+        }
+        // Compute forcing term f
+        rhs(i,j) = -evaluateRHS(pt);
+      }
+    }
+  }
+
+public:
+  PDE_Poisson(Teuchos::ParameterList &parlist) {
+    // Finite element fields.
+    int basisOrder = parlist.sublist("Problem").get("Basis Order",1);
+    int cubDegree  = parlist.sublist("Problem").get("Cubature Degree",4);
+    int probDim    = parlist.sublist("Problem").get("Problem Dimension",2);
+    if (probDim > 3 || probDim < 1) {
+      TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument,
+        ">>> PDE-OPT/poisson/pde_poisson.hpp: Problem dimension is not 1, 2 or 3!");
+    }
+    if (basisOrder > 2 || basisOrder < 1) {
+      TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument,
+        ">>> PDE-OPT/poisson/pde_poisson.hpp: Basis order is not 1 or 2!");
+    }
+    if (probDim == 1) {
+      basisPtr_ = Teuchos::rcp(new Intrepid2::Basis_HGRAD_LINE_Cn_FEM<DeviceType, Real, Real>(basisOrder,Intrepid2::POINTTYPE_EQUISPACED));
+    } else if (probDim == 2) {
+      if (basisOrder == 1) {
+        basisPtr_ = Teuchos::rcp(new Intrepid2::Basis_HGRAD_QUAD_C1_FEM<DeviceType, Real, Real>);
+      }
+      else if (basisOrder == 2) {
+        basisPtr_ = Teuchos::rcp(new Intrepid2::Basis_HGRAD_QUAD_C2_FEM<DeviceType, Real, Real>);
+      }
+    } else if (probDim == 3) {
+      if (basisOrder == 1) {
+        basisPtr_ = Teuchos::rcp(new Intrepid2::Basis_HGRAD_HEX_C1_FEM<DeviceType, Real, Real>);
+      }
+      else if (basisOrder == 2) {
+        basisPtr_ = Teuchos::rcp(new Intrepid2::Basis_HGRAD_HEX_C2_FEM<DeviceType, Real, Real>);
+      }
+    }
+    basisPtrs_.clear(); basisPtrs_.push_back(basisPtr_);
+    // Quadrature rules.
+    shards::CellTopology cellType = basisPtr_->getBaseCellTopology();
+    Intrepid2::DefaultCubatureFactory cubFactory;
+    cellCub_ = cubFactory.create<DeviceType, Real, Real>(cellType, cubDegree);
+    // Problem data.
+    useStateRiesz_   = parlist.sublist("Problem").get("Use State Riesz Map", true);      // use Riesz map for state variables?
+    useControlRiesz_ = parlist.sublist("Problem").get("Use Control Riesz Map", true);    // use Riesz map for control variables?
+    alpha_           = parlist.sublist("Problem").get("Control penalty parameter",1e-2);
+  }
+
+  void residual(scalar_view & res,
+                const scalar_view u_coeff,
+                const scalar_view z_coeff = ROL::nullPtr,
+                const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // GET DIMENSIONS
+    int c = u_coeff.extent_int(0);
+    int p = cellCub_->getNumPoints();
+    int f = basisPtr_->getCardinality();
+    int d = cellCub_->getDimension();
+    // INITIALIZE RESIDUAL
+    res = scalar_view("res", c, f);
+    // COMPUTE STIFFNESS TERM
+    scalar_view gradU_eval("gradU_eval", c, p, d);
+    fe_vol_->evaluateGradient(gradU_eval, u_coeff);
+    fst::integrate(res, gradU_eval, fe_vol_->gradNdetJ(), false);
+    // COMPUTE RHS
+    scalar_view rhs("rhs", c, p);
+    computeRHS(rhs);
+    fst::integrate(res, rhs, fe_vol_->NdetJ(), true);
+    // ADD CONTROL TERM TO RESIDUAL
+    if ( z_coeff.is_allocated()) {
+      scalar_view valZ_eval("valZ_eval", c, p);
+      fe_vol_->evaluateValue(valZ_eval, z_coeff);
+      rst::scale(valZ_eval,static_cast<Real>(-1));
+      fst::integrate(res, valZ_eval, fe_vol_->NdetJ(), true);
+    }
+    // APPLY DIRICHLET CONDITIONS
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        int numLocalSideIds = bdryCellLocIds_[i].size();
+        for (int j = 0; j < numLocalSideIds; ++j) {
+          int numCellsSide = bdryCellLocIds_[i][j].size();
+          int numBdryDofs = fidx_[j].size();
+          for (int k = 0; k < numCellsSide; ++k) {
+            int cidx = bdryCellLocIds_[i][j][k];
+            for (int l = 0; l < numBdryDofs; ++l) {
+              //std::cout << "\n   j=" << j << "  l=" << l << "  " << fidx_[j][l];
+              res(cidx,fidx_[j][l]) = u_coeff(cidx,fidx_[j][l]) - (bdryCellDofValues_[i][j])(k,fidx_[j][l]);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  void Jacobian_1(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) override {
+    // GET DIMENSIONS
+    int c = u_coeff.extent_int(0);
+    int f = basisPtr_->getCardinality();
+    // INITILAIZE JACOBIAN
+    jac = scalar_view("jac", c, f, f);
+    // COMPUTE STIFFNESS TERM
+    fst::integrate(jac, fe_vol_->gradN(), fe_vol_->gradNdetJ(), false);
+    // APPLY DIRICHLET CONDITIONS
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        int numLocalSideIds = bdryCellLocIds_[i].size();
+        for (int j = 0; j < numLocalSideIds; ++j) {
+          int numCellsSide = bdryCellLocIds_[i][j].size();
+          int numBdryDofs = fidx_[j].size();
+          for (int k = 0; k < numCellsSide; ++k) {
+            int cidx = bdryCellLocIds_[i][j][k];
+            for (int l = 0; l < numBdryDofs; ++l) {
+              //std::cout << "\n   j=" << j << "  l=" << l << "  " << fidx[j][l];
+              for (int m = 0; m < f; ++m) {
+                jac(cidx,fidx_[j][l],m) = static_cast<Real>(0);
+              }
+              jac(cidx,fidx_[j][l],fidx_[j][l]) = static_cast<Real>(1);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  void Jacobian_2(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real> > & z_param = ROL::nullPtr) override {
+    if ( z_coeff.is_allocated() ) {
+      // GET DIMENSIONS
+      int c = u_coeff.extent_int(0);
+      int f = basisPtr_->getCardinality();
+      // INITIALIZE JACOBIAN
+      jac = scalar_view("jac", c, f, f);
+      // ADD CONTROL TERM
+      fst::integrate(jac, fe_vol_->N(), fe_vol_->NdetJ(), false);
+      rst::scale(jac, static_cast<Real>(-1));
+      // APPLY DIRICHLET CONDITIONS
+      int numSideSets = bdryCellLocIds_.size();
+      if (numSideSets > 0) {
+        for (int i = 0; i < numSideSets; ++i) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numBdryDofs = fidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                //std::cout << "\n   j=" << j << "  l=" << l << "  " << fidx[j][l];
+                for (int m = 0; m < f; ++m) {
+                  jac(cidx,fidx_[j][l],m) = static_cast<Real>(0);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    else {
+      throw Exception::Zero(">>> (PDE_Poisson::Jacobian_2): Jacobian is zero.");
+    }
+  }
+
+  void Hessian_11(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Poisson::Hessian_11): Hessian is zero.");
+  }
+
+  void Hessian_12(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Poisson::Hessian_12): Hessian is zero.");
+  }
+
+  void Hessian_21(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Poisson::Hessian_21): Hessian is zero.");
+  }
+
+  void Hessian_22(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Poisson::Hessian_22): Hessian is zero.");
+  }
+
+  void RieszMap_1(scalar_view & riesz) override {
+    // Optionally disable Riesz map ...
+    if (!useStateRiesz_) {
+      throw Exception::NotImplemented(">>> (PDE_Poisson::RieszMap_1): Not implemented.");
+    }
+
+    // ...otherwise ...
+
+    int c = fe_vol_->N().extent_int(0);
+    int f = fe_vol_->N().extent_int(1);
+    // INITILAIZE JACOBIAN
+    riesz = scalar_view("riesz", c, f, f);
+    Kokkos::deep_copy(riesz,fe_vol_->stiffMat());
+    rst::add(riesz,fe_vol_->massMat());
+  }
+
+  void RieszMap_2(scalar_view & riesz) override {
+    // Optionally disable Riesz map ...
+    if (!useControlRiesz_) {
+      throw Exception::NotImplemented(">>> (PDE_Poisson::RieszMap_2): Not implemented.");
+    }
+
+    // ...otherwise ...
+
+    int c = fe_vol_->N().extent_int(0);
+    int f = fe_vol_->N().extent_int(1);
+    // INITILAIZE JACOBIAN
+    riesz = scalar_view("riesz", c, f, f);
+    Kokkos::deep_copy(riesz, fe_vol_->massMat());
+  }
+
+  std::vector<basis_ptr> getFields() override   {
+    return basisPtrs_;
+  }
+
+  void setCellNodes(const scalar_view &volCellNodes,
+                    const std::vector<std::vector<scalar_view>> &bdryCellNodes,
+                    const std::vector<std::vector<std::vector<int>>> &bdryCellLocIds) override {
+    volCellNodes_ = volCellNodes;
+    bdryCellNodes_ = bdryCellNodes;
+    bdryCellLocIds_ = bdryCellLocIds;
+    // Finite element definition.
+    fe_vol_ = ROL::makePtr<fe_type>(volCellNodes_,basisPtr_,cellCub_);
+    // Set local boundary DOFs.
+    fidx_ = fe_vol_->getBoundaryDofs();
+    // Compute Dirichlet values at DOFs.
+    int d = basisPtr_->getBaseCellTopology().getDimension();
+    int numSidesets = bdryCellLocIds_.size();
+    bdryCellDofValues_.resize(numSidesets);
+    for (int i=0; i<numSidesets; ++i) {
+      int numLocSides = bdryCellLocIds_[i].size();
+      bdryCellDofValues_[i].resize(numLocSides);
+      for (int j=0; j<numLocSides; ++j) {
+        int c = bdryCellLocIds_[i][j].size();
+        int f = basisPtr_->getCardinality();
+        bdryCellDofValues_[i][j] = scalar_view("bdryVals", c, f);
+        scalar_view coords("coords", c, f, d);
+        if (c > 0) {
+          fe_vol_->computeDofCoords(coords, bdryCellNodes_[i][j]);
+        }
+        for (int k=0; k<c; ++k) {
+          for (int l=0; l<f; ++l) {
+            std::vector<Real> dofpoint(d);
+            for (int m=0; m<d; ++m) {
+              dofpoint[m] = coords(k, l, m);
+            }
+            bdryCellDofValues_[i][j](k, l) = DirichletFunc(dofpoint, i, j);
+          }
+        }
+      }
+    }
+  }
+
+  const ROL::Ptr<fe_type> getFE(void) const {
+    return fe_vol_;
+  }
+
+}; // PDE_Poisson
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/published/CMakeLists.txt b/packages/rol/example/PDE-OPT/published/CMakeLists.txt
index 0956dfd9d439..8b5b6e1222c6 100644
--- a/packages/rol/example/PDE-OPT/published/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/published/CMakeLists.txt
@@ -1,3 +1,5 @@
 ADD_SUBDIRECTORY(IMAvolumes_KouriRidzal2017)
 ADD_SUBDIRECTORY(Helmholtz_KouriRidzalTuminaro2020)
 ADD_SUBDIRECTORY(TRSPG_Kouri2021)
+ADD_SUBDIRECTORY(NonsmoothTR_BaraldiKouri2022)
+ADD_SUBDIRECTORY(MPA_KouriSurowiec2019)
diff --git a/packages/rol/example/PDE-OPT/published/Helmholtz_KouriRidzalTuminaro2020/CMakeLists.txt b/packages/rol/example/PDE-OPT/published/Helmholtz_KouriRidzalTuminaro2020/CMakeLists.txt
index 47b2ae353229..7c4d20c3f018 100644
--- a/packages/rol/example/PDE-OPT/published/Helmholtz_KouriRidzalTuminaro2020/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/published/Helmholtz_KouriRidzalTuminaro2020/CMakeLists.txt
@@ -8,7 +8,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   # Need ROL_TpetraMultiVector.hpp
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_01
     SOURCES example_01.cpp
     ADD_DIR_TO_NAME
@@ -16,7 +16,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
 
   FILE(GLOB INP_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} inputs/*.xml)
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     HelmholtzKouriRidzalTuminaro2020DataCopy
     SOURCE_FILES
       input_ex01.xml
@@ -37,5 +37,44 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
   )
 
+ENDIF()
+
+IF(${PROJECT_NAME}_ENABLE_Intrepid2 AND
+   ${PROJECT_NAME}_ENABLE_Ifpack2   AND
+   ${PROJECT_NAME}_ENABLE_MueLu     AND
+   ${PROJECT_NAME}_ENABLE_Amesos2   AND
+   ${PROJECT_NAME}_ENABLE_Tpetra )
+
+  # Need ROL_TpetraMultiVector.hpp
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
+
+  ROL_ADD_EXECUTABLE(
+    example_01_Kokkos
+    SOURCES example_01K.cpp
+    ADD_DIR_TO_NAME
+  )
+
+  FILE(GLOB INP_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} inputs/*.xml)
+
+  ROL_COPY_FILES_TO_BINARY_DIR(
+    HelmholtzKouriRidzalTuminaro2020DataCopyK
+    SOURCE_FILES
+      input_ex01.xml
+      README.md
+      study.m
+      run.m
+      prdw.m
+      imshift.m
+      mesh/README.md
+      mesh/helmholtz-mesh-0.e
+      mesh/helmholtz-mesh-1.e
+      mesh/helmholtz-mesh-0.txt
+      mesh/helmholtz-mesh-1.txt
+      mesh/helmholtz-mesh.jou
+      results/README.md
+      ${INP_FILES}
+    SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}"
+    DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
+  )
 
 ENDIF()
diff --git a/packages/rol/example/PDE-OPT/published/Helmholtz_KouriRidzalTuminaro2020/example_01.cpp b/packages/rol/example/PDE-OPT/published/Helmholtz_KouriRidzalTuminaro2020/example_01.cpp
index 7ee452ee2494..7a442d0f5d3d 100644
--- a/packages/rol/example/PDE-OPT/published/Helmholtz_KouriRidzalTuminaro2020/example_01.cpp
+++ b/packages/rol/example/PDE-OPT/published/Helmholtz_KouriRidzalTuminaro2020/example_01.cpp
@@ -13,8 +13,8 @@
 
 #include "Teuchos_Comm.hpp"
 #include "Teuchos_Time.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
 #include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -37,7 +37,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv);
+  ROL::GlobalMPISession mpiSession (&argc, &argv);
   ROL::Ptr<const Teuchos::Comm<int>> comm
     = Tpetra::getDefaultComm();
 
diff --git a/packages/rol/example/PDE-OPT/published/Helmholtz_KouriRidzalTuminaro2020/example_01K.cpp b/packages/rol/example/PDE-OPT/published/Helmholtz_KouriRidzalTuminaro2020/example_01K.cpp
new file mode 100644
index 000000000000..3a67803501b2
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/published/Helmholtz_KouriRidzalTuminaro2020/example_01K.cpp
@@ -0,0 +1,127 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  example_01.cpp
+    \brief Shows how to solve the optimal control of Helmholtz problem.
+*/
+
+#include "Teuchos_Comm.hpp"
+#include "Teuchos_Time.hpp"
+#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
+
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Version.hpp"
+
+#include <iostream>
+#include <algorithm>
+
+#include "ROL_Stream.hpp"
+#include "ROL_Ptr.hpp"
+
+#include "../../TOOLS/meshmanagerK.hpp"
+#include "../../TOOLS/meshreaderK.hpp"
+#include "../../TOOLS/assemblerK.hpp"
+
+#include "pde_helmholtz_realK.hpp"
+#include "pde_helmholtz_imagK.hpp"
+#include "obj_helmholtzK.hpp"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+
+int main(int argc, char *argv[]) {
+  /*** Initialize communicator. ***/
+  ROL::GlobalMPISession mpiSession (&argc, &argv);
+  Kokkos::ScopeGuard kokkosScope (argc, argv);
+  auto comm = Tpetra::getDefaultComm();
+
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  const int numProcs = (comm->getSize() > 1) ? comm->getSize() : 0;
+  const int myRank = comm->getRank();
+  auto outStream = ROL::makeStreamPtr( std::cout, (argc > 1) && (myRank==0) );
+
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+    //RealT tol(1e-8);// one(1);
+
+    /*** Read in XML input ***/
+    std::string filename = "input_ex01.xml";
+    auto parlist = ROL::getParametersFromXmlFile(filename);
+    int example = parlist->sublist("Problem").get("Example",1);
+
+    /*** Initialize main data structure. ***/
+    ROL::Ptr<MeshManager<RealT,DeviceT>> meshMgr;
+    if (example==1)
+      meshMgr = ROL::makePtr<MeshReader<RealT,DeviceT>>(*parlist, numProcs);
+    else
+      meshMgr = ROL::makePtr<MeshManager_Rectangle<RealT,DeviceT>>(*parlist);
+    // Initialize PDE describing real components of the Helmholtz equation.
+    auto pde_real = ROL::makePtr<PDE_Helmholtz_Real<RealT,DeviceT>>(*parlist);
+    auto assembler_real = ROL::makePtr<Assembler<RealT,DeviceT>>(pde_real->getFields(),meshMgr,comm,*parlist,*outStream);
+    assembler_real->setCellNodes(*pde_real);
+    // Initialize PDE describing imaginary components of the Helmholtz equation.
+    auto pde_imag = ROL::makePtr<PDE_Helmholtz_Imag<RealT,DeviceT>>(*parlist);
+    auto assembler_imag = ROL::makePtr<Assembler<RealT,DeviceT>>(pde_imag->getFields(),meshMgr,comm,*parlist,*outStream);
+    assembler_imag->setCellNodes(*pde_imag);
+    // Initialize objective functions
+    auto qoi_state_real = ROL::makePtr<QoI_Helmholtz_StateTracking<RealT,DeviceT>>(pde_real->getFE(), *parlist, 0);
+    auto qoi_state_imag = ROL::makePtr<QoI_Helmholtz_StateTracking<RealT,DeviceT>>(pde_real->getFE(), *parlist, 1);
+    auto qoi_ctrl       = ROL::makePtr<QoI_Helmholtz_ControlPenalty<RealT,DeviceT>>(pde_real->getFE(), *parlist);
+
+    // Create state vector.
+    auto u_ptr = assembler_real->createStateVector();    u_ptr->putScalar(0.0);
+    auto z_ptr = assembler_real->createControlVector();  z_ptr->putScalar(0.0);
+    auto r_ptr = assembler_real->createResidualVector(); r_ptr->putScalar(0.0);
+
+    ROL::Ptr<Tpetra::CrsMatrix<>> A, B, L, M, C, R;
+    ROL::Ptr<Tpetra::MultiVector<>> wr, wi;
+    assembler_real->assemblePDEJacobian1(A,pde_real,u_ptr,z_ptr,ROL::nullPtr);
+    assembler_imag->assemblePDEJacobian1(B,pde_imag,u_ptr,z_ptr,ROL::nullPtr);
+    assembler_real->assemblePDEJacobian2(L,pde_real,u_ptr,z_ptr,ROL::nullPtr);
+    assembler_real->assemblePDERieszMap2(M,pde_real);
+    assembler_real->assembleQoIHessian11(C,qoi_state_real,u_ptr,z_ptr,ROL::nullPtr);
+    assembler_real->assembleQoIHessian22(R,qoi_ctrl,u_ptr,z_ptr,ROL::nullPtr);
+    assembler_real->assembleQoIGradient1(wr,qoi_state_real,u_ptr,z_ptr,ROL::nullPtr);
+    assembler_real->assembleQoIGradient1(wi,qoi_state_imag,u_ptr,z_ptr,ROL::nullPtr);
+    wr->scale(-1.0); wi->scale(-1.0);
+
+    Tpetra::MatrixMarket::Writer<Tpetra::CrsMatrix<>> matWriter;
+    matWriter.writeSparseFile("Amatrix.txt",A);
+    matWriter.writeSparseFile("Bmatrix.txt",B);
+    matWriter.writeSparseFile("Lmatrix.txt",L);
+    matWriter.writeSparseFile("Mmatrix.txt",M);
+    matWriter.writeSparseFile("Cmatrix.txt",C);
+    matWriter.writeSparseFile("Rmatrix.txt",R);
+
+    Tpetra::MatrixMarket::Writer<Tpetra::CrsMatrix<>> vecWriter;
+    vecWriter.writeDenseFile("WRvector.txt", wr);
+    vecWriter.writeDenseFile("WIvector.txt", wi);
+    std::string mapfile = "map.txt" + filename;
+    vecWriter.writeMapFile("map.txt", *wr->getMap());
+
+    assembler_real->printMeshData(*outStream);
+
+    // Get a summary from the time monitor.
+    Teuchos::TimeMonitor::summarize();
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/published/Helmholtz_KouriRidzalTuminaro2020/obj_helmholtzK.hpp b/packages/rol/example/PDE-OPT/published/Helmholtz_KouriRidzalTuminaro2020/obj_helmholtzK.hpp
new file mode 100644
index 000000000000..df20c52c5336
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/published/Helmholtz_KouriRidzalTuminaro2020/obj_helmholtzK.hpp
@@ -0,0 +1,394 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  obj.hpp
+    \brief Provides the interface for local (cell-based) objective function computations.
+*/
+
+#ifndef PDEOPT_QOI_HELMHOLTZK_HPP
+#define PDEOPT_QOI_HELMHOLTZK_HPP
+
+#include "../../TOOLS/qoiK.hpp"
+#include "pde_helmholtz_realK.hpp"
+
+template <class Real, class DeviceType>
+class QoI_Helmholtz_StateTracking : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type> fe_;
+
+  scalar_view weight_;
+  std::vector<scalar_view> target_;
+
+  Real RoiRadius_;
+  Real waveNumber_;
+  Real angle_;
+  int example_;
+
+  unsigned comp_;
+
+protected:
+  bool insideDomain(const std::vector<Real> &x) const {
+    bool val = true;
+    if (example_==1) {
+      const Real eps = std::sqrt(ROL::ROL_EPSILON<Real>());
+      Real xnorm(0);
+      for (const auto xi : x) xnorm += xi*xi;
+      xnorm = std::sqrt(xnorm);
+      val = (xnorm <= RoiRadius_+eps);
+    }
+    return val;
+  }
+
+  void computeDomainWeight(void) {
+    int c = fe_->gradN().extent_int(0);
+    int p = fe_->gradN().extent_int(2);
+    int d = fe_->gradN().extent_int(3);
+
+    weight_ = scalar_view("weight",c,p);
+   
+    const Real zero(0), one(1);
+    bool inside(false);
+    std::vector<Real> x(d);
+    for (int i = 0; i < c; ++i) {
+      inside = false;
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k)
+          x[k] = (fe_->cubPts())(i,j,k);
+        if ( insideDomain(x) ) {
+          inside = true;
+          break;
+        }
+      }
+      for (int j = 0; j < p; ++j)
+        weight_(i,j) = (inside ? one : zero);
+    }
+  }
+
+  Real evaluateTarget(const std::vector<Real> &x, const int component) const {
+    const Real arg = waveNumber_ * (std::cos(angle_)*x[0] + std::sin(angle_)*x[1]);
+    return (component==0) ? std::cos(arg) : std::sin(arg);
+  }
+
+  void computeTarget(void) {
+    int c = fe_->gradN().extent_int(0);
+    int p = fe_->gradN().extent_int(2);
+    int d = fe_->gradN().extent_int(3);
+
+    target_.clear(); target_.resize(2);
+    target_[0] = scalar_view("target",c,p);
+    target_[1] = scalar_view("target",c,p);
+   
+    std::vector<Real> x(d);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k)
+          x[k] = (fe_->cubPts())(i,j,k);
+        (target_[0])(i,j) = evaluateTarget(x,0);
+        (target_[1])(i,j) = evaluateTarget(x,1);
+      }
+    }
+  }
+
+  Real DegreesToRadians(const Real deg) const {
+    return deg * static_cast<Real>(M_PI) / static_cast<Real>(180);
+  }
+  
+public:
+  QoI_Helmholtz_StateTracking(const ROL::Ptr<fe_type> &fe,
+                              ROL::ParameterList &parlist,
+                              int comp = 0)
+    : fe_(fe), RoiRadius_(2), comp_(comp) {
+    waveNumber_ = parlist.sublist("Problem").get("Wave Number",10.0);
+    angle_      = parlist.sublist("Problem").get("Target Angle",45.0);
+    angle_      = DegreesToRadians(angle_);
+    example_    = parlist.sublist("Problem").get("Example",1);
+    computeDomainWeight();
+    computeTarget();
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = fe_->gradN().extent_int(0);
+    int p = fe_->gradN().extent_int(2);
+    // Initialize output val
+    val = scalar_view("val",c);
+    // Evaluate tracking term
+    scalar_view diffU("diffU", c, p);
+    scalar_view WdiffU("WdiffU", c, p);
+    fe_->evaluateValue(diffU, u_coeff);
+    rst::subtract(diffU,target_[comp_]);
+    fst::scalarMultiplyDataData(WdiffU,weight_,diffU);
+    fe_->computeIntegral(val,WdiffU,diffU,true);
+    rst::scale(val,static_cast<Real>(0.5));
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    int p = fe_->gradN().extent_int(2);
+    // Evaluate tracking term
+    scalar_view diffU("diffU", c, p);
+    scalar_view WdiffU("WdiffU", c, p);
+    fe_->evaluateValue(diffU, u_coeff);
+    rst::subtract(diffU,target_[comp_]);
+    fst::scalarMultiplyDataData(diffU,weight_,diffU);
+    grad = scalar_view("grad", c, f);
+    fst::integrate(grad,WdiffU,fe_->NdetJ(),false);
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Helmholtz_StateTracking::gradient_2 is zero.");
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    int p = fe_->gradN().extent_int(2);
+    // Evaluate tracking term
+    scalar_view valV("valV", c, p);
+    scalar_view WvalV("WvalV", c, p);
+    fe_->evaluateValue(valV, v_coeff);
+    fst::scalarMultiplyDataData(WvalV,weight_,valV);
+    hess = scalar_view("hess", c, f);
+    fst::integrate(hess,WvalV,fe_->NdetJ(),false);
+  }
+
+  void Hessian_11(scalar_view & hess,
+            const scalar_view u_coeff,
+            const scalar_view z_coeff = scalar_view(),
+            const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    int p = fe_->gradN().extent_int(2);
+    // Build force/control term
+    scalar_view F("F", c, f, p);
+    fst::scalarMultiplyDataField(F, weight_, fe_->N());
+    hess = scalar_view("hess", c, f, f);
+    fst::integrate(hess,F,fe_->NdetJ(),false);
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Helmholtz::HessVec_12 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Helmholtz::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Helmholtz::HessVec_22 is zero.");
+  }
+
+}; // QoI_Helmholtz
+
+
+template <class Real, class DeviceType>
+class QoI_Helmholtz_ControlPenalty : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type> fe_;
+
+  Real innerAnnulusRadius_;
+  Real outerAnnulusRadius_;
+  int example_;
+
+  scalar_view weight_;
+
+  bool insideDomain(const std::vector<Real> &x) const {
+    bool val = true;
+    if (example_==1) {
+      const Real eps = std::sqrt(ROL::ROL_EPSILON<Real>());
+      Real xnorm(0);
+      for (const auto xi : x) xnorm += xi*xi;
+      xnorm = std::sqrt(xnorm);
+      val = (xnorm <= outerAnnulusRadius_+eps && xnorm >= innerAnnulusRadius_-eps);
+    }
+    return val;
+  }
+
+  void computeDomainWeight(void) {
+    int c = fe_->gradN().extent_int(0);
+    int p = fe_->gradN().extent_int(2);
+    int d = fe_->gradN().extent_int(3);
+
+    weight_ = scalar_view("weigt",c,p);
+   
+    const Real one(1), zero(0);
+    bool inside(false);
+    std::vector<Real> x(d);
+    for (int i = 0; i < c; ++i) {
+      inside = false;
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k)
+          x[k] = (fe_->cubPts())(i,j,k);
+        if ( insideDomain(x) ) {
+          inside = true;
+          break;
+        }
+      }
+      for (int j = 0; j < p; ++j)
+        weight_(i,j) = (inside ? one : zero);
+    }
+  }
+
+public:
+  QoI_Helmholtz_ControlPenalty(const ROL::Ptr<fe_type> &fe,
+                               ROL::ParameterList &parlist)
+  : fe_(fe), innerAnnulusRadius_(2.5), outerAnnulusRadius_(2.6) {
+    example_ = parlist.sublist("Problem").get("Example",1);
+    computeDomainWeight();
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = fe_->gradN().extent_int(0);
+    int p = fe_->gradN().extent_int(2);
+    // Initialize output val
+    val = scalar_view("val", c);
+    // Evaluate control penalty
+    scalar_view valZ("valZ", c, p);
+    scalar_view WvalZ("WvalZ", c, p);
+    fe_->evaluateValue(valZ, z_coeff);
+    fst::scalarMultiplyDataData(WvalZ,weight_,valZ);
+    fe_->computeIntegral(val,WvalZ,valZ,true);
+    rst::scale(val,static_cast<Real>(0.5));
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Helmholtz_ControlCost::gradient_1 is zero.");
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    int p = fe_->gradN().extent_int(2);
+    // Evaluate control penalty
+    scalar_view valZ("valZ", c, p);
+    scalar_view WvalZ("WvalZ", c, p);
+    fe_->evaluateValue(valZ, z_coeff);
+    fst::scalarMultiplyDataData(WvalZ,weight_,valZ);
+    grad = scalar_view("grad", c, f);
+    fst::integrate(grad,WvalZ,fe_->NdetJ(),false);
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Helmholtz_ControlCost::HessVec_11 is zero.");
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Helmholtz_ControlCost::HessVec_12 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Helmholtz_ControlCost::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    int p = fe_->gradN().extent_int(2);
+    // Evaluate control penalty
+    scalar_view valV("valV", c, p);
+    scalar_view WvalV("WvalV", c, p);
+    fe_->evaluateValue(valV, v_coeff);
+    fst::scalarMultiplyDataData(WvalV,weight_,valV);
+    hess = scalar_view("hess", c, f);
+    fst::integrate(hess,WvalV,fe_->NdetJ(),false);
+  }
+
+  void Hessian_22(scalar_view & hess,
+            const scalar_view u_coeff,
+            const scalar_view z_coeff = scalar_view(),
+            const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    int p = fe_->gradN().extent_int(2);
+    // Build force/control term
+    scalar_view F("F", c, f, p);
+    fst::scalarMultiplyDataField(F, weight_, fe_->N());
+    hess = scalar_view("hess", c, f, f);
+    fst::integrate(hess,F,fe_->NdetJ(),false);
+  }
+
+}; // QoI_Helmholtz_ControlPenalty
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/published/Helmholtz_KouriRidzalTuminaro2020/pde_helmholtz_imagK.hpp b/packages/rol/example/PDE-OPT/published/Helmholtz_KouriRidzalTuminaro2020/pde_helmholtz_imagK.hpp
new file mode 100644
index 000000000000..7f18a68cf6e1
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/published/Helmholtz_KouriRidzalTuminaro2020/pde_helmholtz_imagK.hpp
@@ -0,0 +1,264 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  pde_helmholtz.hpp
+    \brief Implements the local PDE interface for the optimal control of
+           Helmholtz.
+*/
+
+#ifndef PDE_HELMHOLTZ_IMAGK_HPP
+#define PDE_HELMHOLTZ_IMAGK_HPP
+
+#include "../../TOOLS/pdeK.hpp"
+#include "../../TOOLS/feK.hpp"
+#include "../../TOOLS/fieldhelperK.hpp"
+
+#include "Intrepid2_HGRAD_QUAD_C1_FEM.hpp"
+#include "Intrepid2_DefaultCubatureFactory.hpp"
+#include "Intrepid2_FunctionSpaceTools.hpp"
+#include "Intrepid2_CellTools.hpp"
+
+#include "ROL_Ptr.hpp"
+
+template <class Real, class DeviceType>
+class PDE_Helmholtz_Imag : public PDE<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  // Finite element basis information
+  basis_ptr basisPtr_;
+  std::vector<basis_ptr> basisPtrs_;
+  // Cell cubature information
+  ROL::Ptr<Intrepid2::Cubature<DeviceType,Real,Real>> cellCub_, bdryCub_;
+  // Cell node information
+  scalar_view volCellNodes_;
+  std::vector<std::vector<scalar_view>> bdryCellNodes_;
+  std::vector<std::vector<std::vector<int>>> bdryCellLocIds_;
+  // Finite element definition
+  ROL::Ptr<fe_type> fe_;
+  std::vector<ROL::Ptr<fe_type>> feBdry_;
+  // Local degrees of freedom on boundary, for each side of the reference cell (first index).
+  std::vector<std::vector<int>> fidx_;
+  // Coordinates of degrees freedom on boundary cells.
+  // Indexing:  [sideset number][local side id](cell number, value at dof)
+  std::vector<std::vector<scalar_view>> bdryCellDofValues_;
+
+  Real waveNumber_;
+  Real dampFactor_;
+  Real impFactor_;
+  
+  scalar_view getBoundaryCoeff(const scalar_view cell_coeff, int sideSet, int cell) const {
+    std::vector<int> bdryCellLocId = bdryCellLocIds_[sideSet][cell];
+    const int numCellsSide = bdryCellLocId.size();
+    const int f = basisPtr_->getCardinality();
+    
+    scalar_view bdry_coeff("bdry_coeff", numCellsSide, f);
+    for (int i = 0; i < numCellsSide; ++i) {
+      for (int j = 0; j < f; ++j)
+        bdry_coeff(i, j) = cell_coeff(bdryCellLocId[i], j);
+    }
+    return bdry_coeff;
+  }
+
+public:
+  PDE_Helmholtz_Imag(ROL::ParameterList &parlist)
+    : waveNumber_(parlist.sublist("Problem").get("Wave Number",10.0)),
+      dampFactor_(parlist.sublist("Problem").get("Damping Factor", 1.0)),
+      impFactor_ (parlist.sublist("Problem").get("Impedance Factor", 1.0)) {
+    // Finite element fields.
+    basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C1_FEM<DeviceType,Real,Real>>();
+    basisPtrs_.clear(); basisPtrs_.push_back(basisPtr_);
+    // Quadrature rules.
+    shards::CellTopology cellType = basisPtr_->getBaseCellTopology();        // get the cell type from the basis
+    Intrepid2::DefaultCubatureFactory cubFactory;                            // create cubature factory
+    int cubDegree = parlist.sublist("Problem").get("Cubature Degree", 2);    // set cubature degree, e.g., 2
+    cellCub_ = cubFactory.create<DeviceType,Real,Real>(cellType, cubDegree); // create default cubature
+    int d = cellType.getDimension();
+    shards::CellTopology bdryCellType = cellType.getCellTopologyData(d-1, 0);
+    int bdryCubDegree = parlist.sublist("Problem").get("Boundary Cubature Degree",2); // set cubature degree, e.g., 2
+    bdryCub_ = cubFactory.create<DeviceType,Real,Real>(bdryCellType, bdryCubDegree);
+  }
+
+  void residual(scalar_view & res,
+                const scalar_view u_coeff,
+                const scalar_view z_coeff = scalar_view(),
+                const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    int p = fe_->gradN().extent_int(2);
+    // Initialize residuals.
+    res = scalar_view("res",c,f);
+    // Evaluate/interpolate finite element fields on cells.
+    scalar_view valu_eval("valu_eval", c, p);
+    fe_->evaluateValue(valu_eval, u_coeff);
+    // Integrate PDE term
+    fst::integrate(res,valu_eval,fe_->NdetJ(),false);
+    rst::scale(res, waveNumber_*dampFactor_);
+    // APPLY ROBIN CONTROLS: Sideset 0
+    int sideset = 0;
+    int numLocalSideIds = bdryCellLocIds_[sideset].size();
+    const int numCubPerSide = bdryCub_->getNumPoints();
+    for (int j = 0; j < numLocalSideIds; ++j) {
+      int numCellsSide = bdryCellLocIds_[sideset][j].size();
+      if (numCellsSide) {
+        scalar_view robinRes("robinRes", numCellsSide, f);
+        // Get U coefficients on Robin boundary
+        scalar_view u_coeff_bdry = getBoundaryCoeff(u_coeff, sideset, j);
+        // Evaluate U on FE basis
+        scalar_view valu_eval_bdry("valu_eval_bdry", numCellsSide, numCubPerSide);
+        feBdry_[j]->evaluateValue(valu_eval_bdry, u_coeff_bdry);
+        // Compute Robin residual
+        fst::integrate(robinRes,valu_eval_bdry,feBdry_[j]->NdetJ(),false);
+        // Add Robin residual to volume residual
+        for (int k = 0; k < numCellsSide; ++k) {
+          int cidx = bdryCellLocIds_[sideset][j][k];
+          for (int l = 0; l < f; ++l)
+            res(cidx,l) += waveNumber_*impFactor_*robinRes(k,l);
+        }
+      }
+    }
+  }
+
+  void Jacobian_1(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    // Initialize Jacobian.
+    jac = scalar_view("jac",c,f,f);
+    // Add PDE term
+    Kokkos::deep_copy(jac,fe_->massMat());
+    rst::scale(jac, dampFactor_*waveNumber_);
+    // APPLY ROBIN CONTROL: Sideset 0
+    int sideset = 0;
+    int numLocalSideIds = bdryCellLocIds_[sideset].size();
+    for (int j = 0; j < numLocalSideIds; ++j) {
+      int numCellsSide = bdryCellLocIds_[sideset][j].size();
+      if (numCellsSide) {
+        // Add Neumann control Jacobian to volume residual
+        for (int k = 0; k < numCellsSide; ++k) {
+          int cidx = bdryCellLocIds_[sideset][j][k];
+          for (int l = 0; l < f; ++l) { 
+            for (int m = 0; m < f; ++m) { 
+              jac(cidx,l,m) += impFactor_*waveNumber_*(feBdry_[j]->massMat())(k,l,m);
+            }
+          }
+        }
+      }
+    }
+  }
+
+
+  void Jacobian_2(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Helmholtz::Jacobian_2): Jacobian is zero.");
+  }
+
+  void Hessian_11(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Helmholtz::Hessian_11): Hessian is zero.");
+  }
+
+  void Hessian_12(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Helmholtz::Hessian_12): Hessian is zero.");
+  }
+
+  void Hessian_21(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Helmholtz::Hessian_21): Hessian is zero.");
+  }
+
+  void Hessian_22(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Helmholtz::Hessian_22): Hessian is zero.");
+  }
+
+  void RieszMap_1(scalar_view & riesz) override {
+    // Retrieve dimensions.
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    // Initialize Jacobian.
+    riesz = scalar_view("riesz1",c,f,f);
+    Kokkos::deep_copy(riesz,fe_->stiffMat());
+    rst::add(riesz, fe_->massMat());
+  }
+
+  void RieszMap_2(scalar_view & riesz) override {
+    // Retrieve dimensions.
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    // Initialize Jacobian.
+    riesz = scalar_view("riesz2",c,f,f);
+    Kokkos::deep_copy(riesz,fe_->massMat());
+  }
+
+  std::vector<basis_ptr> getFields() override {
+    return basisPtrs_;
+  }
+
+  void setCellNodes(const scalar_view &volCellNodes,
+                    const std::vector<std::vector<scalar_view>> &bdryCellNodes,
+                    const std::vector<std::vector<std::vector<int>>> &bdryCellLocIds) override {
+    volCellNodes_ = volCellNodes;
+    bdryCellNodes_ = bdryCellNodes;
+    bdryCellLocIds_ = bdryCellLocIds;
+    // Finite element definition.
+    fe_ = ROL::makePtr<fe_type>(volCellNodes_,basisPtr_,cellCub_);
+    fidx_ = fe_->getBoundaryDofs();
+    // Construct boundary FE
+    int sideset = 0;
+    int numLocSides = bdryCellNodes[sideset].size();
+    feBdry_.resize(numLocSides);
+    for (int j = 0; j < numLocSides; ++j) {
+      if (bdryCellNodes[sideset][j] != scalar_view()) {
+        feBdry_[j] = ROL::makePtr<fe_type>(bdryCellNodes[sideset][j],basisPtr_,bdryCub_,j);
+      }
+    }
+  }
+
+  const ROL::Ptr<fe_type> getFE(void) const {
+    return fe_;
+  }
+
+  const std::vector<ROL::Ptr<fe_type>> getBdryFE(void) const {
+    return feBdry_;
+  }
+
+  const std::vector<std::vector<int>> getBdryCellLocIds(const int sideset = 0) const {
+    return bdryCellLocIds_[sideset];
+  }
+
+}; // PDE_Helmholtz
+
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/published/Helmholtz_KouriRidzalTuminaro2020/pde_helmholtz_realK.hpp b/packages/rol/example/PDE-OPT/published/Helmholtz_KouriRidzalTuminaro2020/pde_helmholtz_realK.hpp
new file mode 100644
index 000000000000..fd968e4a6f7e
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/published/Helmholtz_KouriRidzalTuminaro2020/pde_helmholtz_realK.hpp
@@ -0,0 +1,265 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  pde_helmholtz.hpp
+    \brief Implements the local PDE interface for the optimal control of
+           Helmholtz.
+*/
+
+#ifndef PDE_HELMHOLTZ_REALK_HPP
+#define PDE_HELMHOLTZ_REALK_HPP
+
+#include "../../TOOLS/pdeK.hpp"
+#include "../../TOOLS/feK.hpp"
+#include "../../TOOLS/fieldhelperK.hpp"
+
+#include "Intrepid2_HGRAD_QUAD_C1_FEM.hpp"
+#include "Intrepid2_DefaultCubatureFactory.hpp"
+#include "Intrepid2_FunctionSpaceTools.hpp"
+#include "Intrepid2_CellTools.hpp"
+
+#include "ROL_Ptr.hpp"
+
+template <class Real, class DeviceType>
+class PDE_Helmholtz_Real : public PDE<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  // Finite element basis information
+  basis_ptr basisPtr_;
+  std::vector<basis_ptr> basisPtrs_;
+  // Cell cubature information
+  ROL::Ptr<Intrepid2::Cubature<DeviceType,Real,Real>> cellCub_;
+  // Cell node information
+  scalar_view volCellNodes_;
+  std::vector<std::vector<scalar_view>> bdryCellNodes_;
+  std::vector<std::vector<std::vector<int>>> bdryCellLocIds_;
+  // Finite element definition
+  ROL::Ptr<fe_type> fe_;
+
+  Real innerAnnulusRadius_;
+  Real outerAnnulusRadius_;
+  int example_;
+  Real waveNumber_;
+
+  scalar_view ctrlWeight_;
+  scalar_view ctrlJac_;
+
+  bool insideControlDomain(const std::vector<Real> &x) const {
+    bool val = true;
+    if (example_==1) {
+      const Real eps = std::sqrt(ROL::ROL_EPSILON<Real>());
+      Real xnorm(0);
+      const int d = x.size();
+      for (int i = 0; i < d; ++i) {
+        xnorm += x[i]*x[i];
+      }
+      xnorm = std::sqrt(xnorm);
+      val = (xnorm <= outerAnnulusRadius_+eps && xnorm >= innerAnnulusRadius_-eps);
+    }
+    return val;
+  }
+
+  void computeControlWeight(void) {
+    int c = fe_->gradN().extent_int(0);
+    int p = fe_->gradN().extent_int(2);
+    int d = fe_->gradN().extent_int(3);
+   
+    ctrlWeight_ = scalar_view("ctrlWeight_", c, p);
+
+    const Real zero(0), one(1);
+    bool inside(false);
+    std::vector<Real> x(d);
+    for (int i = 0; i < c; ++i) {
+      inside = false;
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k)
+          x[k] = (fe_->cubPts())(i,j,k);
+        if ( insideControlDomain(x) ) {
+          inside = true;
+          break;
+        }
+      }
+      for (int j = 0; j < p; ++j)
+        ctrlWeight_(i,j) = (inside ? one : zero);
+    }
+  }
+
+  void buildControlJacobian(void) {
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    int p = fe_->gradN().extent_int(2);
+
+    // Build force/control term
+    scalar_view F("F", c, f, p);
+    fst::scalarMultiplyDataField(F, ctrlWeight_, fe_->N());
+    ctrlJac_ = scalar_view("ctrlJac", c, f, f);
+    fst::integrate(ctrlJac_,F,fe_->NdetJ(),false);
+    rst::scale(ctrlJac_,static_cast<Real>(-1));
+  }
+
+public:
+  PDE_Helmholtz_Real(ROL::ParameterList &parlist)
+    : innerAnnulusRadius_(2.5), outerAnnulusRadius_(2.6),
+      example_(parlist.sublist("Problem").get("Example",1)),
+      waveNumber_(parlist.sublist("Problem").get("Wave Number",10.0)) {
+    // Finite element fields.
+    basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C1_FEM<DeviceType,Real,Real>>();
+    basisPtrs_.clear(); basisPtrs_.push_back(basisPtr_);
+    // Quadrature rules.
+    shards::CellTopology cellType = basisPtr_->getBaseCellTopology();        // get the cell type from the basis
+    Intrepid2::DefaultCubatureFactory cubFactory;                            // create cubature factory
+    int cubDegree = parlist.sublist("Problem").get("Cubature Degree", 2);    // set cubature degree, e.g., 2
+    cellCub_ = cubFactory.create<DeviceType,Real,Real>(cellType, cubDegree); // create default cubature
+  }
+
+  void residual(scalar_view & res,
+                const scalar_view u_coeff,
+                const scalar_view z_coeff = scalar_view(),
+                const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    const Real one(1), two(2);
+    // Retrieve dimensions.
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    int p = fe_->gradN().extent_int(2);
+    int d = fe_->gradN().extent_int(3);
+    // Initialize residuals.
+    res = scalar_view("res",c,f);
+    // Evaluate/interpolate finite element fields on cells.
+    scalar_view valz_eval("valz_eval", c, p);
+    scalar_view valu_eval("valu_eval", c, p);
+    scalar_view gradu_eval("gradu_eval", c, p, d);
+    fe_->evaluateValue(valz_eval, z_coeff);
+    fe_->evaluateValue(valu_eval, u_coeff);
+    fe_->evaluateGradient(gradu_eval, u_coeff);
+    // Integrate PDE term
+    fst::integrate(res,valu_eval,fe_->NdetJ(),false);
+    rst::scale(res, -std::pow(waveNumber_,two));
+    fst::integrate(res,gradu_eval,fe_->gradNdetJ(),true);
+    // Build control term
+    scalar_view F("F", c, p);
+    fst::scalarMultiplyDataData(F, ctrlWeight_, valz_eval);
+    rst::scale(F,-one);
+    // Volumetric intregration
+    fst::integrate(res,F,fe_->NdetJ(),true);
+  }
+
+  void Jacobian_1(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    const Real two(2);
+    // Retrieve dimensions.
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    // Initialize Jacobian.
+    jac = scalar_view("jac",c,f,f);
+    // Add PDE terms
+    Kokkos::deep_copy(jac,fe_->massMat());
+    rst::scale(jac, -std::pow(waveNumber_,two));
+    rst::add(jac, fe_->stiffMat());
+  }
+
+
+  void Jacobian_2(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    // Initialize Jacobian.
+    jac = scalar_view("jac",c,f,f);
+    // Add control term
+    Kokkos::deep_copy(jac,ctrlJac_);
+    rst::scale(jac, static_cast<Real>(-1));
+  }
+
+  void Hessian_11(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Helmholtz::Hessian_11): Hessian is zero.");
+  }
+
+  void Hessian_12(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Helmholtz::Hessian_12): Hessian is zero.");
+  }
+
+  void Hessian_21(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Helmholtz::Hessian_21): Hessian is zero.");
+  }
+
+  void Hessian_22(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Helmholtz::Hessian_22): Hessian is zero.");
+  }
+
+  void RieszMap_1(scalar_view & riesz) override {
+    // Retrieve dimensions.
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    // Initialize Jacobian.
+    riesz = scalar_view("riesz1",c,f,f);
+    Kokkos::deep_copy(riesz,fe_->stiffMat());
+    rst::add(riesz, fe_->massMat());
+  }
+
+  void RieszMap_2(scalar_view & riesz) override {
+    // Retrieve dimensions.
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    // Initialize Jacobian.
+    riesz = scalar_view("riesz2",c,f,f);
+    Kokkos::deep_copy(riesz,fe_->massMat());
+  }
+
+  std::vector<basis_ptr> getFields() override {
+    return basisPtrs_;
+  }
+
+  void setCellNodes(const scalar_view &volCellNodes,
+                    const std::vector<std::vector<scalar_view>> &bdryCellNodes,
+                    const std::vector<std::vector<std::vector<int>>> &bdryCellLocIds) override {
+    volCellNodes_ = volCellNodes;
+    bdryCellNodes_ = bdryCellNodes;
+    bdryCellLocIds_ = bdryCellLocIds;
+    // Finite element definition.
+    fe_ = ROL::makePtr<fe_type>(volCellNodes_,basisPtr_,cellCub_);
+    // Compute control weight
+    computeControlWeight();
+    buildControlJacobian();
+  }
+
+  const ROL::Ptr<fe_type> getFE(void) const {
+    return fe_;
+  }
+
+}; // PDE_Helmholtz
+
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/published/IMAvolumes_KouriRidzal2017/CMakeLists.txt b/packages/rol/example/PDE-OPT/published/IMAvolumes_KouriRidzal2017/CMakeLists.txt
index 29cf81f91827..d0790d8c49a9 100644
--- a/packages/rol/example/PDE-OPT/published/IMAvolumes_KouriRidzal2017/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/published/IMAvolumes_KouriRidzal2017/CMakeLists.txt
@@ -15,19 +15,19 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/trikota/src/sol)
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/trikota/src/sol/sparse_grids)
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_RS
     SOURCES example_RS.cpp
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_FS
     SOURCES example_FS.cpp
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     IMAvolumesKouriRidzal2017DataCopy
     SOURCE_FILES
       input_RS.xml input_FS.xml
@@ -35,5 +35,42 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
   )
 
+ENDIF()
+
+IF(${PROJECT_NAME}_ENABLE_Intrepid2 AND
+   ${PROJECT_NAME}_ENABLE_Ifpack2   AND
+   ${PROJECT_NAME}_ENABLE_MueLu     AND
+   ${PROJECT_NAME}_ENABLE_Amesos2   AND
+   ${PROJECT_NAME}_ENABLE_TriKota   AND
+   ${PROJECT_NAME}_ENABLE_Tpetra )
+
+
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/teuchos/src/sol)
+
+  # Need ROL_TpetraMultiVector.hpp
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/trikota/src/sol)
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/trikota/src/sol/sparse_grids)
+
+  ROL_ADD_EXECUTABLE(
+    example_RS_Kokkos
+    SOURCES example_RSK.cpp
+    ADD_DIR_TO_NAME
+  )
+
+  ROL_ADD_EXECUTABLE(
+    example_FS_Kokkos
+    SOURCES example_FSK.cpp
+    ADD_DIR_TO_NAME
+  )
+
+  ROL_COPY_FILES_TO_BINARY_DIR(
+    IMAvolumesKouriRidzal2017DataCopyK
+    SOURCE_FILES
+      input_RS.xml input_FS.xml
+    SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}"
+    DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
+  )
 
 ENDIF()
diff --git a/packages/rol/example/PDE-OPT/published/IMAvolumes_KouriRidzal2017/example_FS.cpp b/packages/rol/example/PDE-OPT/published/IMAvolumes_KouriRidzal2017/example_FS.cpp
index 5b057c48ff93..cc2de9a213c2 100644
--- a/packages/rol/example/PDE-OPT/published/IMAvolumes_KouriRidzal2017/example_FS.cpp
+++ b/packages/rol/example/PDE-OPT/published/IMAvolumes_KouriRidzal2017/example_FS.cpp
@@ -14,8 +14,8 @@
 
 #include "Teuchos_Comm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
+#include "ROL_XMLReader.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -121,7 +121,7 @@ int main(int argc, char *argv[]) {
   Teuchos::updateParametersFromXmlFile( filename, parlist.ptr() );
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<Teuchos::Comm<int> > comm_linalg, comm_sample;
 #ifdef HAVE_MPI
   int nLinAlg = parlist->sublist("Solver").get("Number of Cores", 4);
diff --git a/packages/rol/example/PDE-OPT/published/IMAvolumes_KouriRidzal2017/example_FSK.cpp b/packages/rol/example/PDE-OPT/published/IMAvolumes_KouriRidzal2017/example_FSK.cpp
new file mode 100644
index 000000000000..8439441e84eb
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/published/IMAvolumes_KouriRidzal2017/example_FSK.cpp
@@ -0,0 +1,356 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  example_FS.cpp
+    \brief Full-space solution of a thermal-fluids problem with random coefficients,
+           using a risk-neutral formulation and Monte Carlo sampling.
+*/
+
+#include "Teuchos_Comm.hpp"
+#include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
+#include "ROL_XMLReader.hpp"
+
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Version.hpp"
+
+#include <iostream>
+#include <algorithm>
+
+#include "ROL_TpetraMultiVector.hpp"
+#include "ROL_Algorithm.hpp"
+#include "ROL_ConstraintStatusTest.hpp"
+#include "ROL_CompositeStep.hpp"
+#include "ROL_Reduced_Objective_SimOpt.hpp"
+#include "ROL_MonteCarloGenerator.hpp"
+#include "ROL_SparseGridGenerator.hpp"
+#include "ROL_SimulatedConstraint.hpp"
+#include "ROL_SimulatedObjectiveCVaR.hpp"
+#include "ROL_SimulatedObjective.hpp"
+#include "ROL_TpetraTeuchosBatchManager.hpp"
+
+#include "../../TOOLS/pdeconstraintK.hpp"
+#include "../../TOOLS/pdeobjectiveK.hpp"
+#include "../../TOOLS/pdevectorK.hpp"
+#include "pde_thermal-fluidsK.hpp"
+#include "obj_thermal-fluidsK.hpp"
+#include "mesh_thermal-fluidsK.hpp"
+#include "split_comm_world.hpp"
+
+template<class Real>
+void print(ROL::Objective<Real> &obj,
+           const ROL::Vector<Real> &z,
+           ROL::SampleGenerator<Real> &sampler,
+           const int ngsamp,
+           const ROL::Ptr<const Teuchos::Comm<int> > &comm,
+           const std::string &filename) {
+  Real tol(1e-8);
+  // Build objective function distribution
+  int nsamp = sampler.numMySamples();
+  std::vector<Real> myvalues(nsamp), myzerovec(nsamp, 0);
+  std::vector<double> gvalues(ngsamp), gzerovec(ngsamp, 0);
+  std::vector<Real> sample = sampler.getMyPoint(0);
+  int sdim = sample.size();
+  std::vector<std::vector<Real> > mysamples(sdim, myzerovec);
+  std::vector<std::vector<double> > gsamples(sdim, gzerovec);
+  for (int i = 0; i < nsamp; ++i) {
+    sample = sampler.getMyPoint(i);
+    obj.setParameter(sample);
+    myvalues[i] = static_cast<double>(obj.value(z,tol));
+    for (int j = 0; j < sdim; ++j)
+      mysamples[j][i] = static_cast<double>(sample[j]);
+  }
+
+  // Send data to root processor
+#ifdef HAVE_MPI
+  auto mpicomm = ROL::dynamicPtrCast<const Teuchos::MpiComm<int> >(comm);
+  int nproc = Teuchos::size<int>(*mpicomm);
+  std::vector<int> sampleCounts(nproc, 0), sampleDispls(nproc, 0);
+  MPI_Gather(&nsamp,1,MPI_INT,&sampleCounts[0],1,MPI_INT,0,*(mpicomm->getRawMpiComm())());
+  for (int i = 1; i < nproc; ++i)
+    sampleDispls[i] = sampleDispls[i-1] + sampleCounts[i-1];
+  MPI_Gatherv(&myvalues[0],nsamp,MPI_DOUBLE,&gvalues[0],&sampleCounts[0],&sampleDispls[0],MPI_DOUBLE,0,*(mpicomm->getRawMpiComm())());
+  for (int j = 0; j < sdim; ++j)
+    MPI_Gatherv(&mysamples[j][0],nsamp,MPI_DOUBLE,&gsamples[j][0],&sampleCounts[0],&sampleDispls[0],MPI_DOUBLE,0,*(mpicomm->getRawMpiComm())());
+#else
+  gvalues.assign(myvalues.begin(),myvalues.end());
+  for (int j = 0; j < sdim; ++j)
+    gsamples[j].assign(mysamples[j].begin(),mysamples[j].end());
+#endif
+
+  // Print
+  int rank  = Teuchos::rank<int>(*comm);
+  if ( rank==0 ) {
+    std::ofstream file;
+    file.open(filename);
+    file << std::scientific << std::setprecision(15);
+    for (int i = 0; i < ngsamp; ++i) {
+      for (int j = 0; j < sdim; ++j)
+        file << std::setw(25) << std::left << gsamples[j][i];
+      file << std::setw(25) << std::left << gvalues[i] << std::endl;
+    }
+    file.close();
+  }
+}
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+
+int main(int argc, char *argv[]) {
+
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+
+  /*** Read in XML input ***/
+  std::string filename = "input_FS.xml";
+  auto parlist = ROL::getParametersFromXmlFile(filename);
+
+  /*** Initialize communicator. ***/
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  Kokkos::ScopeGuard kokkosScope (argc, argv);
+  ROL::Ptr<Teuchos::Comm<int> > comm_linalg, comm_sample;
+#ifdef HAVE_MPI
+  int nLinAlg = parlist->sublist("Solver").get("Number of Cores", 4);
+  split_comm_world(comm_linalg, comm_sample, nLinAlg);
+#else
+  comm_sample = Tpetra::getDefaultComm();
+  comm_linalg = ROL::makePtr<Teuchos::SerialComm<int>>();
+#endif
+  const int myRankLinAlg = comm_linalg->getRank();
+  const int myRankSample = comm_sample->getRank();
+  if ((iprint > 0) && (myRankLinAlg == 0) && (myRankSample == 0))
+    outStream = ROL::makePtrFromRef(std::cout);
+  else
+    outStream = ROL::makePtrFromRef(bhs);
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+
+    parlist->sublist("SimOpt").sublist("Solve").set("Output Iteration History",((myRankLinAlg == 0) && (myRankSample == 0)));
+
+    /*** Initialize main data structure. ***/
+    auto meshMgr = ROL::makePtr<MeshManager_ThermalFluids<RealT,DeviceT>>(*parlist);
+    // Initialize PDE describing Navier-Stokes equations.
+    auto pde = ROL::makePtr<PDE_ThermalFluids_ex03<RealT,DeviceT>>(*parlist);
+    auto con = ROL::makePtr<PDE_Constraint<RealT,DeviceT>>(pde,meshMgr,comm_linalg,*parlist,*outStream);
+    // Cast the constraint and get the assembler.
+    auto assembler = pdecon->getAssembler();
+    con->setSolveParameters(*parlist);
+    con->outputTpetraData();
+
+    // Create state vector and set to zeroes
+    auto u_ptr  = assembler->createStateVector();    u_ptr->randomize();
+    auto p_ptr  = assembler->createStateVector();    p_ptr->randomize();
+    auto y_ptr  = assembler->createStateVector();    y_ptr->randomize();
+    auto r_ptr  = assembler->createResidualVector(); r_ptr->randomize();
+    auto z_ptr  = assembler->createControlVector();  z_ptr->putScalar(1.234); //z_ptr->randomize();
+    auto s_ptr  = assembler->createControlVector();  s_ptr->putScalar(2.345); //s_ptr->randomize();
+    auto t_ptr  = assembler->createControlVector();  t_ptr->putScalar(3.456); //t_ptr->randomize();
+    auto up  = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(u_ptr,pde,assembler);
+    auto pp  = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(p_ptr,pde,assembler);
+    auto yp  = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(y_ptr,pde,assembler);
+    auto rp  = ROL::makePtr<PDE_DualSimVector<RealT,DeviceT>>(r_ptr,pde,assembler);
+    auto zp  = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(z_ptr,pde,assembler);
+    auto sp  = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(s_ptr,pde,assembler);
+    auto tp  = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(t_ptr,pde,assembler);
+
+    // Initialize objective function.
+    std::vector<ROL::Ptr<QoI<RealT,DeviceT>>> qoi_vec(2,ROL::nullPtr);
+    qoi_vec[0] = ROL::makePtr<QoI_State_ThermalFluids<RealT,DeviceT>>(*parlist,
+                                                                 pde->getVelocityFE(),
+                                                                 pde->getPressureFE(),
+                                                                 pde->getThermalFE(),
+                                                                 pde->getFieldHelper());
+    qoi_vec[1] = ROL::makePtr<QoI_L2Penalty_ThermalFluids<RealT,DeviceT>>(pde->getVelocityFE(),
+                                                                     pde->getPressureFE(),
+                                                                     pde->getThermalFE(),
+                                                                     pde->getThermalBdryFE(),
+                                                                     pde->getBdryCellLocIds(),
+                                                                     pde->getFieldHelper());
+    auto std_obj = ROL::makePtr<StdObjective_ThermalFluids<RealT>>(*parlist);
+    auto obj = ROL::makePtr<PDE_Objective<RealT,DeviceT>>(qoi_vec,std_obj,assembler);
+    auto stateStore = ROL::makePtr<ROL::VectorController<RealT>>();
+    auto robj = ROL::makePtr<ROL::Reduced_Objective_SimOpt<RealT>>(obj, con, stateStore, up, zp, pp, true, false);
+
+    /*************************************************************************/
+    /***************** BUILD SAMPLER *****************************************/
+    /*************************************************************************/
+    int Nbottom = parlist->sublist("Problem").get("Bottom KL Truncation Order",5);
+    int Nleft   = parlist->sublist("Problem").get("Left KL Truncation Order",5);
+    int Nright  = parlist->sublist("Problem").get("Right KL Truncation Order",5);
+    int stochDim = Nbottom + Nleft + Nright + 3;
+    bool use_sg = parlist->sublist("Problem").get("Use sparse grid",false);
+
+    auto bman = ROL::makePtr<ROL::TpetraTeuchosBatchManager<RealT>>(comm_sample);
+    ROL::Ptr<ROL::SampleGenerator<RealT>> sampler;
+
+    // Build vector of distributions
+    std::vector<ROL::Ptr<ROL::Distribution<RealT>>> distVec(stochDim);
+    Teuchos::ParameterList UList;
+    UList.sublist("Distribution").set("Name","Uniform");
+    UList.sublist("Distribution").sublist("Uniform").set("Lower Bound",-1.0);
+    UList.sublist("Distribution").sublist("Uniform").set("Upper Bound", 1.0);
+    for (int i = 0; i < stochDim; ++i)
+      distVec[i] = ROL::DistributionFactory<RealT>(UList);
+
+    if (use_sg) {
+      int maxLevel   = parlist->sublist("Problem").get("Maximum Sparse Grid Level",7);
+      bool printSG   = parlist->sublist("Problem").get("Print Sparse Grid Size",false);
+      ROL::QuadratureInfo info;
+      info.dim        = stochDim;
+      info.maxLevel   = maxLevel;
+      info.normalized = true;
+      info.adaptive   = false;
+      info.print      = (printSG&&((myRankLinAlg == 0) && (myRankSample == 0)));
+      info.name       = "Full";
+      info.rule1D.clear();   info.rule1D.resize(info.dim,ROL::QUAD_CLENSHAWCURTIS);
+      info.growth1D.clear(); info.growth1D.resize(info.dim,ROL::GROWTH_DEFAULT);
+      sampler = ROL::makePtr<ROL::SparseGridGenerator<RealT>>(bman,info,false);
+    }
+    else { 
+      // Sampler
+      int nsamp = parlist->sublist("Problem").get("Number of samples",100);
+      sampler = ROL::makePtr<ROL::MonteCarloGenerator<RealT>>(nsamp,distVec,bman);
+    }
+
+    /*************************************************************************/
+    /***************** BUILD STOCHASTIC PROBLEM ******************************/
+    /*************************************************************************/
+    bool useW    = parlist->sublist("SOL").sublist("Simulated").get("Use Constraint Weights", true);
+    bool useCVaR = parlist->sublist("SOL").sublist("Simulated").get("Use CVaR", false);
+    auto simcon = ROL::makePtr<ROL::SimulatedConstraint<RealT>>(sampler, con, useW);
+    ROL::Ptr<ROL::Objective<RealT>> simobj;
+    if (useCVaR) {
+      Teuchos::ParameterList list = parlist->sublist("SOL").sublist("Simulated");
+        auto pf = ROL::makePtr<ROL::PlusFunction<RealT>>(list);
+      RealT alpha = parlist->sublist("SOL").sublist("Simulated").get("CVaR Confidence Level", 0.9);
+      simobj = ROL::makePtr<ROL::SimulatedObjectiveCVaR<RealT>>(sampler, obj, pf, alpha);
+    }
+    else {
+      simobj = ROL::makePtr<ROL::SimulatedObjective<RealT>>(sampler, obj);
+    }
+    std::vector<ROL::Ptr<ROL::Vector<RealT>>> vuvec, vpvec, vyvec;
+    for (int i = 0; i < sampler->numMySamples(); ++i) {
+      auto vu_ptr  = assembler->createStateVector(); vu_ptr->putScalar(4.567); //vu_ptr->randomize();
+      auto vp_ptr  = assembler->createStateVector(); vp_ptr->putScalar(5.678); //vp_ptr->randomize();
+      auto vy_ptr  = assembler->createStateVector(); vy_ptr->putScalar(6.789); //vy_ptr->randomize();
+      auto vup  = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(vu_ptr,pde,assembler);
+      auto vpp  = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(vp_ptr,pde,assembler);
+      auto vyp  = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(vy_ptr,pde,assembler);
+      vuvec.push_back(vup);
+      vpvec.push_back(vpp);
+      vyvec.push_back(vyp);
+    }
+    auto vu = ROL::makePtr<ROL::SimulatedVector<RealT,DeviceT>>(vuvec,bman);
+    auto vp = ROL::makePtr<ROL::SimulatedVector<RealT,DeviceT>>(vpvec,bman);
+    auto vy = ROL::makePtr<ROL::SimulatedVector<RealT,DeviceT>>(vyvec,bman);
+    ROL::Ptr<ROL::Vector<RealT> > rz, rs, rt;
+    if (useCVaR) {
+      Teuchos::RCP<Teuchos::ParameterList> cvarList = Teuchos::rcp( new Teuchos::ParameterList() );
+      cvarList->sublist("SOL").sublist("Risk Measure").set("Name","CVaR");
+      rz = ROL::makePtr<ROL::RiskVector<RealT>>(cvarList, zp);
+      rs = ROL::makePtr<ROL::RiskVector<RealT>>(cvarList, sp);
+      rt = ROL::makePtr<ROL::RiskVector<RealT>>(cvarList, tp);
+    }
+    else {
+      rz = zp;
+      rs = sp;
+      rt = tp;
+    }
+    ROL::Vector_SimOpt<RealT> x(vu,rz);
+    ROL::Vector_SimOpt<RealT> p(vp,rs);
+    ROL::Vector_SimOpt<RealT> y(vy,rt);
+    x.checkVector(p,y,true,*outStream);
+
+    bool derivCheck = parlist->sublist("Problem").get("Check derivatives",false);
+    if (derivCheck) {
+      *outStream << std::endl << "TESTING SimulatedConstraint" << std::endl;
+      simcon->checkApplyJacobian(x, p, *vu, true, *outStream);
+      simcon->checkAdjointConsistencyJacobian(*vu, p, x, *vu, x, true, *outStream);
+      simcon->checkApplyAdjointHessian(x, *vu, p, x, true, *outStream);
+      *outStream << std::endl << "TESTING SimulatedObjective" << std::endl;
+      RealT tol = 1e-8;
+      simobj->value(x, tol);
+      simobj->checkGradient(x, p, true, *outStream);
+      simobj->checkHessVec(x, p, true, *outStream);
+    }
+
+    zp->zero();
+    auto vusim = ROL::dynamicPtrCast<ROL::SimulatedVector<RealT>>(vu);
+    for (int i = 0; i < sampler->numMySamples(); ++i) {
+      RealT tol = 1e-8;
+      std::vector<RealT> param = sampler->getMyPoint(i);
+      con->setParameter(param);
+      vusim->get(i)->zero();
+      con->update(*(vusim->get(i)),*zp);
+      con->solve(*rp,*(vusim->get(i)),*zp,tol);
+    }
+
+    bool zeroInit = parlist->sublist("Problem").get("Zero initial guess",true);
+    if (zeroInit) {
+      x.zero();
+      vp->zero();
+    }
+
+    /*************************************************************************/
+    /***************** SOLVE PROBLEM *****************************************/
+    /*************************************************************************/
+    auto step = ROL::makePtr<ROL::CompositeStep<RealT>>(*parlist);
+    auto status = ROL::makePtr<ROL::ConstraintStatusTest<RealT>>(*parlist);
+    ROL::Algorithm<RealT> algo(step,status,false);
+    std::clock_t timer = std::clock();
+    algo.run(x, *vp, *simobj, *simcon, true, *outStream);
+    *outStream << "Optimization time: "
+               << static_cast<RealT>(std::clock()-timer)/static_cast<RealT>(CLOCKS_PER_SEC)
+               << " seconds." << std::endl << std::endl;
+    
+    /*************************************************************************/
+    /***************** OUTPUT RESULTS ****************************************/
+    /*************************************************************************/
+    std::clock_t timer_print = std::clock();
+    assembler->printMeshData(*outStream);
+    // Output control to file
+    pdecon->outputTpetraVector(z_ptr,"control.txt");
+    // Output expected state and samples to file
+    *outStream << std::endl << "Print Expected Value of State" << std::endl;
+    up->zero(); pp->zero();
+    for (int i = 0; i < sampler->numMySamples(); ++i)
+      up->axpy(sampler->getMyWeight(i),*(vusim->get(i)));
+    bman->sumAll(*up,*pp);
+    con->outputTpetraVector(p_ptr,"mean_state.txt");
+    // Build full objective function distribution
+    *outStream << std::endl << "Print Objective CDF" << std::endl;
+    int nsamp_dist = parlist->sublist("Problem").get("Number of output samples",100);
+    auto sampler_dist = ROL::makePtr<ROL::MonteCarloGenerator<RealT>>(nsamp_dist,distVec,bman);
+    print<RealT>(*robj,*zp,*sampler_dist,nsamp_dist,comm_sample,"obj_samples.txt");
+    // Build vorticity objective function distribution
+    auto obj0 = ROL::makePtr<IntegralObjective<RealT,DeviceT>>(qoi_vec[0],assembler);
+    auto stateStore0 = ROL::makePtr<ROL::VectorController<RealT>>();
+    auto robj0 = ROL::makePtr<ROL::Reduced_Objective_SimOpt<RealT>>(obj0, con, stateStore0, up, zp, pp, true, false);
+    print<RealT>(*robj0,*zp,*sampler_dist,nsamp_dist,comm_sample,"vort_samples.txt");
+
+    *outStream << "Output time: "
+               << static_cast<RealT>(std::clock()-timer_print)/static_cast<RealT>(CLOCKS_PER_SEC)
+               << " seconds." << std::endl << std::endl;
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/published/IMAvolumes_KouriRidzal2017/example_RS.cpp b/packages/rol/example/PDE-OPT/published/IMAvolumes_KouriRidzal2017/example_RS.cpp
index 0c11c489dd0b..01cd950afcc3 100644
--- a/packages/rol/example/PDE-OPT/published/IMAvolumes_KouriRidzal2017/example_RS.cpp
+++ b/packages/rol/example/PDE-OPT/published/IMAvolumes_KouriRidzal2017/example_RS.cpp
@@ -14,8 +14,8 @@
 
 #include "Teuchos_Comm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
+#include "ROL_XMLReader.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -110,7 +110,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int> > comm
     = Tpetra::getDefaultComm();
   ROL::Ptr<const Teuchos::Comm<int> > serial_comm
diff --git a/packages/rol/example/PDE-OPT/published/IMAvolumes_KouriRidzal2017/example_RSK.cpp b/packages/rol/example/PDE-OPT/published/IMAvolumes_KouriRidzal2017/example_RSK.cpp
new file mode 100644
index 000000000000..39366c350a85
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/published/IMAvolumes_KouriRidzal2017/example_RSK.cpp
@@ -0,0 +1,358 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  example_RS.cpp
+    \brief Reduced-space solution of a thermal-fluids problem with random coefficients,
+           using a risk-neutral formulation and adaptive sparse-grid sampling.
+*/
+
+#include "Teuchos_Comm.hpp"
+#include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
+#include "ROL_XMLReader.hpp"
+
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Version.hpp"
+
+#include <iostream>
+#include <algorithm>
+
+#include "ROL_TpetraMultiVector.hpp"
+#include "ROL_Reduced_Objective_SimOpt.hpp"
+#include "ROL_MonteCarloGenerator.hpp"
+#include "ROL_SparseGridGenerator.hpp"
+#include "ROL_OptimizationSolver.hpp"
+#include "ROL_TpetraTeuchosBatchManager.hpp"
+
+#include "../../TOOLS/pdeconstraintK.hpp"
+#include "../../TOOLS/pdeobjectiveK.hpp"
+#include "../../TOOLS/pdevectorK.hpp"
+#include "pde_thermal-fluidsK.hpp"
+#include "obj_thermal-fluidsK.hpp"
+#include "mesh_thermal-fluidsK.hpp"
+
+template<class Real>
+void print(ROL::Objective<Real> &obj,
+           const ROL::Vector<Real> &z,
+           ROL::SampleGenerator<Real> &sampler,
+           const int ngsamp,
+           const ROL::Ptr<const Teuchos::Comm<int> > &comm,
+           const std::string &filename) {
+  Real tol(1e-8);
+  // Build objective function distribution
+  int nsamp = sampler.numMySamples();
+  std::vector<Real> myvalues(nsamp), myzerovec(nsamp, 0);
+  std::vector<double> gvalues(ngsamp), gzerovec(ngsamp, 0);
+  std::vector<Real> sample = sampler.getMyPoint(0);
+  int sdim = sample.size();
+  std::vector<std::vector<Real> > mysamples(sdim, myzerovec);
+  std::vector<std::vector<double> > gsamples(sdim, gzerovec);
+  for (int i = 0; i < nsamp; ++i) {
+    sample = sampler.getMyPoint(i);
+    obj.setParameter(sample);
+    myvalues[i] = static_cast<double>(obj.value(z,tol));
+    for (int j = 0; j < sdim; ++j)
+      mysamples[j][i] = static_cast<double>(sample[j]);
+  }
+
+  // Send data to root processor
+#ifdef HAVE_MPI
+  auto mpicomm = ROL::dynamicPtrCast<const Teuchos::MpiComm<int>>(comm);
+  int nproc = Teuchos::size<int>(*mpicomm);
+  std::vector<int> sampleCounts(nproc, 0), sampleDispls(nproc, 0);
+  MPI_Gather(&nsamp,1,MPI_INT,&sampleCounts[0],1,MPI_INT,0,*(mpicomm->getRawMpiComm())());
+  for (int i = 1; i < nproc; ++i)
+    sampleDispls[i] = sampleDispls[i-1] + sampleCounts[i-1];
+  MPI_Gatherv(&myvalues[0],nsamp,MPI_DOUBLE,&gvalues[0],&sampleCounts[0],&sampleDispls[0],MPI_DOUBLE,0,*(mpicomm->getRawMpiComm())());
+  for (int j = 0; j < sdim; ++j)
+    MPI_Gatherv(&mysamples[j][0],nsamp,MPI_DOUBLE,&gsamples[j][0],&sampleCounts[0],&sampleDispls[0],MPI_DOUBLE,0,*(mpicomm->getRawMpiComm())());
+#else
+  gvalues.assign(myvalues.begin(),myvalues.end());
+  for (int j = 0; j < sdim; ++j)
+    gsamples[j].assign(mysamples[j].begin(),mysamples[j].end());
+#endif
+
+  // Print
+  int rank  = Teuchos::rank<int>(*comm);
+  if ( rank==0 ) {
+    std::ofstream file;
+    file.open(filename);
+    file << std::scientific << std::setprecision(15);
+    for (int i = 0; i < ngsamp; ++i) {
+      for (int j = 0; j < sdim; ++j)
+        file << std::setw(25) << std::left << gsamples[j][i];
+      file << std::setw(25) << std::left << gvalues[i] << std::endl;
+    }
+    file.close();
+  }
+}
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+
+int main(int argc, char *argv[]) {
+
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+
+  /*** Initialize communicator. ***/
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  Kokkos::ScopeGuard kokkosScope (argc, argv);
+  auto comm = Tpetra::getDefaultComm();
+  auto serial_comm = ROL::makePtr<Teuchos::SerialComm<int>>();
+  const int myRank = comm->getRank();
+  if ((iprint > 0) && (myRank == 0))
+    outStream = ROL::makePtrFromRef(std::cout);
+  else
+    outStream = ROL::makePtrFromRef(bhs);
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+
+    /*** Read in XML input ***/
+    std::string filename = "input_RS.xml";
+    auto parlist = ROL::getParametersFromXmlFile(filename);
+    bool printSimOpt = parlist->sublist("SimOpt").sublist("Solve").get("Output Iteration History",false);
+    parlist->sublist("SimOpt").sublist("Solve").set("Output Iteration History",printSimOpt&&(myRank==0));
+    int verbosity = parlist->sublist("General").get("Print Verbosity",0);
+    verbosity = (myRank != 0) ? 0 : verbosity;
+    parlist->sublist("General").set("Print Verbosity",verbosity);
+
+    /*** Initialize main data structure. ***/
+    auto meshMgr = ROL::makePtr<MeshManager_ThermalFluids<RealT,DeviceT>>(*parlist);
+    // Initialize PDE describing Navier-Stokes equations.
+    auto pde = ROL::makePtr<PDE_ThermalFluids_ex03<RealT,DeviceT>>(*parlist);
+    auto con = ROL::makePtr<PDE_Constraint<RealT,DeviceT>>(pde,meshMgr,serial_comm,*parlist,*outStream);
+    // Cast the constraint and get the assembler.
+    auto assembler = pdecon->getAssembler();
+    con->setSolveParameters(*parlist);
+    con->outputTpetraData();
+
+    // Create state vector and set to zeroes
+    auto u_ptr  = assembler->createStateVector();    u_ptr->randomize();
+    auto p_ptr  = assembler->createStateVector();    p_ptr->randomize();
+    auto du_ptr = assembler->createStateVector();    du_ptr->randomize();
+    auto dy_ptr = assembler->createStateVector();    dy_ptr->randomize();
+    auto yu_ptr = assembler->createStateVector();    yu_ptr->randomize();
+    auto yp_ptr = assembler->createStateVector();    yp_ptr->randomize();
+    auto r_ptr  = assembler->createResidualVector(); r_ptr->randomize();
+    auto z_ptr  = assembler->createControlVector();  z_ptr->randomize();
+    auto dz_ptr = assembler->createControlVector();  dz_ptr->randomize();
+    auto up  = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(u_ptr,pde,assembler);
+    auto pp  = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(p_ptr,pde,assembler);
+    auto dup = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(du_ptr,pde,assembler);
+    auto dyp = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(dy_ptr,pde,assembler);
+    auto yup = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(yu_ptr,pde,assembler);
+    auto ypp = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(yp_ptr,pde,assembler);
+    auto rp  = ROL::makePtr<PDE_DualSimVector<RealT,DeviceT>>(r_ptr,pde,assembler);
+    auto zp  = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(z_ptr,pde,assembler);
+    auto dzp = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(dz_ptr,pde,assembler);
+    // Create ROL SimOpt vectors
+    ROL::Vector_SimOpt<RealT> x(up,zp);
+    ROL::Vector_SimOpt<RealT> d(dup,dzp);
+
+    // Initialize objective function.
+    std::vector<ROL::Ptr<QoI<RealT,DeviceT>>> qoi_vec(2,ROL::nullPtr);
+    qoi_vec[0] = ROL::makePtr<QoI_State_ThermalFluids<RealT,DeviceT>>(*parlist,
+                                                                 pde->getVelocityFE(),
+                                                                 pde->getPressureFE(),
+                                                                 pde->getThermalFE(),
+                                                                 pde->getFieldHelper());
+    qoi_vec[1] = ROL::makePtr<QoI_L2Penalty_ThermalFluids<RealT,DeviceT>>(pde->getVelocityFE(),
+                                                                     pde->getPressureFE(),
+                                                                     pde->getThermalFE(),
+                                                                     pde->getThermalBdryFE(),
+                                                                     pde->getBdryCellLocIds(),
+                                                                     pde->getFieldHelper());
+    auto std_obj = ROL::makePtr<StdObjective_ThermalFluids<RealT>>(*parlist);
+    auto obj = ROL::makePtr<PDE_Objective<RealT,DeviceT>>(qoi_vec,std_obj,assembler);
+    auto stateStore = ROL::makePtr<ROL::VectorController<RealT>>();
+    auto robj = ROL::makePtr<ROL::Reduced_Objective_SimOpt<RealT>>(obj, con, stateStore, up, zp, pp, true, false);
+
+    /*************************************************************************/
+    /***************** BUILD SAMPLER *****************************************/
+    /*************************************************************************/
+    int Nbottom = parlist->sublist("Problem").get("Bottom KL Truncation Order",5);
+    int Nleft   = parlist->sublist("Problem").get("Left KL Truncation Order",5);
+    int Nright  = parlist->sublist("Problem").get("Right KL Truncation Order",5);
+    int stochDim = Nbottom + Nleft + Nright + 3;
+    // Build vector of distributions
+    std::vector<ROL::Ptr<ROL::Distribution<RealT>>> distVec(stochDim);
+    Teuchos::ParameterList UList;
+    UList.sublist("Distribution").set("Name","Uniform");
+    UList.sublist("Distribution").sublist("Uniform").set("Lower Bound",-1.0);
+    UList.sublist("Distribution").sublist("Uniform").set("Upper Bound", 1.0);
+    for (int i = 0; i < stochDim; ++i)
+      distVec[i] = ROL::DistributionFactory<RealT>(UList);
+    // Sampler
+    int maxLevel   = parlist->sublist("Problem").get("Maximum Sparse Grid Level",7);
+    bool adaptiveV = parlist->sublist("Problem").get("Use Value Adaptive Sparse Grids",false);
+    bool adaptiveG = parlist->sublist("Problem").get("Use Gradient Adaptive Sparse Grids",true);
+    bool printSG   = parlist->sublist("Problem").get("Print Sparse Grid Size",false);
+    ROL::QuadratureInfo info, vinfo, ginfo;
+    info.dim        = stochDim;
+    info.maxLevel   = maxLevel;
+    info.normalized = true;
+    info.adaptive   = false;
+    info.print      = (printSG&&(myRank==0));
+    info.name       = "Full";
+    info.rule1D.clear();   info.rule1D.resize(info.dim,ROL::QUAD_CLENSHAWCURTIS);
+    info.growth1D.clear(); info.growth1D.resize(info.dim,ROL::GROWTH_DEFAULT);
+    vinfo           = info;
+    vinfo.adaptive  = adaptiveV;
+    vinfo.name      = "Value";
+    ginfo           = info;
+    ginfo.adaptive  = adaptiveG;
+    ginfo.name      = "Gradient";
+    auto bman = ROL::makePtr<ROL::TpetraTeuchosBatchManager<RealT>>(comm);
+    auto sampler = ROL::makePtr<ROL::SparseGridGenerator<RealT>>(bman,info,false);
+    auto vsampler = ROL::makePtr<ROL::SparseGridGenerator<RealT>>(bman,vinfo,adaptiveV);
+    auto gsampler = ROL::makePtr<ROL::SparseGridGenerator<RealT>>(bman,ginfo,adaptiveG);
+    /*************************************************************************/
+    /***************** BUILD STOCHASTIC PROBLEM ******************************/
+    /*************************************************************************/
+    ROL::OptimizationProblem<RealT> opt(robj,zp);
+    parlist->sublist("SOL").set("Initial Statistic", static_cast<RealT>(1));
+    opt.setStochasticObjective(*parlist,vsampler,gsampler);
+
+    // Run derivative checks
+    bool checkDeriv = parlist->sublist("Problem").get("Check derivatives",false);
+    if ( checkDeriv ) {
+      *outStream << "Check Gradient of Full Objective Function" << std::endl;
+      obj->checkGradient(x,d,true,*outStream);
+      *outStream << std::endl << "Check Hessian of Full Objective Function" << std::endl;
+      obj->checkHessVec(x,d,true,*outStream);
+      *outStream << std::endl << "Check Jacobian of Constraint" << std::endl;
+      con->checkApplyJacobian(x,d,*up,true,*outStream);
+      *outStream << std::endl << "Check Jacobian_1 of Constraint" << std::endl;
+      con->checkApplyJacobian_1(*up,*zp,*dup,*rp,true,*outStream);
+      *outStream << std::endl << "Check Jacobian_2 of Constraint" << std::endl;
+      con->checkApplyJacobian_2(*up,*zp,*dzp,*rp,true,*outStream);
+      *outStream << std::endl << "Check Hessian of Constraint" << std::endl;
+      con->checkApplyAdjointHessian(x,*dup,d,x,true,*outStream);
+      *outStream << std::endl << "Check Hessian_11 of Constraint" << std::endl;
+      con->checkApplyAdjointHessian_11(*up,*zp,*pp,*dup,*rp,true,*outStream);
+      *outStream << std::endl << "Check Hessian_12 of Constraint" << std::endl;
+      con->checkApplyAdjointHessian_12(*up,*zp,*pp,*dup,*dzp,true,*outStream);
+      *outStream << std::endl << "Check Hessian_21 of Constraint" << std::endl;
+      con->checkApplyAdjointHessian_21(*up,*zp,*pp,*dzp,*rp,true,*outStream);
+      *outStream << std::endl << "Check Hessian_22 of Constraint" << std::endl;
+      con->checkApplyAdjointHessian_22(*up,*zp,*pp,*dzp,*dzp,true,*outStream);
+
+      *outStream << std::endl << "Check Adjoint Jacobian of Constraint" << std::endl;
+      con->checkAdjointConsistencyJacobian(*dup,d,x,true,*outStream);
+      *outStream << std::endl << "Check Adjoint Jacobian_1 of Constraint" << std::endl;
+      con->checkAdjointConsistencyJacobian_1(*pp,*dup,*up,*zp,true,*outStream);
+      *outStream << std::endl << "Check Adjoint Jacobian_2 of Constraint" << std::endl;
+      con->checkAdjointConsistencyJacobian_2(*pp,*dzp,*up,*zp,true,*outStream);
+
+      *outStream << std::endl << "Check Constraint Solve" << std::endl;
+      con->checkSolve(*up,*zp,*rp,true,*outStream);
+      *outStream << std::endl << "Check Inverse Jacobian_1 of Constraint" << std::endl;
+      con->checkInverseJacobian_1(*rp,*dup,*up,*zp,true,*outStream);
+      *outStream << std::endl << "Check Inverse Adjoint Jacobian_1 of Constraint" << std::endl;
+      con->checkInverseAdjointJacobian_1(*rp,*pp,*up,*zp,true,*outStream);
+
+      *outStream << std::endl << "Check Gradient of Reduced Objective Function" << std::endl;
+      robj->checkGradient(*zp,*dzp,true,*outStream);
+      *outStream << std::endl << "Check Hessian of Reduced Objective Function" << std::endl;
+      robj->checkHessVec(*zp,*dzp,true,*outStream);
+    }
+
+    up->zero();
+    zp->zero();
+    parlist->sublist("Step").set("Type","Trust Region");
+    ROL::OptimizationSolver<RealT> solver(opt,*parlist);
+    std::clock_t timer = std::clock();
+    solver.solve(*outStream);
+    *outStream << "Optimization time: "
+               << static_cast<RealT>(std::clock()-timer)/static_cast<RealT>(CLOCKS_PER_SEC)
+               << " seconds." << std::endl << std::endl;
+
+    parlist->sublist("General").set("Inexact Objective Function", false);
+    parlist->sublist("General").set("Inexact Gradient", false);
+    ROL::OptimizationProblem<RealT> optFull(robj,zp);
+    parlist->sublist("SOL").set("Initial Statistic", static_cast<RealT>(1));
+    opt.setStochasticObjective(*parlist,sampler);
+    ROL::Algorithm<RealT> algoFull("Trust Region",*parlist,false);
+    std::clock_t timerFull = std::clock();
+    algoFull.run(optFull,true,*outStream);
+    *outStream << "Optimization time: "
+               << static_cast<RealT>(std::clock()-timerFull)/static_cast<RealT>(CLOCKS_PER_SEC)
+               << " seconds." << std::endl << std::endl;
+    
+    /*************************************************************************/
+    /***************** OUTPUT RESULTS ****************************************/
+    /*************************************************************************/
+    vsampler->print("Value_samples");
+    gsampler->print("Gradient_samples");
+    ROL::dynamicPtrCast<ROL::SparseGridGenerator<RealT>>(vsampler)->printIndexSet();
+    ROL::dynamicPtrCast<ROL::SparseGridGenerator<RealT>>(gsampler)->printIndexSet();
+    std::clock_t timer_print = std::clock();
+    assembler->printMeshData(*outStream);
+    // Output control to file
+    con->outputTpetraVector(z_ptr,"control.txt");
+    // Output expected state and samples to file
+    *outStream << std::endl << "Print Expected Value of State" << std::endl;
+    up->zero(); pp->zero(); dup->zero(); dzp->zero(); yup->zero(); dyp->zero();
+    RealT tol(1.e-8);
+    auto bman_Eu = ROL::makePtr<ROL::TpetraTeuchosBatchManager<RealT>>(comm);
+    std::vector<RealT> sample(stochDim);
+    std::stringstream name_samp;
+    name_samp << "samples_" << bman->batchID() << ".txt";
+    std::ofstream file_samp;
+    file_samp.open(name_samp.str());
+    file_samp << std::scientific << std::setprecision(15);
+    for (int i = 0; i < sampler->numMySamples(); ++i) {
+      *outStream << "Sample i = " << i << std::endl;
+      sample = sampler->getMyPoint(i);
+      con->setParameter(sample);
+      con->solve(*rp,*dup,*zp,tol);
+      up->axpy(sampler->getMyWeight(i),*dup);
+      con->solve(*rp,*dyp,*dzp,tol);
+      yup->axpy(sampler->getMyWeight(i),*dyp);
+      for (int j = 0; j < stochDim; ++j)
+        file_samp << std::setw(25) << std::left << sample[j];
+      file_samp << std::endl;
+    }
+    file_samp.close();
+    bman_Eu->sumAll(*up,*pp);
+    bman_Eu->sumAll(*yup,*ypp);
+    con->outputTpetraVector(p_ptr,"mean_state.txt");
+    con->outputTpetraVector(yp_ptr,"mean_uncontrolled_state.txt");
+    // Build full objective function distribution
+    *outStream << std::endl << "Print Objective CDF" << std::endl;
+    int nsamp_dist = parlist->sublist("Problem").get("Number of output samples",100);
+    auto sampler_dist = ROL::makePtr<ROL::MonteCarloGenerator<RealT>>(nsamp_dist,distVec,bman);
+    print<RealT>(*robj,*zp,*sampler_dist,nsamp_dist,comm,"obj_samples.txt");
+    // Build vorticity objective function distribution
+    auto obj0 = ROL::makePtr<IntegralObjective<RealT,DeviceT>>(qoi_vec[0],assembler);
+    auto stateStore0 = ROL::makePtr<ROL::VectorController<RealT>>();
+    auto robj0 = ROL::makePtr<ROL::Reduced_Objective_SimOpt<RealT>>(obj0, con, stateStore0, up, zp, pp, true, false);
+    print<RealT>(*robj0,*zp,*sampler_dist,nsamp_dist,comm,"vort_samples.txt");
+
+    *outStream << "Output time: "
+               << static_cast<RealT>(std::clock()-timer_print)/static_cast<RealT>(CLOCKS_PER_SEC)
+               << " seconds." << std::endl << std::endl;
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/published/IMAvolumes_KouriRidzal2017/mesh_thermal-fluidsK.hpp b/packages/rol/example/PDE-OPT/published/IMAvolumes_KouriRidzal2017/mesh_thermal-fluidsK.hpp
new file mode 100644
index 000000000000..e100d55c7755
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/published/IMAvolumes_KouriRidzal2017/mesh_thermal-fluidsK.hpp
@@ -0,0 +1,129 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+#include "../../TOOLS/meshmanagerK.hpp"
+
+template <class Real, class DeviceType>
+class MeshManager_ThermalFluids : public MeshManager_Rectangle<Real,DeviceType> {
+private:
+  int nx_;
+  int ny_;
+
+  ROL::Ptr<std::vector<std::vector<std::vector<int>>>>  meshSideSets_;
+
+public:
+
+  MeshManager_ThermalFluids(ROL::ParameterList &parlist) : MeshManager_Rectangle<Real,DeviceType>(parlist) {
+    nx_ = parlist.sublist("Geometry").get("NX", 3);
+    ny_ = parlist.sublist("Geometry").get("NY", 1);
+    computeSideSets();
+  }
+
+  void computeSideSets() {
+
+    int numSideSets = 8;
+    meshSideSets_ = ROL::makePtr<std::vector<std::vector<std::vector<int>>>>(numSideSets);
+
+    Real patchFrac = static_cast<Real>(1)/static_cast<Real>(3);
+    int np1 = static_cast<int>(patchFrac * static_cast<Real>(nx_));
+    int np2 = static_cast<int>(patchFrac * static_cast<Real>(nx_));
+    int np3 = nx_-(np1+np2);
+
+    // Bottom
+    (*meshSideSets_)[0].resize(4);
+    (*meshSideSets_)[0][0].resize(nx_);
+    (*meshSideSets_)[0][1].resize(0);
+    (*meshSideSets_)[0][2].resize(0);
+    (*meshSideSets_)[0][3].resize(0);
+    // Right
+    (*meshSideSets_)[1].resize(4);
+    (*meshSideSets_)[1][0].resize(0);
+    (*meshSideSets_)[1][1].resize(ny_);
+    (*meshSideSets_)[1][2].resize(0);
+    (*meshSideSets_)[1][3].resize(0);
+    // Left
+    (*meshSideSets_)[2].resize(4);
+    (*meshSideSets_)[2][0].resize(0);
+    (*meshSideSets_)[2][1].resize(0);
+    (*meshSideSets_)[2][2].resize(0);
+    (*meshSideSets_)[2][3].resize(ny_);
+    // Top
+    (*meshSideSets_)[3].resize(4);
+    (*meshSideSets_)[3][0].resize(0);
+    (*meshSideSets_)[3][1].resize(0);
+    (*meshSideSets_)[3][2].resize(nx_);
+    (*meshSideSets_)[3][3].resize(0);
+    // Thermal boundaries
+    // Top Right
+    (*meshSideSets_)[4].resize(4);
+    (*meshSideSets_)[4][0].resize(0);
+    (*meshSideSets_)[4][1].resize(0);
+    (*meshSideSets_)[4][2].resize(np3);
+    (*meshSideSets_)[4][3].resize(0);
+    // Top Center
+    (*meshSideSets_)[5].resize(4);
+    (*meshSideSets_)[5][0].resize(0);
+    (*meshSideSets_)[5][1].resize(0);
+    (*meshSideSets_)[5][2].resize(np2);
+    (*meshSideSets_)[5][3].resize(0);
+    // Top Left
+    (*meshSideSets_)[6].resize(4);
+    (*meshSideSets_)[6][0].resize(0);
+    (*meshSideSets_)[6][1].resize(0);
+    (*meshSideSets_)[6][2].resize(np1);
+    (*meshSideSets_)[6][3].resize(0);
+    // Pressure Pinning
+    (*meshSideSets_)[7].resize(4);
+    //(*meshSideSets_)[7][0].resize(1);
+    //(*meshSideSets_)[7][1].resize(1);
+    //(*meshSideSets_)[7][2].resize(1);
+    //(*meshSideSets_)[7][3].resize(1);
+    (*meshSideSets_)[7][0].resize(1);
+    (*meshSideSets_)[7][1].resize(0);
+    (*meshSideSets_)[7][2].resize(0);
+    (*meshSideSets_)[7][3].resize(0);
+    
+    // Bottom
+    for (int i=0; i<nx_; ++i)
+      (*meshSideSets_)[0][0][i] = i;
+    // Right
+    for (int i=0; i<ny_; ++i)
+      (*meshSideSets_)[1][1][i] = (i+1)*nx_-1;
+    // Left
+    for (int i=0; i<ny_; ++i)
+      (*meshSideSets_)[2][3][i] = i*nx_;
+    // Top
+    for (int i=0; i<nx_; ++i)
+      (*meshSideSets_)[3][2][i] = i + nx_*(ny_-1);
+    // Top Left
+    for (int i=0; i<np1; ++i)
+      (*meshSideSets_)[6][2][i] = i + nx_*(ny_-1);
+    // Top Center
+    for (int i=0; i<np2; ++i)
+      (*meshSideSets_)[5][2][i] = i + np1 + nx_*(ny_-1);
+    // Top Right
+    for (int i=0; i<np3; ++i)
+      (*meshSideSets_)[4][2][i] = i + (np1+np2) + nx_*(ny_-1);
+    // Pressure Pinning
+    //int offset = nx_ * (ny_/2) + (nx_-1)/2;
+    //(*meshSideSets_)[7][0][0] = offset + nx_ + 1;
+    //(*meshSideSets_)[7][1][0] = offset + nx_;
+    //(*meshSideSets_)[7][2][0] = offset;
+    //(*meshSideSets_)[7][3][0] = offset + 1;
+    (*meshSideSets_)[7][0][0] = 0;
+
+  } // computeSideSets
+
+  ROL::Ptr<std::vector<std::vector<std::vector<int>>>> getSideSets(
+              const bool verbose = false,
+              std::ostream & outStream = std::cout) const { 
+    return meshSideSets_;
+  }
+
+}; // MeshManager_ThermalFluids
diff --git a/packages/rol/example/PDE-OPT/published/IMAvolumes_KouriRidzal2017/obj_thermal-fluidsK.hpp b/packages/rol/example/PDE-OPT/published/IMAvolumes_KouriRidzal2017/obj_thermal-fluidsK.hpp
new file mode 100644
index 000000000000..e3794467d79a
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/published/IMAvolumes_KouriRidzal2017/obj_thermal-fluidsK.hpp
@@ -0,0 +1,947 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  obj.hpp
+    \brief Provides the interface for local (cell-based) objective function computations.
+*/
+
+#ifndef PDEOPT_QOI_THERMALFLUIDSK_HPP
+#define PDEOPT_QOI_THERMALFLUIDSK_HPP
+
+#include "../../TOOLS/qoiK.hpp"
+#include "pde_thermal-fluidsK.hpp"
+
+template <class Real, class DeviceType>
+class QoI_Vorticity_ThermalFluids : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type> feVel_, fePrs_, feThr_;
+  const ROL::Ptr<FieldHelper<Real,DeviceType>> fieldHelper_;
+  scalar_view weight_;
+
+  Real weightFunc(const std::vector<Real> & x) const {
+    return static_cast<Real>(1);
+  }
+
+public:
+  QoI_Vorticity_ThermalFluids(const ROL::Ptr<fe_type> &feVel,
+                              const ROL::Ptr<fe_type> &fePrs,
+                              const ROL::Ptr<fe_type> &feThr,
+                              const ROL::Ptr<FieldHelper<Real,DeviceType>> &fieldHelper,
+                              ROL::ParameterList &parlist)
+    : feVel_(feVel), fePrs_(fePrs), feThr_(feThr), fieldHelper_(fieldHelper) {
+    int c = feVel_->cubPts().extent_int(0);
+    int p = feVel_->cubPts().extent_int(1);
+    int d = feVel_->cubPts().extent_int(2);
+    std::vector<Real> pt(d);
+    weight_ = scalar_view("weight", c,p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k)
+          pt[k] = (feVel_->cubPts())(i,j,k);
+        weight_(i,j) = weightFunc(pt);
+      }
+    }
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = u_coeff.extent_int(0);
+    int p = feVel_->cubPts().extent_int(1);
+    int d = feVel_->cubPts().extent_int(2);
+    // Initialize output val
+    val = scalar_view("val", c);
+    // Get components of the control
+    std::vector<scalar_view> U;
+    fieldHelper_->splitFieldCoeff(U, u_coeff);
+    // Evaluate on FE basis
+    std::vector<scalar_view> gradU_vec(d);
+    for (int i = 0; i < d; ++i) {
+      gradU_vec[i] = scalar_view("gradU_vec", c, p, d);
+      feVel_->evaluateGradient(gradU_vec[i], U[i]);
+    }
+    // Compute weighted curl
+    scalar_view curlU_eval;
+    if (d==2)
+      curlU_eval = scalar_view("curlU_eval", c, p);
+    else if (d==3)
+      curlU_eval = scalar_view"curlU_eval", (c, p, d);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        if (d==2)
+          curlU_eval(i,j) = weight_(i,j) * ((gradU_vec[1])(i,j,0)-(gradU_vec[0])(i,j,1));
+        else if (d==3) {
+          for (int k = 0; k < d; ++k) {
+            int i1 = (k+2)%d, i2 = (k+1)%d;
+            curlU_eval(i,j,k) = weight_(i,j) * ((gradU_vec[i1])(i,j,i2)-(gradU_vec[i2])(i,j,i1));
+          }
+        }
+      }
+    }
+    // Compute L2 norm squared
+    feVel_->computeIntegral(val,curlU_eval,curlU_eval,false);
+    rst::scale(val,static_cast<Real>(0.5));
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = u_coeff.extent_int(0);
+    int fv = feVel_->N().extent_int(1);
+    int fp = fePrs_->N().extent_int(1);
+    int fh = feThr_->N().extent_int(1);
+    int p = feVel_->cubPts().extent_int(1);
+    int d = feVel_->cubPts().extent_int(2);
+    // Initialize output grad
+    std::vector<scalar_view> G(d+2);
+    for (int i = 0; i < d; ++i)
+      G[i] = scalar_view("grad", c, fv);
+    G[d] = scalar_view("grad", c, fp);
+    G[d+1] = scalar_view("grad", c, fh);
+    // Get components of the control
+    std::vector<scalar_view> U;
+    fieldHelper_->splitFieldCoeff(U, u_coeff);
+    // Evaluate on FE basis
+    std::vector<scalar_view> gradU_vec(d);
+    for (int i = 0; i < d; ++i) {
+      gradU_vec[i] = scalar_view("gradU_vec", c, p, d);
+      feVel_->evaluateGradient(gradU_vec[i], U[i]);
+    }
+    // Compute weighted curl
+    int size = (d==2) ? 1 : d;
+    std::vector<scalar_view> curlU_vec(size);
+    if (d==2) {
+      curlU_vec[0] = scalar_view("curlU_vec", c, p);
+      for (int i = 0; i < c; ++i) {
+        for (int j = 0; j < p; ++j)
+          (curlU_vec[0])(i,j) = weight_(i,j) * ((gradU_vec[1])(i,j,0)-(gradU_vec[0])(i,j,1));
+      }
+    }
+    else if (d==3) {
+      for (int i = 0; i < d; ++i) {
+        curlU_vec[i] = scalar_view("curlU_vec", c, p);
+        for (int j = 0; j < c; ++j) {
+          for (int k = 0; k < p; ++k) {
+            int i1 = (i+2)%d, i2 = (i+1)%d;
+            (curlU_vec[i])(j,k) = weight_(j,k) * ((gradU_vec[i1])(j,k,i2)-(gradU_vec[i2])(j,k,i1));
+          }
+        }
+      }
+    }
+    // Build local gradient of state tracking term
+    if (d==2) {
+      fst::integrate(G[0],curlU_vec[0],feVel_->DNDdetJ(1),false);
+      rst::scale(G[0],static_cast<Real>(-1));
+      fst::integrate(G[1],curlU_vec[0],feVel_->DNDdetJ(0),false);
+    }
+    else if (d==3) {
+      for (int i = 0; i < d; ++i) {
+        int i1 = (i+2)%d, i2 = (i+1)%d;
+        fst::integrate(G[i],curlU_vec[i1],feVel_->DNDdetJ(i2),false);
+        rst::scale(G[i],static_cast<Real>(-1));
+        fst::integrate(G[i],curlU_vec[i2],feVel_->DNDdetJ(i1),false);
+      }
+    }
+    fieldHelper_->combineFieldCoeff(grad, G);
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Vorticity_ThermalFluids::gradient_2 is zero.");
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c  = z_coeff.extent_int(0);
+    int p  = feVel_->cubPts().extent_int(1);
+    int fv = feVel_->N().extent_int(1);
+    int fp = fePrs_->N().extent_int(1);
+    int fh = feThr_->N().extent_int(1);
+    int d = feVel_->cubPts().extent_int(2);
+    // Initialize output grad
+    std::vector<scalar_view> H(d+2);
+    for (int i = 0; i < d; ++i)
+      H[i] = scalar_view("hess", c, fv);
+    H[d] = scalar_view("hess", c, fp);
+    H[d+1] = scalar_view("hess", c, fh);
+    // Get components of the control
+    std::vector<scalar_view> V;
+    fieldHelper_->splitFieldCoeff(V, v_coeff);
+    // Evaluate on FE basis
+    std::vector<scalar_view> gradV_vec(d);
+    for (int i = 0; i < d; ++i) {
+      gradV_vec[i] = scalar_view("gradV_vec", c, p, d);
+      feVel_->evaluateGradient(gradV_vec[i], V[i]);
+    }
+    // Compute weighted curl
+    int size = (d==2) ? 1 : d;
+    std::vector<scalar_view> curlV_vec(size);
+    if (d==2) {
+      curlV_vec[0] = scalar_view("curlV_vec", c, p);
+      for (int i = 0; i < c; ++i) {
+        for (int j = 0; j < p; ++j)
+          (curlV_vec[0])(i,j) = weight_(i,j) * ((gradV_vec[1])(i,j,0)-(gradV_vec[0])(i,j,1));
+      }
+    }
+    else if (d==3) {
+      for (int i = 0; i < d; ++i) {
+        curlV_vec[i] = scalar_view("curlV_vec", c, p);
+        for (int j = 0; j < c; ++j) {
+          for (int k = 0; k < p; ++k) {
+            int i1 = (i+2)%d, i2 = (i+1)%d;
+            (curlV_vec[i])(j,k) = weight_(j,k) * ((gradV_vec[i1])(j,k,i2)-(gradV_vec[i2])(j,k,i1));
+          }
+        }
+      }
+    }
+    // Build local gradient of state tracking term
+    if (d==2) {
+      fst::integrate(H[0],curlV_vec[0],feVel_->DNDdetJ(1),false);
+      rst::scale(H[0],static_cast<Real>(-1));
+      fst::integrate(H[1],curlV_vec[0],feVel_->DNDdetJ(0),false);
+    }
+    else if (d==3) {
+      for (int i = 0; i < d; ++i) {
+        int i1 = (i+2)%d, i2 = (i+1)%d;
+        fst::integrate(H[i],curlV_vec[i1],feVel_->DNDdetJ(i2),false);
+        rst::scale(*H[i],static_cast<Real>(-1));
+        fst::integrate(H[i],curlV_vec[i2],feVel_->DNDdetJ(i1),false);
+      }
+    }
+
+    fieldHelper_->combineFieldCoeff(hess, H);
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Vorticity_ThermalFluids::HessVec_12 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Vorticity_ThermalFluids::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Vorticity_ThermalFluids::HessVec_22 is zero.");
+  }
+
+}; // QoI_Vorticity_ThermalFluids
+
+template <class Real>
+class QoI_Circulation_ThermalFluids : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type> feVel_, fePrs_, feThr_;
+  const ROL::Ptr<FieldHelper<Real,DeviceType>> fieldHelper_;
+  scalar_view weight_;
+
+  Real weightFunc(const std::vector<Real> & x) const {
+    return static_cast<Real>(1);
+  }
+
+public:
+  QoI_Circulation_ThermalFluids(const ROL::Ptr<fe_type> &feVel,
+                                const ROL::Ptr<fe_type> &fePrs,
+                                const ROL::Ptr<fe_type> &feThr,
+                                const ROL::Ptr<FieldHelper<Real,DeviceType>> &fieldHelper)
+    : feVel_(feVel), fePrs_(fePrs), feThr_(feThr), fieldHelper_(fieldHelper) {
+    int c = feVel_->cubPts().extent_int(0);
+    int p = feVel_->cubPts().extent_int(1);
+    int d = feVel_->cubPts().extent_int(2);
+    std::vector<Real> pt(d);
+    weight_ = scalar_view("weight",c,p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k)
+          pt[k] = (feVel_->cubPts())(i,j,k);
+        weight_(i,j) = weightFunc(pt);
+      }
+    }
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = u_coeff.extent_int(0);
+    int p = feVel_->cubPts().extent_int(1);
+    int d = feVel_->cubPts().extent_int(2);
+    // Initialize output val
+    val = scalar_view("val", c);
+    // Get components of the control
+    std::vector<scalar_view> U;
+    fieldHelper_->splitFieldCoeff(U, u_coeff);
+    // Evaluate on FE basis
+    scalar_view gradUX_eval("gradUX_eval", c, p, d);
+    scalar_view gradUY_eval("gradUY_eval", c, p, d);
+    feVel_->evaluateGradient(gradUX_eval, U[0]);
+    feVel_->evaluateGradient(gradUY_eval, U[1]);
+    // Compute curl
+    scalar_view curlU_eval("curlU_eval", c, p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j)
+        curlU_eval(i,j) = gradUY_eval(i,j,0) - gradUX_eval(i,j,1);
+    }
+    // Compute circulation
+    feVel_->computeIntegral(val,curlU_eval,weight_,false);
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = u_coeff.extent_int(0);
+    int fv = feVel_->N().extent_int(1);
+    int fp = fePrs_->N().extent_int(1);
+    int fh = feThr_->N().extent_int(1);
+    int d = feVel_->cubPts().extent_int(2);
+    // Initialize output grad
+    std::vector<scalar_view> G(d+2);
+    for (int i = 0; i < d; ++i)
+      G[i] = scalar_view("grad", c, fv);
+    G[d] = scalar_view("grad", c, fp);
+    G[d+1] = scalar_view("grad", c, fh);
+    // Build local gradient of state tracking term
+    fst::integrate(G[0],weight_,feVel_->DNDdetJ(1),false);
+    rst::scale(G[0],static_cast<Real>(-1));
+    fst::integrate(G[1],weight_,feVel_->DNDdetJ(0),false);
+
+    fieldHelper_->combineFieldCoeff(grad, G);
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Circulation_ThermalFluids::gradient_2 is zero.");
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Circulation_ThermalFluids::HessVec_11 is zero.");
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Circulation_ThermalFluids::HessVec_12 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Circulation_ThermalFluids::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Circulation_ThermalFluids::HessVec_22 is zero.");
+  }
+
+}; // QoI_Circulation_ThermalFluids
+
+template <class Real>
+class QoI_Horizontal_ThermalFluids : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type> feVel_, fePrs_, feThr_;
+  const ROL::Ptr<FieldHelper<Real,DeviceType>> fieldHelper_;
+  scalar_view weight_;
+
+  Real weightFunc(const std::vector<Real> & x) const {
+    return static_cast<Real>(1);
+  }
+
+public:
+  QoI_Horizontal_ThermalFluids(const ROL::Ptr<fe_type> &feVel,
+                               const ROL::Ptr<fe_type> &fePrs,
+                               const ROL::Ptr<fe_type> &feThr,
+                               const ROL::Ptr<FieldHelper<Real,DeviceType>> &fieldHelper)
+    : feVel_(feVel), fePrs_(fePrs), feThr_(feThr), fieldHelper_(fieldHelper) {
+    int c = feVel_->cubPts().extent_int(0);
+    int p = feVel_->cubPts().extent_int(1);
+    int d = feVel_->cubPts().extent_int(2);
+    std::vector<Real> pt(d);
+    weight_ = scalar_view("weight",c,p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k)
+          pt[k] = (feVel_->cubPts())(i,j,k);
+        weight_(i,j) = weightFunc(pt);
+      }
+    }
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = u_coeff.extent_int(0);
+    int p = feVel_->cubPts().extent_int(1);
+    // Initialize output val
+    val = scalar_view("val", c);
+    // Get components of the control
+    std::vector<scalar_view> U;
+    fieldHelper_->splitFieldCoeff(U, u_coeff);
+    // Evaluate on FE basis
+    scalar_view valUX_eval("valUX_eval", c, p);
+    scalar_view valUY_eval("valUY_eval", c, p);
+    feVel_->evaluateValue(valUX_eval, U[0]);
+    feVel_->evaluateValue(valUY_eval, U[1]);
+    // Compute negative part of x-velocity
+    scalar_view minUX_eval("minUX_eval", c, p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j)
+        minUX_eval(i,j) = std::min(static_cast<Real>(0),valUX_eval(i,j));
+    }
+    // Multiply by weight
+    scalar_view weighted_minUX_eval("weighted_minUX_eval", c, p);
+    fst::scalarMultiplyDataData(weighted_minUX_eval,weight_,minUX_eval);
+    scalar_view weighted_valUY_eval("weighted_valUY_eval", c, p);
+    fst::scalarMultiplyDataData(weighted_valUY_eval,weight_,valUY_eval);
+    // Compute L2 norm squared
+    feVel_->computeIntegral(val,minUX_eval,weighted_minUX_eval,false);
+    feVel_->computeIntegral(val,valUY_eval,weighted_valUY_eval,true);
+
+    rst::scale(val,static_cast<Real>(0.5));
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = u_coeff.extent_int(0);
+    int fv = feVel_->N().extent_int(1);
+    int fp = fePrs_->N().extent_int(1);
+    int fh = feThr_->N().extent_int(1);
+    int p = feVel_->cubPts().extent_int(1);
+    int d = feVel_->cubPts().extent_int(2);
+    // Initialize output grad
+    std::vector<scalar_view> G(d+2);
+    for (int i = 0; i < d; ++i)
+      G[i] = scalar_view("grad", c, fv);
+    G[d] = scalar_view("grad", c, fp);
+    G[d+1] = scalar_view("grad", c, fh);
+    // Get components of the control
+    std::vector<scalar_view> U;
+    fieldHelper_->splitFieldCoeff(U, u_coeff);
+    // Evaluate on FE basis
+    scalar_view valUX_eval("valUX_eval", c, p);
+    scalar_view valUY_eval("valUY_eval", c, p);
+    feVel_->evaluateValue(valUX_eval, U[0]);
+    feVel_->evaluateValue(valUY_eval, U[1]);
+    // Compute negative part of x-velocity
+    scalar_view weighted_minUX_eval("weighted_minUX_eval", c, p);
+    scalar_view weighted_valUY_eval("weighted_valUY_eval", c, p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        weighted_minUX_eval(i,j) = weight_(i,j) * std::min(static_cast<Real>(0),valUX_eval(i,j));
+        weighted_valUY_eval(i,j) = weight_(i,j) * valUY_eval(i,j);
+      }
+    }
+    // Build local gradient of state tracking term
+    fst::integrate(G[0],weighted_minUX_eval,feVel_->NdetJ(),false);
+    fst::integrate(G[1],weighted_valUY_eval,feVel_->NdetJ(),false);
+
+    fieldHelper_->combineFieldCoeff(grad, G);
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Horizontal_ThermalFluids::gradient_2 is zero.");
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c  = z_coeff.extent_int(0);
+    int p  = feVel_->cubPts().extent_int(1);
+    int fv = feVel_->N().extent_int(1);
+    int fp = fePrs_->N().extent_int(1);
+    int fh = feThr_->N().extent_int(1);
+    int d = feVel_->cubPts().extent_int(2);
+    // Initialize output grad
+    std::vector<scalar_view> H(d+2);
+    for (int i = 0; i < d; ++i)
+      H[i] = scalar_view("hess", c, fv);
+    H[d] = scalar_view("hess", c, fp);
+    H[d+1] = scalar_view("hess", c, fh);
+    // Get components of the control
+    std::vector<scalar_view> U;
+    fieldHelper_->splitFieldCoeff(U, u_coeff);
+    std::vector<scalar_view> V;
+    fieldHelper_->splitFieldCoeff(V, v_coeff);
+    // Evaluate on FE basis
+    scalar_view valUX_eval("valUX_eval", c, p);
+    scalar_view valVX_eval("valVX_eval", c, p);
+    scalar_view valVY_eval("valVY_eval", c, p);
+    feVel_->evaluateValue(valUX_eval, U[0]);
+    feVel_->evaluateValue(valVX_eval, V[0]);
+    feVel_->evaluateValue(valVY_eval, V[1]);
+    // Compute negative part of x-velocity
+    scalar_view weighted_minVX_eval("weighted_minVX_eval", c, p);
+    scalar_view weighted_valVY_eval("weighted_valVY_eval", c, p);
+    const Real zero(0), one(1);
+    Real scale(0), uij(0);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        uij = valUX_eval(i,j);
+        scale = (uij < zero ? one : (uij > zero ? zero : one));
+        weighted_minVX_eval(i,j) = scale * weight_(i,j) * valVX_eval(i,j);
+        weighted_valVY_eval(i,j) = weight_(i,j) * valVY_eval(i,j);
+      }
+    }
+    // Build local gradient of state tracking term
+    fst::integrate(H[0],weighted_minVX_eval,feVel_->NdetJ(),false);
+    fst::integrate(H[1],weighted_valVY_eval,feVel_->NdetJ(),false);
+
+    fieldHelper_->combineFieldCoeff(hess, H);
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Horizontal_ThermalFluids::HessVec_12 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Horizontal_ThermalFluids::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Horizontal_ThermalFluids::HessVec_22 is zero.");
+  }
+
+}; // QoI_Horizontal_ThermalFluids
+
+template <class Real>
+class QoI_State_ThermalFluids : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  ROL::Ptr<QoI<Real,DeviceType>> qoi_;
+
+public:
+  QoI_State_ThermalFluids(ROL::ParameterList &parlist,
+                         const ROL::Ptr<fe_type> &feVel,
+                         const ROL::Ptr<fe_type> &fePrs,
+                         const ROL::Ptr<fe_type> &feThr,
+                         const ROL::Ptr<FieldHelper<Real,DeviceType>> &fieldHelper) {
+    std::string stateObj = parlist.sublist("Problem").get("Objective type","Vorticity");
+    if ( stateObj != "Vorticity" && stateObj != "Circulation" && stateObj != "Directional" )
+      throw Exception::NotImplemented(">>> (QoI_State_ThermalFluids): Unknown objective type."); 
+    if ( stateObj == "Vorticity" )
+      qoi_ = ROL::makePtr<QoI_Vorticity_ThermalFluids<Real,DeviceType>>(feVel,fePrs,feThr,fieldHelper,parlist);
+    else if ( stateObj == "Directional" )
+      qoi_ = ROL::makePtr<QoI_Horizontal_ThermalFluids<Real,DeviceType>>(feVel,fePrs,feThr,fieldHelper);
+    else
+      qoi_ = ROL::makePtr<QoI_Circulation_ThermalFluids<Real,DeviceType>>(feVel,fePrs,feThr,fieldHelper);
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    return qoi_->value(val, u_coeff, z_coeff, z_param);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    qoi_->gradient_1(grad, u_coeff, z_coeff, z_param);
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    qoi_->gradient_2(grad, u_coeff, z_coeff, z_param);
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    qoi_->HessVec_11(hess, v_coeff, u_coeff, z_coeff, z_param);
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    qoi_->HessVec_12(hess, v_coeff, u_coeff, z_coeff, z_param);
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    qoi_->HessVec_21(hess, v_coeff, u_coeff, z_coeff, z_param);
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    qoi_->HessVec_22(hess, v_coeff, u_coeff, z_coeff, z_param);
+  }
+
+};
+
+template <class Real>
+class QoI_L2Penalty_ThermalFluids : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type> feVel_, fePrs_, feThr_;
+  const std::vector<std::vector<ROL::Ptr<fe_type>>> feThrBdry_;
+  const std::vector<std::vector<std::vector<int>>> bdryCellLocIds_;
+  const ROL::Ptr<FieldHelper<Real,DeviceType>> fieldHelper_;
+
+  scalar_view getBoundaryCoeff(const scalar_view cell_coeff, int sideSet, int cell) const {
+    std::vector<int> bdryCellLocId = bdryCellLocIds_[sideSet][cell];
+    const int numCellsSide = bdryCellLocId.size();
+    const int f = feThr_->N().extent_int(1);
+    
+    scalar_view bdry_coeff("bdry_coeff", numCellsSide, f);
+    for (int i = 0; i < numCellsSide; ++i) {
+      for (int j = 0; j < f; ++j)
+        bdry_coeff(i, j) = cell_coeff(bdryCellLocId[i], j);
+    }
+    return bdry_coeff;
+  }
+
+public:
+  QoI_L2Penalty_ThermalFluids(const ROL::Ptr<fe_type> &feVel,
+                              const ROL::Ptr<fe_type> &fePrs,
+                              const ROL::Ptr<fe_type> &feThr,
+                              const std::vector<std::vector<ROL::Ptr<fe_type>>> &feThrBdry,
+                              const std::vector<std::vector<std::vector<int>>> &bdryCellLocIds,
+                              const ROL::Ptr<FieldHelper<Real,DeviceType>> &fieldHelper)
+    : feVel_(feVel), fePrs_(fePrs), feThr_(feThr), feThrBdry_(feThrBdry),
+      bdryCellLocIds_(bdryCellLocIds), fieldHelper_(fieldHelper) {}
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    const int c = feVel_->gradN().extent_int(0);
+    const int d = feVel_->gradN().extent_int(3);
+    // Get components of the control
+    std::vector<scalar_view> Z;
+    fieldHelper_->splitFieldCoeff(Z, z_coeff);
+    // Initialize output val
+    val = scalar_view("val", c);
+    // Compute cost integral
+    const int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        if ( i == 1 || i == 2 ) {
+          const int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            const int numCellsSide = bdryCellLocIds_[i][j].size();
+            if ( numCellsSide ) {
+              const int numCubPerSide = feThrBdry_[i][j]->cubPts().extent_int(1);
+              // Evaluate control on FE basis
+              scalar_view z_coeff_bdry = getBoundaryCoeff(Z[d+1], i, j);
+              scalar_view valZ_eval("valZ_eval", numCellsSide, numCubPerSide);
+              feThrBdry_[i][j]->evaluateValue(valZ_eval, z_coeff_bdry);
+              // Integrate cell L2 norm squared
+              scalar_view intVal("intVal", numCellsSide);
+              feThrBdry_[i][j]->computeIntegral(intVal,valZ_eval,valZ_eval,false);
+              // Add to integral value
+              for (int k = 0; k < numCellsSide; ++k) {
+                int cidx = bdryCellLocIds_[i][j][k];
+                val(cidx) += static_cast<Real>(0.5)*intVal(k);
+              }
+            }
+          }
+        }
+      }
+    }
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_L2Penalty_ThermalFluids::gradient_1 is zero.");
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fv = feVel_->gradN().extent_int(1);
+    const int fp = fePrs_->gradN().extent_int(1);
+    const int fh = feThr_->gradN().extent_int(1);
+    const int d  = feThr_->gradN().extent_int(3);
+    // Initialize output grad
+    std::vector<scalar_view> G(d+2);
+    for (int i = 0; i < d; ++i)
+      G[i] = scalar_view("grad", c, fv);
+    G[d] = scalar_view("grad", c, fp);
+    G[d+1] = scalar_view("grad", c, fh);
+    // Get components of the control
+    std::vector<scalar_view> Z;
+    fieldHelper_->splitFieldCoeff(Z, z_coeff);
+    // Compute cost integral
+    const int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        if ( i == 1 || i == 2 ) {
+          const int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            const int numCellsSide = bdryCellLocIds_[i][j].size();
+            if ( numCellsSide ) {
+              const int numCubPerSide = feThrBdry_[i][j]->cubPts().extent_int(1);
+              // Evaluate control on FE basis
+              scalar_view z_coeff_bdry = getBoundaryCoeff(Z[d+1], i, j);
+              scalar_view valZ_eval("valZ_eval", numCellsSide, numCubPerSide);
+              feThrBdry_[i][j]->evaluateValue(valZ_eval, z_coeff_bdry);
+              // Compute gradient of squared L2-norm of diff
+              scalar_view intGrad("intGrad", numCellsSide, fh);
+              fst::integrate(intGrad,valZ_eval,feThrBdry_[i][j]->NdetJ(),false);
+              // Add to integral value
+              for (int k = 0; k < numCellsSide; ++k) {
+                int cidx = bdryCellLocIds_[i][j][k];
+                for (int l = 0; l < fh; ++l)
+                  (G[d+1])(cidx,l) += intGrad(k,l);
+              }
+            }
+          }
+        }
+      }
+    }
+
+    fieldHelper_->combineFieldCoeff(grad, G);
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_L2Penalty_ThermalFluids::HessVec_11 is zero.");
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_L2Penalty_ThermalFluids::HessVec_12 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_L2Penalty_ThermalFluids::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fv = feVel_->gradN().extent_int(1);
+    const int fp = fePrs_->gradN().extent_int(1);
+    const int fh = feThr_->gradN().extent_int(1);
+    const int d  = feThr_->gradN().extent_int(3);
+    // Initialize output grad
+    std::vector<scalar_view> H(d+2);
+    for (int i = 0; i < d; ++i)
+      H[i] = scalar_view("hess", c, fv);
+    H[d] = scalar_view("hess", c, fp);
+    H[d+1] = scalar_view("hess", c, fh);
+    // Get components of the control
+    std::vector<scalar_view> V;
+    fieldHelper_->splitFieldCoeff(V, v_coeff);
+    // Compute cost integral
+    const int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        if ( i == 1 || i == 2 ) {
+          const int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            const int numCellsSide = bdryCellLocIds_[i][j].size();
+            if ( numCellsSide ) {
+              const int numCubPerSide = feThrBdry_[i][j]->cubPts().extent_int(1);
+              // Evaluate control on FE basis
+              scalar_view v_coeff_bdry = getBoundaryCoeff(V[d+1], i, j);
+              scalar_view valV_eval("valV_eval", numCellsSide, numCubPerSide);
+              feThrBdry_[i][j]->evaluateValue(valV_eval, v_coeff_bdry);
+              // Compute gradient of squared L2-norm of diff
+              scalar_view intHess("intHess", numCellsSide, fh);
+              fst::integrate(intHess,valV_eval,feThrBdry_[i][j]->NdetJ(),false);
+              // Add to integral value
+              for (int k = 0; k < numCellsSide; ++k) {
+                int cidx = bdryCellLocIds_[i][j][k];
+                for (int l = 0; l < fh; ++l)
+                  (H[d+1])(cidx,l) += intHess(k,l);
+              }
+            }
+          }
+        }
+      }
+    }
+
+    fieldHelper_->combineFieldCoeff(hess, H);
+  }
+
+}; // QoI_L2Penalty_ThermalFluids
+
+template <class Real>
+class StdObjective_ThermalFluids : public ROL::StdObjective<Real> {
+private:
+  Real alpha_;
+  std::string stateObj_;
+
+public:
+  StdObjective_ThermalFluids(ROL::ParameterList &parlist) {
+    alpha_    = parlist.sublist("Problem").get("Control penalty parameter",1.e-4);
+    stateObj_ = parlist.sublist("Problem").get("Objective type","Vorticity");
+    if ( stateObj_ != "Vorticity" && stateObj_ != "Circulation" && stateObj_ != "Directional") {
+      throw Exception::NotImplemented(">>> (StdObjective_ThermalFluids): Unknown objective type."); 
+    }
+  }
+
+  Real value(const std::vector<Real> &x, Real &tol) {
+    Real val = alpha_*x[1];
+    if ( stateObj_ == "Vorticity" || stateObj_ == "Directional" ) {
+      val += x[0];
+    }
+    else {
+      val += static_cast<Real>(0.5)*x[0]*x[0];
+    }
+    return val;
+  }
+
+  void gradient(std::vector<Real> &g, const std::vector<Real> &x, Real &tol) {
+    const Real one(1);
+    if ( stateObj_ == "Vorticity" || stateObj_ == "Directional" ) {
+      g[0] = one;
+    }
+    else {
+      g[0] = x[0];
+    }
+    g[1] = alpha_;
+  }
+
+  void hessVec(std::vector<Real> &hv, const std::vector<Real> &v, const std::vector<Real> &x, Real &tol) {
+    const Real zero(0);
+    if ( stateObj_ == "Vorticity" || stateObj_ == "Directional" ) {
+      hv[0] = zero;
+    }
+    else {
+      hv[0] = v[0];
+    }
+    hv[1] = zero;
+  }
+
+}; // OBJ_SCALAR
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/published/IMAvolumes_KouriRidzal2017/pde_thermal-fluidsK.hpp b/packages/rol/example/PDE-OPT/published/IMAvolumes_KouriRidzal2017/pde_thermal-fluidsK.hpp
new file mode 100644
index 000000000000..c0f2fea1040a
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/published/IMAvolumes_KouriRidzal2017/pde_thermal-fluidsK.hpp
@@ -0,0 +1,1239 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  pde_thermal-fluids.hpp
+    \brief Implements the local PDE interface for the Navier-Stokes control problem.
+*/
+
+#ifndef PDE_THERMALFLUIDS_EX03K_HPP
+#define PDE_THERMALFLUIDS_EX03K_HPP
+
+#include "../../TOOLS/pdeK.hpp"
+#include "../../TOOLS/feK.hpp"
+#include "../../TOOLS/fieldhelperK.hpp"
+
+#include "Intrepid2_HGRAD_QUAD_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_QUAD_C2_FEM.hpp"
+#include "Intrepid2_DefaultCubatureFactory.hpp"
+#include "Intrepid2_FunctionSpaceTools.hpp"
+#include "Intrepid2_CellTools.hpp"
+
+#include "ROL_Ptr.hpp"
+
+template <class Real,class DeviceType>
+class PDE_ThermalFluids_ex03 : public PDE<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  // Finite element basis information
+  basis_ptr basisPtrVel_, basisPtrPrs_, basisPtrThr_;
+  std::vector<basis_ptr> basisPtrs_;
+  // Cell cubature information
+  ROL::Ptr<Intrepid2::Cubature<DeviceType,Real,Real>> cellCub_, bdryCub_;
+  // Cell node information
+  scalar_view volCellNodes_;
+  std::vector<std::vector<scalar_view>> bdryCellNodes_;
+  std::vector<std::vector<std::vector<int>>> bdryCellLocIds_;
+  // Finite element definition
+  ROL::Ptr<fe_type> feVel_, fePrs_, feThr_;
+  std::vector<std::vector<ROL::Ptr<fe_type>>> feVelBdry_, fePrsBdry_, feThrBdry_;
+  // Local degrees of freedom on boundary, for each side of the reference cell (first index).
+  std::vector<std::vector<int>> fvidx_, fpidx_, fhidx_;
+  // Coordinates of degrees freedom on boundary cells.
+  // Indexing:  [sideset number][local side id](cell number, value at dof)
+  std::vector<std::vector<scalar_view>> bdryCellVDofValues_, bdryCellTDofValues_;
+  // Field pattern, offsets, etc.
+  std::vector<std::vector<int>> fieldPattern_; // local Field/DOF pattern; set from DOF manager 
+  int numFields_;                              // number of fields (equations in the PDE)
+  int numDofs_;                                // total number of degrees of freedom for all (local) fields
+  std::vector<int> offset_;                    // for each field, a counting offset
+  std::vector<int> numFieldDofs_;              // for each field, number of degrees of freedom
+  
+  // Problem parameters.
+  Real Re_, Pr_, Gr_, h_;
+  const Real grav_;
+  int Nbottom_, Nleft_, Nright_;
+  Real ReScale_, PrScale_, GrScale_, hScale_, TScale_;
+  bool pinPressure_;
+
+  ROL::Ptr<FieldHelper<Real,DeviceType>> fieldHelper_;
+
+  Real velocityDirichletFunc(const std::vector<Real> & coords, int sideset, int locSideId, int dir) const {
+    Real val(0);
+    if ((sideset==3) && (dir==1)) {
+      const Real one(1), two(2), three(3), four(4), x = coords[0];
+      if (x <= one/three)
+        val = two*(one/three - x)*x;
+      else if (x > one/three && x < two/three)
+        val = -four*(x-one/three)*(two/three-x);
+      else if (x >= two/three)
+        val = two*(x-two/three)*(one-x);
+    }
+    return val;
+  }
+
+  Real thermalDirichletFunc(const std::vector<Real> & coords, int sideset, int locSideId) const {
+    const std::vector<Real> param = PDE<Real,DeviceType>::getParameter();
+    Real val(0);
+    if (sideset==0) {
+      val = static_cast<Real>(1);
+      if (param.size()) {
+        Real root2(std::sqrt(2.0)), pi(M_PI), ln2(std::log(2.0));
+        for (int i = 0; i < Nbottom_; ++i) {
+          Real di(i+1);
+          val += TScale_ * param[i]/ln2 * (root2 * std::sin(di * pi * coords[0]))/(di * pi);
+        }
+      }
+    }
+    else if (sideset==5) {
+      val = static_cast<Real>(0);
+    }
+    return val;
+  }
+
+  void computeDirichlet(void) {
+    // Compute Dirichlet values at DOFs.
+    int d  = basisPtrVel_->getBaseCellTopology().getDimension();
+    int fv = basisPtrVel_->getCardinality();
+    int ft = basisPtrThr_->getCardinality();
+    int numSidesets = bdryCellLocIds_.size();
+    bdryCellVDofValues_.resize(numSidesets);
+    bdryCellTDofValues_.resize(numSidesets);
+    for (int i=0; i<numSidesets; ++i) {
+      int numLocSides = bdryCellLocIds_[i].size();
+      bdryCellVDofValues_[i].resize(numLocSides);
+      bdryCellTDofValues_[i].resize(numLocSides);
+      for (int j=0; j<numLocSides; ++j) {
+        int c = bdryCellLocIds_[i][j].size();
+        bdryCellVDofValues_[i][j] = scalar_view("bdryCellVDofValues", c, fv, d);
+        bdryCellTDofValues_[i][j] = scalar_view("bdryCellTDofValues", c, ft);
+        scalar_view Vcoords("Vcoords", c, fv, d);
+        scalar_view Tcoords("Tcoords", c, ft, d);
+        if (c > 0) {
+          feVel_->computeDofCoords(Vcoords, bdryCellNodes_[i][j]);
+          feThr_->computeDofCoords(Tcoords, bdryCellNodes_[i][j]);
+        }
+        for (int k=0; k<c; ++k) {
+          for (int l=0; l<fv; ++l) {
+            std::vector<Real> dofpoint(d);
+            //std::cout << "Sideset " << i << " LocalSide " << j << "  Cell " << k << "  Field " << l << "  Coord ";
+            for (int m=0; m<d; ++m) {
+              dofpoint[m] = Vcoords(k, l, m);
+              //std::cout << dofpoint[m] << "  ";
+            }
+
+            for (int m=0; m<d; ++m) {
+              (bdryCellVDofValues_[i][j])(k, l, m) = velocityDirichletFunc(dofpoint, i, j, m);
+              //std::cout << "  " << m << "-Value " << DirichletFunc(dofpoint, i, j, m);
+            }
+            //std::cout << std::endl;
+          }
+          for (int l=0; l<ft; ++l) {
+            std::vector<Real> dofpoint(d);
+            //std::cout << "Sideset " << i << " LocalSide " << j << "  Cell " << k << "  Field " << l << "  Coord ";
+            for (int m=0; m<d; ++m) {
+              dofpoint[m] = Tcoords(k, l, m);
+              //std::cout << dofpoint[m] << "  ";
+            }
+
+            (bdryCellTDofValues_[i][j])(k, l) = thermalDirichletFunc(dofpoint, i, j);
+            //std::cout << "    Value " << DirichletFunc(dofpoint, i, j);
+            //std::cout << std::endl;
+          }
+        }
+      }
+    }
+  }
+
+  Real ReynoldsNumber(void) const {
+    const std::vector<Real> param = PDE<Real,DeviceType>::getParameter();
+    Real val = Re_;
+    if (param.size()) {
+      int offset = Nbottom_ + Nright_ + Nleft_;
+      val /= (static_cast<Real>(1) + ReScale_*param[offset]);
+    }
+    return val;
+  }
+
+  Real PrandtlNumber(void) const {
+    const std::vector<Real> param = PDE<Real,DeviceType>::getParameter();
+    Real val = Pr_;
+    if (param.size()) {
+      int offset = Nbottom_ + Nright_ + Nleft_;
+      Real one(1);
+      val *= (one + ReScale_*param[offset])/(one + PrScale_*param[offset+1]);
+    }
+    return val;
+  }
+
+  Real GrashofNumber(void) const {
+    const std::vector<Real> param = PDE<Real,DeviceType>::getParameter();
+    Real val = Gr_;
+    if (param.size()) {
+      int offset = Nbottom_ + Nright_ + Nleft_;
+      Real one(1), muscale = one + ReScale_*param[offset];
+      val *= (one + GrScale_*param[offset+2])/(muscale*muscale);
+    }
+    return val;
+  }
+  
+  Real ThicknessNumber(const std::vector<Real> &x, const int sideset) const {
+    const std::vector<Real> param = PDE<Real,DeviceType>::getParameter();
+    Real val = h_;
+    if ( param.size()) {
+      if ( sideset == 2 ) {
+        int offset = Nbottom_;
+        Real root2(std::sqrt(2.0)), pi(M_PI), ln2(std::log(2.0));
+        for (int i = 0; i < Nleft_; ++i) {
+          Real di(i+1);
+          val += hScale_ * param[offset + i]/ln2 * (root2 * std::sin(di * pi * x[1]))/(di * pi);
+        }
+      }
+      else if ( sideset == 1 ) {
+        int offset = Nbottom_ + Nleft_;
+        Real root2(std::sqrt(2.0)), pi(M_PI), ln2(std::log(2.0));
+        for (int i = 0; i < Nright_; ++i) {
+          Real di(i+1);
+          val += hScale_ * param[offset + i]/ln2 * (root2 * std::sin(di * pi * x[1]))/(di * pi);
+        }
+      }
+    }
+    return val;
+  }
+
+  Real viscosityFunc(const std::vector<Real> &x) const {
+    Real ReynoldsNum = ReynoldsNumber();
+    return static_cast<Real>(1)/ReynoldsNum;
+  }
+
+  Real conductivityFunc(const std::vector<Real> &x) const {
+    Real ReynoldsNum = ReynoldsNumber();
+    Real PrandtlNum = PrandtlNumber();
+    return static_cast<Real>(1)/(ReynoldsNum*PrandtlNum);
+  }
+
+  Real gravityFunc(const std::vector<Real> &x) const {
+    Real ReynoldsNum = ReynoldsNumber();
+    Real GrashofNum = GrashofNumber();
+    return grav_*GrashofNum/(ReynoldsNum*ReynoldsNum);
+  }
+
+  Real thermalRobinFunc(const std::vector<Real> &x, const int sideset) const {
+    return ThicknessNumber(x, sideset) * conductivityFunc(x);
+  }
+
+  void computeCoefficients(scalar_view &nu,
+                           scalar_view &grav,
+                           scalar_view &kappa) const {
+    int c = feVel_->gradN().extent_int(0);
+    int p = feVel_->gradN().extent_int(2);
+    int d = feVel_->gradN().extent_int(3);
+    std::vector<Real> pt(d);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for ( int k = 0; k < d; ++k)
+          pt[k] = (feVel_->cubPts())(i,j,k);
+        // Compute spatially dependent coefficients
+        nu(i,j)    = viscosityFunc(pt);
+        grav(i,j)  = gravityFunc(pt);
+        kappa(i,j) = conductivityFunc(pt);
+      }
+    }
+  }
+
+  void computeThermalRobin(scalar_view &robin,
+                           const scalar_view u,
+                           const scalar_view z,
+                           const int sideset,
+                           const int locSideId,
+                           const int deriv = 0,
+                           const int component = 1) const {
+    const int c = u.extent_int(0);
+    const int p = u.extent_int(1);
+    const int d = feThr_->gradN().extent_int(3);
+    std::vector<Real> x(d);
+    Real h(0);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k)
+          x[k] = (feThrBdry_[sideset][locSideId]->cubPts())(i,j,k);
+        h = thermalRobinFunc(x, sideset);
+        if ( deriv == 0 )
+          robin(i,j) = h * (u(i,j) - z(i,j));
+        else if ( deriv == 1 )
+          robin(i,j) = ((component==0) ? h : -h);
+        else
+          robin(i,j) = static_cast<Real>(0);
+      }
+    }
+  }
+
+  scalar_view getThermalBoundaryCoeff(const scalar_view cell_coeff, int sideSet, int cell) const {
+    std::vector<int> bdryCellLocId = bdryCellLocIds_[sideSet][cell];
+    const int numCellsSide = bdryCellLocId.size();
+    const int f = basisPtrThr_->getCardinality();
+    
+    scalar_view bdry_coeff("bdry_coeff", numCellsSide, f);
+    for (int i = 0; i < numCellsSide; ++i) {
+      for (int j = 0; j < f; ++j)
+        bdry_coeff(i, j) = cell_coeff(bdryCellLocId[i], j);
+    }
+    return bdry_coeff;
+  }
+
+public:
+  PDE_ThermalFluids_ex03(ROL::ParameterList &parlist) : grav_(-1) {
+    // Finite element fields -- NOT DIMENSION INDEPENDENT!
+    basisPtrVel_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C2_FEM<DeviceType,Real,Real>>();
+    basisPtrPrs_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C1_FEM<DeviceType,Real,Real>>();
+    basisPtrThr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C2_FEM<DeviceType,Real,Real>>();
+    // Volume quadrature rules.
+    shards::CellTopology cellType = basisPtrVel_->getBaseCellTopology();     // get the cell type from any basis
+    Intrepid2::DefaultCubatureFactory cubFactory;                            // create cubature factory
+    int cubDegree = parlist.sublist("Problem").get("Cubature Degree", 4);    // set cubature degree, e.g., 4
+    cellCub_ = cubFactory.create<DeviceType,Real,Real>(cellType, cubDegree); // create default cubature
+    // Boundary quadrature rules.
+    int d = cellType.getDimension();
+    shards::CellTopology bdryCellType = cellType.getCellTopologyData(d-1, 0);
+    int bdryCubDegree = parlist.sublist("Problem").get("Boundary Cubature Degree",4); // set cubature degree, e.g., 4
+    bdryCub_ = cubFactory.create<DeviceType,Real,Real>(bdryCellType, bdryCubDegree);
+    // Fill finite element basis container
+    basisPtrs_.clear();
+    basisPtrs_.resize(d,basisPtrVel_);  // Velocity
+    basisPtrs_.push_back(basisPtrPrs_); // Pressure
+    basisPtrs_.push_back(basisPtrThr_); // Temperature
+
+    // Other problem parameters.
+    Re_      = parlist.sublist("Problem").get("Reynolds Number",    200.0);
+    Pr_      = parlist.sublist("Problem").get("Prandtl Number",     0.72);
+    Gr_      = parlist.sublist("Problem").get("Grashof Number",     40000.0);
+    h_       = parlist.sublist("Problem").get("Robin Coefficient",  1.0);
+    // Stochastic Expansion Information.
+    Nbottom_ = parlist.sublist("Problem").get("Bottom KL Truncation Order",5);
+    Nleft_   = parlist.sublist("Problem").get("Left KL Truncation Order",5);
+    Nright_  = parlist.sublist("Problem").get("Right KL Truncation Order",5);
+    // Stochastic scales
+    ReScale_ = parlist.sublist("Problem").get("Reynolds Number Noise Scale",0.05);
+    PrScale_ = parlist.sublist("Problem").get("Prandtl Number Noise Scale",0.05);
+    GrScale_ = parlist.sublist("Problem").get("Grashof Number Noise Scale",0.05);
+    hScale_  = parlist.sublist("Problem").get("Robin Noise Scale",0.2);
+    TScale_  = parlist.sublist("Problem").get("Bottom Temperature Noise Scale",0.2);
+    // Pin pressure
+    pinPressure_ = parlist.sublist("Problem").get("Pin Pressure",true);
+
+    numDofs_ = 0;
+    numFields_ = basisPtrs_.size();
+    offset_.resize(numFields_);
+    numFieldDofs_.resize(numFields_);
+    for (int i=0; i<numFields_; ++i) {
+      if (i==0)
+        offset_[i] = 0;
+      else
+        offset_[i] = offset_[i-1] + basisPtrs_[i-1]->getCardinality();
+      numFieldDofs_[i] = basisPtrs_[i]->getCardinality();
+      numDofs_ += numFieldDofs_[i];
+    }
+  }
+
+  void residual(scalar_view & res,
+                const scalar_view u_coeff,
+                const scalar_view z_coeff = scalar_view(),
+                const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    int c  = u_coeff.extent_int(0);
+    int p  = cellCub_->getNumPoints();
+    int fv = basisPtrVel_->getCardinality();
+    int fp = basisPtrPrs_->getCardinality();
+    int fh = basisPtrThr_->getCardinality();
+    int d  = cellCub_->getDimension();
+ 
+    // Initialize residuals.
+    std::vector<scalar_view> R(d+2);
+    for (int i = 0; i < d; ++i)
+      R[i] = scalar_view("res", c, fv);
+    R[d]   = scalar_view("res", c, fp);
+    R[d+1] = scalar_view("res", c, fh);
+
+    // Split u_coeff into components.
+    std::vector<scalar_view> U, Z;
+    fieldHelper_->splitFieldCoeff(U, u_coeff);
+    fieldHelper_->splitFieldCoeff(Z, z_coeff);
+
+    // Evaluate problem coefficients at quadrature points.
+    scalar_view nu("nu", c, p);
+    scalar_view grav("grav", c, p);
+    scalar_view kappa("kappa", c, p);
+    computeCoefficients(nu,grav,kappa);
+
+    // Evaluate/interpolate finite element fields on cells.
+    std::vector<scalar_view> valVel_vec(d);
+    for (int i = 0; i < d; ++i) {
+      valVel_vec[i] = scalar_view("valVel_vec", c, p);
+      feVel_->evaluateValue(valVel_vec[i], U[i]);
+    }
+    scalar_view valPres_eval("valPres_eval", c, p);
+    scalar_view valHeat_eval("valHeat_eval", c, p);
+    fePrs_->evaluateValue(valPres_eval, U[d]);
+    feThr_->evaluateValue(valHeat_eval, U[d+1]);
+    // Evaluate/interpolate gradient of finite element fields on cells.
+    std::vector<scalar_view> gradVel_vec(d);
+    for (int i = 0; i < d; ++i) {
+      gradVel_vec[i] = scalar_view("gradVel_vec", c, p, d);
+      feVel_->evaluateGradient(gradVel_vec[i], U[i]);
+    }
+    scalar_view gradHeat_eval("gradHeat_eval", c, p, d);
+    feThr_->evaluateGradient(gradHeat_eval, U[d+1]);
+
+    // Assemble the velocity vector and its divergence.
+    scalar_view valVel_eval("valVel_eval", c, p, d);
+    scalar_view divVel_eval("divVel_eval", c, p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        divVel_eval(i,j) = static_cast<Real>(0);
+        for (int k = 0; k < d; ++k) {
+          valVel_eval(i,j,k) = (valVel_vec[k])(i,j);
+          divVel_eval(i,j)  += (gradVel_vec[k])(i,j,k);
+        }
+      }
+    }
+
+    /**************************************************************************/
+    /*** NAVIER STOKES ********************************************************/
+    /**************************************************************************/
+    std::vector<scalar_view> nuGradVel_vec(d), valVelDotgradVel_vec(d);
+    for (int i = 0; i < d; ++i) {
+      // Multiply velocity gradients with viscosity.
+      nuGradVel_vec[i] = scalar_view("nuGradVel_vec", c, p, d);
+      fst::tensorMultiplyDataData(nuGradVel_vec[i], nu, gradVel_vec[i]);
+      // Compute nonlinear terms in the Navier-Stokes equations.
+      valVelDotgradVel_vec[i] = scalar_view("valVelDotgradVel_vec", c, p);
+      fst::dotMultiplyDataData(valVelDotgradVel_vec[i], valVel_eval, gradVel_vec[i]);
+    }
+    // Negative pressure
+    rst::scale(valPres_eval,static_cast<Real>(-1));
+    // Compute gravitational force.
+    scalar_view gravForce("gravForce", c, p);
+    fst::scalarMultiplyDataData(gravForce, grav, valHeat_eval);
+
+    /**************************************************************************/
+    /*** THERMAL **************************************************************/
+    /**************************************************************************/
+    scalar_view kappaGradHeat_eval("kappaGradHeat_eval", c, p, d);
+    scalar_view velDotGradHeat_eval("velDotGradHeat_eval", c, p);
+    // Multiply temperature gradient with conductivity
+    fst::tensorMultiplyDataData(kappaGradHeat_eval, kappa, gradHeat_eval);
+    // Dot multiply scaled velocity with temperature gradient
+    fst::dotMultiplyDataData(velDotGradHeat_eval, valVel_eval, gradHeat_eval);
+
+    /**************************************************************************/
+    /*** EVALUATE WEAK FORM OF RESIDUAL ***************************************/
+    /**************************************************************************/
+    // Velocity equation.
+    for (int i = 0; i < d; ++i) {
+      fst::integrate(R[i],nuGradVel_vec[i],feVel_->gradNdetJ(),false);
+      fst::integrate(R[i],valVelDotgradVel_vec[i],feVel_->NdetJ(),true);
+      fst::integrate(R[i],valPres_eval,feVel_->DNDdetJ(i),true);
+    }
+    // Apply gravitational force.
+    fst::integrate(R[d-1],gravForce,feVel_->NdetJ(),true);
+    // Pressure equation.
+    fst::integrate(R[d],divVel_eval,fePrs_->NdetJ(),false);
+    rst::scale(R[d],static_cast<Real>(-1));
+    // Heat equation.
+    fst::integrate(R[d+1],kappaGradHeat_eval,feThr_->gradNdetJ(),false);
+    fst::integrate(R[d+1],velDotGradHeat_eval,feThr_->NdetJ(),true);
+
+    /**************************************************************************/
+    /*** APPLY BOUNDARY CONDITIONS ********************************************/
+    /**************************************************************************/
+    // -->        Control boundaries: i=1,2
+    // -->        No slip boundaries: i=0,1,2
+    // --> Outflow/Inflow boundaries: i=3
+    // -->              Pressure Pin: i=4
+    int numSideSets = bdryCellLocIds_.size();
+    const int numCubPerSide = bdryCub_->getNumPoints();
+    if (numSideSets > 0) {
+      // ROBIN CONDITIONS
+      for (int i = 0; i < numSideSets; ++i) {
+        // Control boundaries
+        if (i==1 || i==2) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            if (numCellsSide) {
+              // Get U and Z coefficients on Robin boundary
+              scalar_view u_coeff_bdry = getThermalBoundaryCoeff(U[d+1], i, j);
+              scalar_view z_coeff_bdry = getThermalBoundaryCoeff(Z[d+1], i, j);
+              // Evaluate U and Z on FE basis
+              scalar_view valU_eval_bdry("valU_eval_bdry", numCellsSide, numCubPerSide);
+              scalar_view valZ_eval_bdry("valZ_eval_bdry", numCellsSide, numCubPerSide);
+              feThrBdry_[i][j]->evaluateValue(valU_eval_bdry, u_coeff_bdry);
+              feThrBdry_[i][j]->evaluateValue(valZ_eval_bdry, z_coeff_bdry);
+              // Compute Stefan-Boltzmann residual
+              scalar_view robinVal("robinVal", numCellsSide, numCubPerSide);
+              computeThermalRobin(robinVal,valU_eval_bdry,valZ_eval_bdry,i,j,0);
+              // Evaluate boundary integral
+              scalar_view robinRes("robinRes", numCellsSide, fh);
+              fst::integrate(robinRes,robinVal,feThrBdry_[i][j]->NdetJ(),false);
+              for (int k = 0; k < numCellsSide; ++k) {
+                int cidx = bdryCellLocIds_[i][j][k];
+                // Thermal boundary conditions.
+                for (int l = 0; l < fh; ++l)
+                  (R[d+1])(cidx,l) += robinRes(k,l);
+              }
+            }
+          }
+        }
+      }
+      // DIRICHLET CONDITIONS
+      computeDirichlet();
+      // Velocity Boundary Conditions
+      for (int i = 0; i < numSideSets; ++i) {
+        if (i<4) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numVBdryDofs = fvidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numVBdryDofs; ++l) {
+                for (int m = 0; m < d; ++m)
+                  (R[m])(cidx,fvidx_[j][l]) = (U[m])(cidx,fvidx_[j][l]) - (bdryCellVDofValues_[i][j])(k,fvidx_[j][l],m);
+              }
+            }
+          }
+        }
+        // Thermal Boundary Conditions
+        if ( (i==0) || (i==5) ) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();   // Number of sides per cell
+          for (int j = 0; j < numLocalSideIds; ++j) {        // Loop over sides of cell: Quad = {0, 1, 2, 3}
+            int numCellsSide = bdryCellLocIds_[i][j].size(); // Number of cells with side j
+            int numHBdryDofs = fhidx_[j].size();             // Number of thermal boundary DOFs
+            for (int k = 0; k < numCellsSide; ++k) {         // Loop over cells with side j
+              int cidx = bdryCellLocIds_[i][j][k];           // Cell index
+              for (int l = 0; l < numHBdryDofs; ++l)         // Loop over all fields of cell k on side j
+                (R[d+1])(cidx,fhidx_[j][l]) = (U[d+1])(cidx,fhidx_[j][l]) - (bdryCellTDofValues_[i][j])(k,fhidx_[j][l]);
+            }
+          }
+        }
+        // Pressure pinning
+        if (i==7 && pinPressure_) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int l = 0, cidx = bdryCellLocIds_[i][j][k];
+              (R[d])(cidx,fpidx_[j][l]) = (U[d])(cidx,fpidx_[j][l]) - static_cast<Real>(0);
+            }
+          }
+        }
+      }
+    }
+
+    // Combine the residuals.
+    fieldHelper_->combineFieldCoeff(res, R);
+  }
+
+  void Jacobian_1(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    int c  = u_coeff.extent_int(0);
+    int p  = cellCub_->getNumPoints();
+    int fv = basisPtrVel_->getCardinality();
+    int fp = basisPtrPrs_->getCardinality();
+    int fh = basisPtrThr_->getCardinality();
+    int d  = cellCub_->getDimension();
+ 
+    // Initialize jacobians.
+    std::vector<std::vector<scalar_view>> J(d+2);
+    for (int i = 0; i < d+2; ++i) J[i].resize(d+2);
+    for (int i = 0; i < d; ++i) {
+      for (int j = 0; j < d; ++j)
+        J[i][j] = scalar_view("jac", c, fv, fv);
+      J[d][i]   = scalar_view("jac", c, fp, fv);
+      J[i][d]   = scalar_view("jac", c, fv, fp);
+      J[d+1][i] = scalar_view("jac", c, fh, fv);
+      J[i][d+1] = scalar_view("jac", c, fv, fh);
+    }
+    J[d][d]     = scalar_view("jac", c, fp, fp);
+    J[d+1][d]   = scalar_view("jac", c, fh, fp);
+    J[d][d+1]   = scalar_view("jac", c, fp, fh);
+    J[d+1][d+1] = scalar_view("jac", c, fh, fh);
+
+    // Split u_coeff into components.
+    std::vector<scalar_view> U, Z;
+    fieldHelper_->splitFieldCoeff(U, u_coeff);
+    fieldHelper_->splitFieldCoeff(Z, z_coeff);
+
+    // Evaluate problem coefficients at quadrature points.
+    scalar_view nu("nu", c, p);
+    scalar_view grav("grav", c, p);
+    scalar_view kappa("kappa", c, p);
+    computeCoefficients(nu,grav,kappa);
+
+    // Evaluate/interpolate finite element fields on cells.
+    std::vector<scalar_view> valVel_vec(d);
+    for (int i = 0; i < d; ++i) {
+      valVel_vec[i] = scalar_view("valVel_vec", c, p);
+      feVel_->evaluateValue(valVel_vec[i], U[i]);
+    }
+    // Evaluate/interpolate gradient of finite element fields on cells.
+    std::vector<scalar_view> gradVel_vec(d);
+    for (int i = 0; i < d; ++i) {
+      gradVel_vec[i] = scalar_view("gradVel_vec", c, p, d);
+      feVel_->evaluateGradient(gradVel_vec[i], U[i]);
+    }
+    scalar_view gradHeat_eval("gradHeat_eval", c, p, d);
+    feThr_->evaluateGradient(gradHeat_eval, U[d+1]);
+
+    // Assemble the velocity vector and its divergence.
+    scalar_view valVel_eval("valVel_eval", c, p, d);
+    std::vector<std::vector<scalar_view>> dVel_vec(d);
+    std::vector<scalar_view> gradHeat_vec(d);
+    for (int k = 0; k < d; ++k) {
+      dVel_vec[k].resize(d);
+      for (int l = 0; l < d; ++l) {
+        dVel_vec[k][l] = scalar_view("dVel_vec", c, p);
+        for (int i = 0; i < c; ++i) {
+          for (int j = 0; j < p; ++j)
+            (dVel_vec[k][l])(i,j) = (gradVel_vec[k])(i,j,l);
+        }
+      }
+      gradHeat_vec[k] = scalar_view("gradHeat_vec", c, p);
+      for (int i = 0; i < c; ++i) {
+        for (int j = 0; j < p; ++j) {
+          valVel_eval(i,j,k)  = (valVel_vec[k])(i,j);
+          (gradHeat_vec[k])(i,j) = gradHeat_eval(i,j,k);
+        }
+      }
+    }
+
+    /**************************************************************************/
+    /*** NAVIER STOKES ********************************************************/
+    /**************************************************************************/
+    std::vector<std::vector<scalar_view>> dVelPhi_vec(d);
+    for (int i = 0; i < d; ++i) {
+      // Compute nonlinear terms in the Navier-Stokes equations.
+      dVelPhi_vec[i].resize(d);
+      for (int j = 0; j < d; ++j) {
+        dVelPhi_vec[i][j] = scalar_view("dVelPhi_vec", c, fv, p);
+        fst::scalarMultiplyDataField(dVelPhi_vec[i][j], dVel_vec[i][j], feVel_->N());
+      }
+    }
+    // Multiply velocity gradients with viscosity.
+    scalar_view nuGradPhi_eval("nuGradPhi_eval", c, fv, p, d);
+    fst::tensorMultiplyDataField(nuGradPhi_eval, nu, feVel_->gradN());
+    // Compute nonlinear terms in the Navier-Stokes equations.
+    scalar_view valVelDotgradPhi_eval("valVelDotgradPhi_eval", c, fv, p);
+    fst::dotMultiplyDataField(valVelDotgradPhi_eval, valVel_eval, feVel_->gradN());
+    // Negative pressure basis.
+    scalar_view negPrsPhi(c,fp,p);
+    Kokkos::deep_copy(negPrsPhi,fePrs_->N());
+    rst::scale(negPrsPhi,static_cast<Real>(-1));
+    // Compute gravity Jacobian
+    scalar_view gravPhi("gravPhi", c, fh, p);
+    fst::scalarMultiplyDataField(gravPhi, grav, feThr_->N());
+
+    /**************************************************************************/
+    /*** THERMAL **************************************************************/
+    /**************************************************************************/
+    scalar_view kappaGradPhi("kappaGradPhi", c, fh, p, d);
+    scalar_view VelPhi("VelPhi", c, fh, p);
+    std::vector<scalar_view> dHeatPhi(d);
+    // Compute kappa times gradient of basis.
+    fst::tensorMultiplyDataField(kappaGradPhi, kappa, feThr_->gradN());
+    // Compute scaled velocity.
+    fst::dotMultiplyDataField(VelPhi, valVel_eval, feThr_->gradN());
+    // Thermal-velocity Jacobians.
+    for (int i = 0; i < d; ++i) {
+      dHeatPhi[i] = scalar_view("dHeatPhi", c, fh, p);
+      fst::scalarMultiplyDataField(dHeatPhi[i], gradHeat_vec[i], feThr_->N());
+    }
+
+    /*** Evaluate weak form of the Jacobian. ***/
+    // X component of velocity equation.
+    for (int i = 0; i < d; ++i) {
+      // Velocity components
+      for (int j = 0; j < d; ++j)
+        fst::integrate(J[i][j],feVel_->NdetJ(),dVelPhi_vec[i][j],false);
+      fst::integrate(J[i][i],nuGradPhi_eval,feVel_->gradNdetJ(),true);
+      fst::integrate(J[i][i],feVel_->NdetJ(),valVelDotgradPhi_eval,true);
+      // Pressure components
+      fst::integrate(J[i][d],feVel_->DNDdetJ(i),negPrsPhi,false);
+      fst::integrate(J[d][i],fePrs_->NdetJ(),feVel_->DND(i),false);
+      rst::scale(J[d][i],static_cast<Real>(-1));
+      // Thermal components
+      fst::integrate(J[d+1][i],dHeatPhi[i],feVel_->NdetJ(),false);
+    }
+    // Gravitational component
+    fst::integrate(J[d-1][d+1],feVel_->NdetJ(),gravPhi,false);
+    // Thermal components
+    fst::integrate(J[d+1][d+1],kappaGradPhi,feThr_->gradNdetJ(),false);
+    fst::integrate(J[d+1][d+1],feThr_->NdetJ(),VelPhi,true);
+
+    // APPLY BOUNDARY CONDITIONS
+    int numSideSets = bdryCellLocIds_.size();
+    const int numCubPerSide = bdryCub_->getNumPoints();
+    if (numSideSets > 0) {
+      // ROBIN CONDITIONS
+      for (int i = 0; i < numSideSets; ++i) {
+        // Control boundaries
+        if (i==1 || i==2) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            if (numCellsSide) {
+              // Get U and Z coefficients on Robin boundary
+              scalar_view u_coeff_bdry = getThermalBoundaryCoeff(U[d+1], i, j);
+              scalar_view z_coeff_bdry = getThermalBoundaryCoeff(Z[d+1], i, j);
+              // Evaluate U and Z on FE basis
+              scalar_view valU_eval_bdry("valU_eval_bdry", numCellsSide, numCubPerSide);
+              scalar_view valZ_eval_bdry("valZ_eval_bdry", numCellsSide, numCubPerSide);
+              feThrBdry_[i][j]->evaluateValue(valU_eval_bdry, u_coeff_bdry);
+              feThrBdry_[i][j]->evaluateValue(valZ_eval_bdry, z_coeff_bdry);
+              // Compute Stefan-Boltzmann residual
+              ROL::Ptr< Intrepid::FieldContainer<Real>> robinVal1
+              scalar_view robinVal1("robinVal1", numCellsSide, numCubPerSide);
+              scalar_view robinVal("robinVal", numCellsSide, fh, numCubPerSide);
+              computeThermalRobin(robinVal1,valU_eval_bdry,valZ_eval_bdry,i,j,1,0);
+              fst::scalarMultiplyDataField(robinVal,robinVal1,feThrBdry_[i][j]->N());
+              // Evaluate boundary integral
+              scalar_view robinJac("robinJac", numCellsSide, fh, fh);
+              fst::integrate(robinJac,robinVal,feThrBdry_[i][j]->NdetJ(),false);
+              for (int k = 0; k < numCellsSide; ++k) {
+                int cidx = bdryCellLocIds_[i][j][k];
+                for (int l = 0; l < fh; ++l) {
+                  for (int m = 0; m < fh; ++m)
+                    (J[d+1][d+1])(cidx,l,m) += robinJac(k,l,m);
+                }
+              }
+            }
+          }
+        }
+      }
+      // DIRICHLET CONDITIONS
+      for (int i = 0; i < numSideSets; ++i) {
+        // Velocity Boundary Conditions
+        if (i<4) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numVBdryDofs = fvidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numVBdryDofs; ++l) {
+                for (int m = 0; m < fv; ++m) {
+                  for (int n = 0; n < d; ++n) {
+                    for (int p = 0; p < d; ++p)
+                      (J[n][p])(cidx,fvidx_[j][l],m) = static_cast<Real>(0);
+                    (J[n][n])(cidx,fvidx_[j][l],fvidx_[j][l]) = static_cast<Real>(1);
+                  }
+                }
+                for (int m = 0; m < d; ++m) {
+                  for (int n = 0; n < fp; ++n)
+                    (J[m][d])(cidx,fvidx_[j][l],n) = static_cast<Real>(0);
+                  for (int n = 0; n < fh; ++n)
+                    (J[m][d+1])(cidx,fvidx_[j][l],n) = static_cast<Real>(0);
+                }
+              }
+            }
+          }
+        }
+        // Thermal boundary conditions
+        if ( (i==0) || (i==5) ) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numHBdryDofs = fhidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numHBdryDofs; ++l) {
+                for (int m = 0; m < fv; ++m) {
+                  for (int n = 0; n < d; ++n)
+                    (J[d+1][n])(cidx,fhidx_[j][l],m) = static_cast<Real>(0);
+                }
+                for (int m = 0; m < fp; ++m)
+                  (J[d+1][d])(cidx,fhidx_[j][l],m) = static_cast<Real>(0);
+                for (int m = 0; m < fh; ++m)
+                  (J[d+1][d+1])(cidx,fhidx_[j][l],m) = static_cast<Real>(0);
+                (J[d+1][d+1])(cidx,fhidx_[j][l],fhidx_[j][l]) = static_cast<Real>(1);
+              }
+            }
+          }
+        }
+        // Pressure pinning
+        if (i==7 && pinPressure_) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int l = 0, cidx = bdryCellLocIds_[i][j][k];
+              for (int m = 0; m < fv; ++m) {
+                for (int n = 0; n < d; ++n)
+                  (J[d][n])(cidx,fpidx_[j][l],m) = static_cast<Real>(0);
+              }
+              for (int m = 0; m < fp; ++m)
+                (J[d][d])(cidx,fpidx_[j][l],m) = static_cast<Real>(0);
+              (J[d][d])(cidx,fpidx_[j][l],fpidx_[j][l]) = static_cast<Real>(1);
+              for (int m = 0; m < fh; ++m)
+                (J[d][d+1])(cidx,fpidx_[j][l],m) = static_cast<Real>(0);
+            }
+          }
+        }
+      }
+    }
+
+    // Combine the jacobians.
+    fieldHelper_->combineFieldCoeff(jac, J);
+  }
+
+
+  void Jacobian_2(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    int c  = u_coeff.extent_int(0);
+    int fv = basisPtrVel_->getCardinality();
+    int fp = basisPtrPrs_->getCardinality();
+    int fh = basisPtrThr_->getCardinality();
+    int d  = cellCub_->getDimension();
+ 
+    // Initialize jacobians.
+    std::vector<std::vector<scalar_view>> J(d+2);
+    for (int i = 0; i < d+2; ++i) J[i].resize(d+2);
+    for (int i = 0; i < d; ++i) {
+      for (int j = 0; j < d; ++j)
+        J[i][j] = scalar_view("jac", c, fv, fv);
+      J[d][i]   = scalar_view("jac", c, fp, fv);
+      J[i][d]   = scalar_view("jac", c, fv, fp);
+      J[d+1][i] = scalar_view("jac", c, fh, fv);
+      J[i][d+1] = scalar_view("jac", c, fv, fh);
+    }
+    J[d][d]     = scalar_view("jac", c, fp, fp);
+    J[d+1][d]   = scalar_view("jac", c, fh, fp);
+    J[d][d+1]   = scalar_view("jac", c, fp, fh);
+    J[d+1][d+1] = scalar_view("jac", c, fh, fh);
+
+    // Split u_coeff into components.
+    std::vector<scalar_view> U, Z;
+    fieldHelper_->splitFieldCoeff(U, u_coeff);
+    fieldHelper_->splitFieldCoeff(Z, z_coeff);
+
+    // APPLY DIRICHLET CONDITIONS
+    int numSideSets = bdryCellLocIds_.size();
+    const int numCubPerSide = bdryCub_->getNumPoints();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        // Control boundaries
+        if (i==1 || i==2) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            if (numCellsSide) {
+              // Get U and Z coefficients on Robin boundary
+              scalar_view u_coeff_bdry = getThermalBoundaryCoeff(U[d+1], i, j);
+              scalar_view z_coeff_bdry = getThermalBoundaryCoeff(Z[d+1], i, j);
+              // Evaluate U and Z on FE basis
+              scalar_view valU_eval_bdry("valU_eval_bdry", numCellsSide, numCubPerSide);
+              scalar_view valZ_eval_bdry("valZ_eval_bdry", numCellsSide, numCubPerSide);
+              feThrBdry_[i][j]->evaluateValue(valU_eval_bdry, u_coeff_bdry);
+              feThrBdry_[i][j]->evaluateValue(valZ_eval_bdry, z_coeff_bdry);
+              // Compute Stefan-Boltzmann residual
+              scalar_view robinVal1("robinVal1", numCellsSide, numCubPerSide);
+              scalar_view robinVal("robinVal", numCellsSide, fh, numCubPerSide);
+              computeThermalRobin(robinVal1,valU_eval_bdry,valZ_eval_bdry,i,j,1,1);
+              fst::scalarMultiplyDataField(robinVal,robinVal1,feThrBdry_[i][j]->N());
+              // Evaluate boundary integral
+              scalar_view robinJac("robinJac", numCellsSide, fh, fh);
+              fst::integrate(robinJac,robinVal,feThrBdry_[i][j]->NdetJ(),false);
+              for (int k = 0; k < numCellsSide; ++k) {
+                int cidx = bdryCellLocIds_[i][j][k];
+                for (int l = 0; l < fh; ++l) {
+                  for (int m = 0; m < fh; ++m)
+                    (J[d+1][d+1])(cidx,l,m) += robinJac(k,l,m);
+                }
+              }
+            }
+          }
+        }
+      }
+      // DIRICHLET CONDITIONS
+      for (int i = 0; i < numSideSets; ++i) {
+        // Velocity Boundary Conditions
+        if (i<4) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numVBdryDofs = fvidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numVBdryDofs; ++l) {
+                for (int m = 0; m < fv; ++m) {
+                  for (int n = 0; n < d; ++n) {
+                    for (int p = 0; p < d; ++p)
+                      (J[n][p])(cidx,fvidx_[j][l],m) = static_cast<Real>(0);
+                  }
+                }
+                for (int m = 0; m < d; ++m) {
+                  for (int n = 0; n < fp; ++n)
+                    (J[m][d])(cidx,fvidx_[j][l],n) = static_cast<Real>(0);
+                  for (int n = 0; n < fh; ++n)
+                    (J[m][d+1])(cidx,fvidx_[j][l],n) = static_cast<Real>(0);
+                }
+              }
+            }
+          }
+        }
+        // Thermal boundary conditions
+        if ( (i==0) || (i==5) ) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numHBdryDofs = fhidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numHBdryDofs; ++l) {
+                for (int m = 0; m < fv; ++m) {
+                  for (int n = 0; n < d; ++n)
+                    (J[d+1][n])(cidx,fhidx_[j][l],m) = static_cast<Real>(0);
+                }
+                for (int m = 0; m < fp; ++m)
+                  (J[d+1][d])(cidx,fhidx_[j][l],m) = static_cast<Real>(0);
+                for (int m = 0; m < fh; ++m)
+                  (J[d+1][d+1])(cidx,fhidx_[j][l],m) = static_cast<Real>(0);
+              }
+            }
+          }
+        }
+        // Pressure pinning
+        if (i==7 && pinPressure_) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int l = 0, cidx = bdryCellLocIds_[i][j][k];
+              for (int m = 0; m < fv; ++m) {
+                for (int n = 0; n < d; ++n)
+                  (J[d][n])(cidx,fpidx_[j][l],m) = static_cast<Real>(0);
+              }
+              for (int m = 0; m < fp; ++m)
+                (J[d][d])(cidx,fpidx_[j][l],m) = static_cast<Real>(0);
+              for (int m = 0; m < fh; ++m)
+                (J[d][d+1])(cidx,fpidx_[j][l],m) = static_cast<Real>(0);
+            }
+          }
+        }
+      }
+    }
+
+    // Combine the jacobians.
+    fieldHelper_->combineFieldCoeff(jac, J);
+  }
+
+  void Hessian_11(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+//    throw Exception::NotImplemented("");
+    // Retrieve dimensions.
+    int c  = u_coeff.extent_int(0);
+    int p  = cellCub_->getNumPoints();
+    int fv = basisPtrVel_->getCardinality();
+    int fp = basisPtrPrs_->getCardinality();
+    int fh = basisPtrThr_->getCardinality();
+    int d  = cellCub_->getDimension();
+ 
+    // Initialize jacobians.
+    std::vector<std::vector<scalar_view>> H(d+2);
+    for (int i = 0; i < d+2; ++i) H[i].resize(d+2);
+    for (int i = 0; i < d; ++i) {
+      for (int j = 0; j < d; ++j)
+        H[i][j] = scalar_view("hess", c, fv, fv);
+    }
+    for (int i = 0; i < d; ++i) {
+      H[d][i]   = scalar_view("hess", c, fp, fv);
+      H[i][d]   = scalar_view("hess", c, fv, fp);
+      H[d+1][i] = scalar_view("hess", c, fh, fv);
+      H[i][d+1] = scalar_view("hess", c, fv, fh);
+    }
+    H[d][d]     = scalar_view("hess", c, fp, fp);
+    H[d+1][d]   = scalar_view("hess", c, fh, fp);
+    H[d][d+1]   = scalar_view("hess", c, fp, fh);
+    H[d+1][d+1] = scalar_view("hess", c, fh, fh);
+
+    // Split l_coeff into components.
+    std::vector<scalar_view> L;
+    fieldHelper_->splitFieldCoeff(L, l_coeff);
+
+    // APPLY DIRICHLET CONDITIONS
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      // DIRICHLET CONDITIONS
+      for (int i = 0; i < numSideSets; ++i) {
+        // Velocity boundary conditions
+        if (i<4) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numVBdryDofs = fvidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numVBdryDofs; ++l) {
+                for (int m = 0; m < d; ++m) {
+                  (L[m])(cidx,fvidx_[j][l]) = static_cast<Real>(0);
+                }
+              }
+            }
+          }
+        }
+        // Thermal boundaries
+        if ( (i==0) || (i==5) ) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numHBdryDofs = fhidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numHBdryDofs; ++l) {
+                (L[d+1])(cidx,fhidx_[j][l]) = static_cast<Real>(0);
+              }
+            }
+          }
+        }
+        // Pressure pinning
+        if (i==7 && pinPressure_) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int l = 0, cidx = bdryCellLocIds_[i][j][k];
+              (L[d])(cidx,fpidx_[j][l]) = static_cast<Real>(0);
+            }
+          }
+        }
+      }
+    }
+
+    // Evaluate/interpolate finite element fields on cells.
+    std::vector<scalar_view> valVel_vec(d);
+    for (int i = 0; i < d; ++i) {
+      valVel_vec[i] = scalar_view("valVel_vec", c, p);
+      feVel_->evaluateValue(valVel_vec[i], L[i]);
+    }
+    scalar_view valHeat_eval("valHeat_eval", c, p);
+    feThr_->evaluateValue(valHeat_eval, L[3]);
+
+    // Compute nonlinear terms in the Navier-Stokes equations.
+    std::vector<scalar_view> valVelPhi_vec(d);
+    for (int i = 0; i < d; ++i) {
+      valVelPhi_vec[i] = scalar_view("valVelPhi_vec", c, fv, p);
+      fst::scalarMultiplyDataField(valVelPhi_vec[i], valVel_vec[i], feVel_->N());
+    }
+
+    // Compute nonlinear terms in the thermal equation.
+    scalar_view valHeatVelPhi("valHeaVelPhi", c, fv, p);
+    fst::scalarMultiplyDataField(valHeatVelPhi, valHeat_eval, feVel_->N());
+
+    /*** Evaluate weak form of the Hessian. ***/
+    for (int i = 0; i < d; ++i) {
+      // velocity equation.
+      for (int j = 0; j < d; ++j) {
+        fst::integrate(H[i][j],valVelPhi_vec[j],feVel_->DNDdetJ(i),false);
+        fst::integrate(H[i][j],feVel_->DNDdetJ(j),valVelPhi_vec[i],true);
+      }
+      fst::integrate(H[i][d+1],valHeatVelPhi,feThr_->DNDdetJ(i),false);
+      // Thermal equation.
+      fst::integrate(H[d+1][i],feThr_->DNDdetJ(i),valHeatVelPhi,false);
+    }
+
+    // Combine the Hessians.
+    fieldHelper_->combineFieldCoeff(hess, H);
+  }
+
+  void Hessian_12(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_ThermalFluids::Hessian_12): Hessian is zero.");
+  }
+
+  void Hessian_21(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_ThermalFluids::Hessian_21): Hessian is zero.");
+  }
+
+  void Hessian_22(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_ThermalFluids::Hessian_22): Hessian is zero.");
+  }
+
+  void RieszMap_1(scalar_view & riesz) override {
+    // Retrieve dimensions.
+    int c  = feVel_->N().extent_int(0);
+    int fv = basisPtrVel_->getCardinality();
+    int fp = basisPtrPrs_->getCardinality();
+    int fh = basisPtrThr_->getCardinality();
+    int d  = cellCub_->getDimension();
+ 
+    // Initialize jacobians.
+    std::vector<std::vector<scalar_view>> J(d+2);
+    for (int i = 0; i < d+2; ++i) J[i].resize(d+2);
+    for (int i = 0; i < d; ++i) {
+      for (int j = 0; j < d; ++j)
+        J[i][j] = scalar_view("riesz1", c, fv, fv);
+      J[d][i]   = scalar_view("riesz1", c, fp, fv);
+      J[i][d]   = scalar_view("riesz1", c, fv, fp);
+      J[d+1][i] = scalar_view("riesz1", c, fh, fv);
+      J[i][d+1] = scalar_view("riesz1", c, fv, fh);
+    }
+    J[d][d]     = scalar_view("riesz1", c, fp, fp);
+    J[d+1][d]   = scalar_view("riesz1", c, fh, fp);
+    J[d][d+1]   = scalar_view("riesz1", c, fp, fh);
+    J[d+1][d+1] = scalar_view("riesz1", c, fh, fh);
+
+    for (int i = 0; i < d; ++i) {
+      Kokkos::deep_copy(J[i][i],feVel_->stiffMat());
+      rst::add(J[i][i],feVel_->massMat());
+    }
+    Kokkos::deep_copy(J[d][d],fePrs_->massMat());
+    Kokkos::deep_copy(J[d+1][d+1],feThr_->stiffMat());
+    rst::add(J[d+1][d+1],feThr_->massMat());
+
+    // Combine the jacobians.
+    fieldHelper_->combineFieldCoeff(riesz, J);
+  }
+
+  void RieszMap_2(scalar_view & riesz) override {
+    // Retrieve dimensions.
+    int c  = feVel_->N().extent_int(0);
+    int fv = basisPtrVel_->getCardinality();
+    int fp = basisPtrPrs_->getCardinality();
+    int fh = basisPtrThr_->getCardinality();
+    int d  = cellCub_->getDimension();
+ 
+    // Initialize jacobians.
+    std::vector<std::vector<scalar_view>> J(d+2);
+    for (int i = 0; i < d+2; ++i) J[i].resize(d+2);
+    for (int i = 0; i < d; ++i) {
+      for (int j = 0; j < d; ++j)
+        J[i][j] = scalar_view("riesz2", c, fv, fv);
+      J[d][i]   = scalar_view("riesz2", c, fp, fv);
+      J[i][d]   = scalar_view("riesz2", c, fv, fp);
+      J[d+1][i] = scalar_view("riesz2", c, fh, fv);
+      J[i][d+1] = scalar_view("riesz2", c, fv, fh);
+    }
+    J[d][d]     = scalar_view("riesz2", c, fp, fp);
+    J[d+1][d]   = scalar_view("riesz2", c, fh, fp);
+    J[d][d+1]   = scalar_view("riesz2", c, fp, fh);
+    J[d+1][d+1] = scalar_view("riesz2", c, fh, fh);
+
+    for (int i = 0; i < d; ++i)
+      Kokkos::deep_copy(J[i][i],feVel_->massMat());
+    Kokkos::deep_copy(J[d][d],fePrs_->massMat());
+    Kokkos::deep_copy(J[d+1][d+1],feThr_->massMat());
+
+    // Combine the jacobians.
+    fieldHelper_->combineFieldCoeff(riesz, J);
+  }
+
+  std::vector<basis_ptr> getFields() override {
+    return basisPtrs_;
+  }
+
+  void setCellNodes(const scalar_view &volCellNodes,
+                    const std::vector<std::vector<scalar_view>> &bdryCellNodes,
+                    const std::vector<std::vector<std::vector<int>>> &bdryCellLocIds) override {
+    volCellNodes_ = volCellNodes;
+    bdryCellNodes_ = bdryCellNodes;
+    bdryCellLocIds_ = bdryCellLocIds;
+    // Finite element definition.
+    feVel_ = ROL::makePtr<fe_type>(volCellNodes_,basisPtrVel_,cellCub_);
+    fePrs_ = ROL::makePtr<fe_type>(volCellNodes_,basisPtrPrs_,cellCub_);
+    feThr_ = ROL::makePtr<fe_type>(volCellNodes_,basisPtrThr_,cellCub_);
+    // Get boundary degrees of freedom.
+    fvidx_ = feVel_->getBoundaryDofs();
+    fpidx_ = fePrs_->getBoundaryDofs();
+    fhidx_ = feThr_->getBoundaryDofs();
+    // Construct boundary FEs
+    const int numSideSets = bdryCellNodes.size();
+    feVelBdry_.resize(numSideSets);
+    fePrsBdry_.resize(numSideSets);
+    feThrBdry_.resize(numSideSets);
+    for (int i = 0; i < numSideSets; ++i) {
+      int numLocSides = bdryCellNodes[i].size();
+      feVelBdry_[i].resize(numLocSides);
+      fePrsBdry_[i].resize(numLocSides);
+      feThrBdry_[i].resize(numLocSides);
+      for (int j = 0; j < numLocSides; ++j) {
+        if (bdryCellNodes[i][j] != scalar_view()) {
+          feVelBdry_[i][j] = ROL::makePtr<fe_type>(bdryCellNodes[i][j],basisPtrVel_,bdryCub_,j);
+          fePrsBdry_[i][j] = ROL::makePtr<fe_type>(bdryCellNodes[i][j],basisPtrPrs_,bdryCub_,j);
+          feThrBdry_[i][j] = ROL::makePtr<fe_type>(bdryCellNodes[i][j],basisPtrThr_,bdryCub_,j);
+        }
+      }
+    }
+  }
+
+  void setFieldPattern(const std::vector<std::vector<int>> & fieldPattern) override {
+    fieldPattern_ = fieldPattern;
+    fieldHelper_ = ROL::makePtr<FieldHelper<Real,DeviceType>>(numFields_, numDofs_, numFieldDofs_, fieldPattern_);
+  }
+
+  const ROL::Ptr<fe_type> getVelocityFE(void) const {
+    return feVel_;
+  }
+
+  const ROL::Ptr<fe_type> getPressureFE(void) const {
+    return fePrs_;
+  }
+
+  const ROL::Ptr<fe_type> getThermalFE(void) const {
+    return feThr_;
+  }
+
+  const std::vector<std::vector<ROL::Ptr<fe_type>>> getVelocityBdryFE(void) const {
+    return feVelBdry_;
+  }
+
+  const std::vector<std::vector<ROL::Ptr<fe_type>>> getPressureBdryFE(void) const {
+    return fePrsBdry_;
+  }
+
+  const std::vector<std::vector<ROL::Ptr<fe_type>>> getThermalBdryFE(void) const {
+    return feThrBdry_;
+  }
+
+  const std::vector<std::vector<std::vector<int>>> getBdryCellLocIds(void) const {
+    return bdryCellLocIds_;
+  }
+
+  const ROL::Ptr<FieldHelper<Real,DeviceType>> getFieldHelper(void) const {
+    return fieldHelper_;
+  }
+
+}; // PDE_ThermalFluids
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/published/MPA_KouriSurowiec2019/CMakeLists.txt b/packages/rol/example/PDE-OPT/published/MPA_KouriSurowiec2019/CMakeLists.txt
index 5510631b1565..475dea69ae7f 100644
--- a/packages/rol/example/PDE-OPT/published/MPA_KouriSurowiec2019/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/published/MPA_KouriSurowiec2019/CMakeLists.txt
@@ -10,25 +10,25 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/teuchos/src/sol)
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     Burgers
     SOURCES example_Burgers.cpp
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     1dPoisson
     SOURCES example_1dPoisson.cpp
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     2dPoisson
     SOURCES example_2dPoisson.cpp
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     MPAKouriSurowiec2019DataCopy
     SOURCE_FILES
       input_Burgers.xml
@@ -38,5 +38,33 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
   )
 
+ENDIF()
+
+IF(${PROJECT_NAME}_ENABLE_Intrepid2 AND
+   ${PROJECT_NAME}_ENABLE_Ifpack2   AND
+   ${PROJECT_NAME}_ENABLE_MueLu     AND
+   ${PROJECT_NAME}_ENABLE_Amesos2   AND
+   ${PROJECT_NAME}_ENABLE_Tpetra )
+
+  # Need ROL_TpetraMultiVector.hpp
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/teuchos/src/sol)
+
+  ROL_ADD_EXECUTABLE(
+    2dPoisson_Kokkos
+    SOURCES example_2dPoissonK.cpp
+    ADD_DIR_TO_NAME
+  )
+
+  ROL_COPY_FILES_TO_BINARY_DIR(
+    MPAKouriSurowiec2019DataCopyK
+    SOURCE_FILES
+      input_Burgers.xml
+      input_1dPoisson.xml
+      input_2dPoisson.xml
+    SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}"
+    DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
+  )
 
 ENDIF()
diff --git a/packages/rol/example/PDE-OPT/published/MPA_KouriSurowiec2019/example_1dPoisson.cpp b/packages/rol/example/PDE-OPT/published/MPA_KouriSurowiec2019/example_1dPoisson.cpp
index c6c77cdc784d..2c6f5967d3eb 100644
--- a/packages/rol/example/PDE-OPT/published/MPA_KouriSurowiec2019/example_1dPoisson.cpp
+++ b/packages/rol/example/PDE-OPT/published/MPA_KouriSurowiec2019/example_1dPoisson.cpp
@@ -22,7 +22,7 @@
 
 // Teuchos includes
 #include "Teuchos_Comm.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -37,7 +37,7 @@ int main( int argc, char *argv[] ) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int>> comm = Tpetra::getDefaultComm();
 
   const int myRank = comm->getRank();
@@ -165,7 +165,7 @@ int main( int argc, char *argv[] ) {
       problem->finalize(false,true,*outStream);
       ROL::PrimalDualRisk<RealT> solver(problem, sampler, *parlist);
       if (parlist->sublist("Problem Data").get("Run Derivative Check",false)) {
-        problem->check(*outStream);
+        problem->check(true,*outStream);
         solver.check(*outStream);
       }
       solver.run(*outStream);
diff --git a/packages/rol/example/PDE-OPT/published/MPA_KouriSurowiec2019/example_2dPoisson.cpp b/packages/rol/example/PDE-OPT/published/MPA_KouriSurowiec2019/example_2dPoisson.cpp
index 52b5f1e8f0eb..37bfc3abbecd 100644
--- a/packages/rol/example/PDE-OPT/published/MPA_KouriSurowiec2019/example_2dPoisson.cpp
+++ b/packages/rol/example/PDE-OPT/published/MPA_KouriSurowiec2019/example_2dPoisson.cpp
@@ -12,7 +12,7 @@
 */
 
 #include "Teuchos_Comm.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
 
@@ -49,7 +49,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int>> comm
     = Tpetra::getDefaultComm();
   ROL::Ptr<const Teuchos::Comm<int>> serial_comm
@@ -203,7 +203,7 @@ int main(int argc, char *argv[]) {
       ROL::PrimalDualRisk<RealT> solver(problem, sampler, *parlist);
       bool checkDeriv = parlist->sublist("Problem").get("Check Derivatives",false);
       if ( checkDeriv ) {
-        problem->check(*outStream);
+        problem->check(true,*outStream);
         solver.check(*outStream);
       }
       solver.run(*outStream);
diff --git a/packages/rol/example/PDE-OPT/published/MPA_KouriSurowiec2019/example_2dPoissonK.cpp b/packages/rol/example/PDE-OPT/published/MPA_KouriSurowiec2019/example_2dPoissonK.cpp
new file mode 100644
index 000000000000..c35e93e05f04
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/published/MPA_KouriSurowiec2019/example_2dPoissonK.cpp
@@ -0,0 +1,233 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  example_02.cpp
+    \brief Shows how to solve the stochastic advection-diffusion problem.
+*/
+
+#include "Teuchos_Comm.hpp"
+#include "ROL_GlobalMPISession.hpp"
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Version.hpp"
+
+#include <iostream>
+#include <algorithm>
+//#include <fenv.h>
+
+#include "ROL_Stream.hpp"
+#include "ROL_ParameterList.hpp"
+#include "ROL_Bounds.hpp"
+#include "ROL_Reduced_Objective_SimOpt.hpp"
+#include "ROL_MonteCarloGenerator.hpp"
+#include "ROL_TpetraTeuchosBatchManager.hpp"
+#include "ROL_Solver.hpp"
+#include "ROL_StochasticProblem.hpp"
+#include "ROL_PrimalDualRisk.hpp"
+
+#include "../../TOOLS/meshmanagerK.hpp"
+#include "../../TOOLS/pdeconstraintK.hpp"
+#include "../../TOOLS/pdeobjectiveK.hpp"
+#include "../../TOOLS/pdevectorK.hpp"
+#include "../../TOOLS/batchmanagerK.hpp"
+#include "pde_stoch_adv_diffK.hpp"
+#include "obj_stoch_adv_diffK.hpp"
+#include "mesh_stoch_adv_diffK.hpp"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+
+int main(int argc, char *argv[]) {
+  //feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
+
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+
+  /*** Initialize communicator. ***/
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  Kokkos::ScopeGuard kokkosScope (argc, argv);
+  auto comm = Tpetra::getDefaultComm();
+  auto serial_comm = ROL::makePtr<Teuchos::SerialComm<int>>();
+  const int myRank = comm->getRank();
+  if ((iprint > 0) && (myRank == 0))
+    outStream = ROL::makePtrFromRef(std::cout);
+  else
+    outStream = ROL::makePtrFromRef(bhs);
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+
+    /*** Read in XML input ***/
+    auto parlist = ROL::getParametersFromXmlFile("input_2dPoisson.xml");
+
+    // Problem dimensions
+    const int controlDim = 9, stochDim = 37;
+    const RealT one(1); 
+
+    /*************************************************************************/
+    /***************** BUILD GOVERNING PDE ***********************************/
+    /*************************************************************************/
+    /*** Initialize main data structure. ***/
+    auto meshMgr = ROL::makePtr<MeshManager_stoch_adv_diff<RealT,DeviceT>>(*parlist);
+    // Initialize PDE describing advection-diffusion equation
+    auto pde = ROL::makePtr<PDE_stoch_adv_diff<RealT,DeviceT>>(*parlist);
+    auto con = ROL::makePtr<PDE_Constraint<RealT,DeviceT>>(pde,meshMgr,serial_comm,*parlist,*outStream);
+    con->getAssembler()->printMeshData(*outStream);
+    con->setSolveParameters(*parlist);
+
+    /*************************************************************************/
+    /***************** BUILD VECTORS *****************************************/
+    /*************************************************************************/
+    auto u_ptr = con->getAssembler()->createStateVector();
+    auto p_ptr = con->getAssembler()->createStateVector();
+    auto r_ptr = con->getAssembler()->createResidualVector();
+    auto z_ptr = ROL::makePtr<std::vector<RealT>>(controlDim);
+    auto u = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(u_ptr,pde,con->getAssembler());
+    auto p = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(p_ptr,pde,con->getAssembler());
+    auto r = ROL::makePtr<PDE_DualSimVector<RealT,DeviceT>>(r_ptr,pde,con->getAssembler());
+    auto z = ROL::makePtr<PDE_OptVector<RealT>>(ROL::makePtr<ROL::StdVector<RealT>>(z_ptr));
+
+    /*************************************************************************/
+    /***************** BUILD COST FUNCTIONAL *********************************/
+    /*************************************************************************/
+    std::vector<ROL::Ptr<QoI<RealT,DeviceT>>> qoi_vec(2,ROL::nullPtr);
+    qoi_vec[0] = ROL::makePtr<QoI_State_Cost_stoch_adv_diff<RealT,DeviceT>>(pde->getFE());
+    qoi_vec[1] = ROL::makePtr<QoI_Control_Cost_stoch_adv_diff<RealT,DeviceT>>();
+    RealT stateCost   = parlist->sublist("Problem").get("State Cost",1.e5);
+    RealT controlCost = parlist->sublist("Problem").get("Control Cost",1.e0);
+    std::vector<RealT> wts = {stateCost, controlCost};
+    auto obj = ROL::makePtr<PDE_Objective<RealT,DeviceT>>(qoi_vec,wts,con->getAssembler());
+    bool storage = parlist->sublist("Problem").get("Use State and Adjoint Storage",true);
+    auto objReduced = ROL::makePtr<ROL::Reduced_Objective_SimOpt<RealT>>(obj, con, u, z, p, storage, false);
+
+    /*************************************************************************/
+    /***************** BUILD BOUND CONSTRAINT ********************************/
+    /*************************************************************************/
+    auto zlo = ROL::makePtr<PDE_OptVector<RealT>>(ROL::makePtr<ROL::StdVector<RealT>>(controlDim, 0.0));
+    auto zup = ROL::makePtr<PDE_OptVector<RealT>>(ROL::makePtr<ROL::StdVector<RealT>>(controlDim, 1.0));
+    auto bnd = ROL::makePtr<ROL::Bounds<RealT>>(zlo, zup);
+
+    /*************************************************************************/
+    /***************** BUILD SAMPLER *****************************************/
+    /*************************************************************************/
+    int nsamp = parlist->sublist("Problem").get("Number of Samples",100);
+    std::vector<RealT> tmp = {-one,one};
+    std::vector<std::vector<RealT>> bounds(stochDim,tmp);
+    auto bman = ROL::makePtr<PDE_OptVector_BatchManager<RealT>>(comm);
+    auto sampler = ROL::makePtr<ROL::MonteCarloGenerator<RealT>>(nsamp,bounds,bman);
+
+    /*************************************************************************/
+    /***************** SOLVE STOCHASTIC PROBLEM ******************************/
+    /*************************************************************************/
+    std::string method = parlist->sublist("Problem").get("Method","PD-Risk");
+    if (method == "Epi-Reg") {
+      ROL::ParameterList list = *parlist;
+      std::string rm = list.sublist("SOL").sublist("Objective").sublist("Risk Measure").get("Name","CVaR");
+      list.sublist("Step").set("Type","Trust Region");
+      list.sublist("SOL").sublist("Objective").sublist("Risk Measure").sublist(rm).sublist("Distribution").set("Name","Parabolic");
+      list.sublist("SOL").sublist("Objective").sublist("Risk Measure").sublist(rm).sublist("Distribution").sublist("Parabolic").set("Lower Bound",-0.5);
+      list.sublist("SOL").sublist("Objective").sublist("Risk Measure").sublist(rm).sublist("Distribution").sublist("Parabolic").set("Upper Bound", 0.5);
+
+      RealT eps0 = parlist->sublist("SOL").sublist("Primal Dual Risk").get("Initial Gradient Tolerance",1e-2);
+      RealT eps1 = parlist->sublist("Status Test").get("Gradient Tolerance",1e-10);
+      RealT epss = parlist->sublist("SOL").sublist("Primal Dual Risk").get("Solver Tolerance Update Scale",1e-1);
+      RealT rho0 = parlist->sublist("SOL").sublist("Primal Dual Risk").get("Initial Penalty Parameter",1e1);
+      RealT rho1 = parlist->sublist("SOL").sublist("Primal Dual Risk").get("Maximum Penalty Parameter",1e16);
+      RealT rhos = parlist->sublist("SOL").sublist("Primal Dual Risk").get("Penalty Update Scale",1e1);
+      int maxIt = static_cast<int>(std::floor(std::log(rho1/rho0)/std::log(rhos)));
+
+      auto zp = z->clone(); zp->set(*z);
+      RealT eps = eps0, rho = rho0, tau = 1.0/rho, stat(1), statp(1), err(1);
+      for (int i = 0; i < maxIt; ++i) {
+        // Set tolerances
+        list.sublist("SOL").sublist("Objective").sublist("Risk Measure").sublist(rm).set("Smoothing Parameter",tau);
+        list.sublist("Status Test").set("Gradient Tolerance",eps);
+        // Set statistic
+        list.sublist("SOL").sublist("Objective").set("Initial Statistic",stat);
+        // Solver smoothed problem
+        auto problem = ROL::makePtr<ROL::StochasticProblem<RealT>>(objReduced, z);
+        problem->addBoundConstraint(bnd);
+        problem->makeObjectiveStochastic(list, sampler);
+        problem->finalize(false,true,*outStream);
+        ROL::Solver<RealT> solver(problem,list);
+        solver.solve(*outStream);
+        // Get solution statistic
+        stat = problem->getSolutionStatistic();
+        // Compute iteration errors
+        zp->axpy(-1.0,*z);
+        err = std::sqrt(zp->dot(*zp) + std::pow(stat-statp,2));
+        *outStream << std::endl << std::endl;
+        *outStream << "  iter = " << i
+                   << "  rho = " << rho
+                   << "  eps = " << eps
+                   << "  err = " << err;
+        *outStream << std::endl << std::endl;
+        if (eps <= eps1 && err <= eps1) break;
+        zp->set(*z);
+        statp = stat;
+        // Update tolerances
+        eps *= epss;
+        rho *= rhos;
+        tau  = 1.0/rho;
+        if (eps < eps1) eps = eps1;
+      }
+    }
+    else {
+      auto problem = ROL::makePtr<ROL::Problem<RealT>>(objReduced, z);
+      problem->addBoundConstraint(bnd);
+      problem->finalize(false,true,*outStream);
+      ROL::PrimalDualRisk<RealT> solver(problem, sampler, *parlist);
+      bool checkDeriv = parlist->sublist("Problem").get("Check Derivatives",false);
+      if ( checkDeriv ) {
+        problem->check(true,*outStream);
+        solver.check(*outStream);
+      }
+      solver.run(*outStream);
+    }
+
+    /*************************************************************************/
+    /***************** OUTPUT RESULTS ****************************************/
+    /*************************************************************************/
+    //std::clock_t timer_print = std::clock();
+    // Output control to file
+    if ( myRank == 0 ) {
+      std::ofstream zfile;
+      zfile.open("control.txt");
+      for (int i = 0; i < controlDim; i++) {
+        zfile << std::scientific << std::setprecision(15);
+        zfile << (*z_ptr)[i] << std::endl;
+      }
+      zfile.close();
+    }
+
+    RealT tol(1e-12);
+    Teuchos::Array<RealT> res(1,0);
+    con->solve(*r,*u,*z,tol);
+    r_ptr->norm2(res.view(0,1));
+
+    /*************************************************************************/
+    /***************** CHECK RESIDUAL NORM ***********************************/
+    /*************************************************************************/
+    *outStream << "Residual Norm: " << res[0] << std::endl << std::endl;
+    errorFlag += (res[0] > 1.e-6 ? 1 : 0);
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/published/MPA_KouriSurowiec2019/example_Burgers.cpp b/packages/rol/example/PDE-OPT/published/MPA_KouriSurowiec2019/example_Burgers.cpp
index 08626e3f3fb6..8235e3ccc46d 100644
--- a/packages/rol/example/PDE-OPT/published/MPA_KouriSurowiec2019/example_Burgers.cpp
+++ b/packages/rol/example/PDE-OPT/published/MPA_KouriSurowiec2019/example_Burgers.cpp
@@ -19,7 +19,7 @@ int main(int argc, char* argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int>> comm = Tpetra::getDefaultComm();
 
   const int myRank = comm->getRank();
@@ -70,7 +70,7 @@ int main(int argc, char* argv[]) {
       problem->makeObjectiveStochastic(*parlist, sampler);
       problem->finalize(false,true,*outStream);
       if (parlist->sublist("Problem").get("Run Derivative Check",false)) {
-        problem->check(*outStream);
+        problem->check(true,*outStream);
       }
       parlist->sublist("Step").set("Type","Bundle");
       parlist->sublist("Step").sublist("Bundle").set("Distance Measure Coefficient",0.0);
@@ -137,7 +137,7 @@ int main(int argc, char* argv[]) {
       if (parlist->sublist("Problem").get("Run Derivative Check",false)) {
         std::vector<RealT> param(4,0);
         robj->setParameter(param);
-        problem->check(*outStream);
+        problem->check(true,*outStream);
         solver.check(*outStream);
       }
       solver.run(*outStream);
diff --git a/packages/rol/example/PDE-OPT/published/MPA_KouriSurowiec2019/example_Burgers.hpp b/packages/rol/example/PDE-OPT/published/MPA_KouriSurowiec2019/example_Burgers.hpp
index 0b5dd0b29b6a..3f6d3137ef4b 100644
--- a/packages/rol/example/PDE-OPT/published/MPA_KouriSurowiec2019/example_Burgers.hpp
+++ b/packages/rol/example/PDE-OPT/published/MPA_KouriSurowiec2019/example_Burgers.hpp
@@ -9,7 +9,7 @@
 
 #include "Teuchos_LAPACK.hpp"
 #include "Teuchos_Comm.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
diff --git a/packages/rol/example/PDE-OPT/published/MPA_KouriSurowiec2019/mesh_stoch_adv_diffK.hpp b/packages/rol/example/PDE-OPT/published/MPA_KouriSurowiec2019/mesh_stoch_adv_diffK.hpp
new file mode 100644
index 000000000000..48d5638e0ce7
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/published/MPA_KouriSurowiec2019/mesh_stoch_adv_diffK.hpp
@@ -0,0 +1,70 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+#include "../../TOOLS/meshmanagerK.hpp"
+
+template <class Real, class DeviceType>
+class MeshManager_stoch_adv_diff : public MeshManager_Rectangle<Real,DeviceType> {
+
+private:
+
+  int nx_;
+  int ny_;
+  ROL::Ptr<std::vector<std::vector<std::vector<int>>>>  meshSideSets_;
+
+public: 
+
+  MeshManager_stoch_adv_diff(ROL::ParameterList &parlist) : MeshManager_Rectangle<Real,DeviceType>(parlist)
+  {
+    nx_ = parlist.sublist("Geometry").get("NX", 3);
+    ny_ = parlist.sublist("Geometry").get("NY", 3);
+    computeSideSets();
+  }
+
+
+  void computeSideSets() {
+
+    int numSideSets = 2;
+    meshSideSets_ = ROL::makePtr<std::vector<std::vector<std::vector<int>>>>(numSideSets);
+
+    // Dirichlet
+    (*meshSideSets_)[0].resize(4);
+    (*meshSideSets_)[0][0].resize(nx_);
+    (*meshSideSets_)[0][1].resize(0);
+    (*meshSideSets_)[0][2].resize(nx_);
+    (*meshSideSets_)[0][3].resize(ny_);
+    // Neumann
+    (*meshSideSets_)[1].resize(4);
+    (*meshSideSets_)[1][0].resize(0);
+    (*meshSideSets_)[1][1].resize(ny_);
+    (*meshSideSets_)[1][2].resize(0);
+    (*meshSideSets_)[1][3].resize(0);
+    
+    for (int i=0; i<nx_; ++i)
+      (*meshSideSets_)[0][0][i] = i;
+    for (int i=0; i<ny_; ++i)
+      (*meshSideSets_)[1][1][i] = (i+1)*nx_-1;
+    for (int i=0; i<nx_; ++i)
+      (*meshSideSets_)[0][2][i] = i + nx_*(ny_-1);
+    for (int i=0; i<ny_; ++i)
+      (*meshSideSets_)[0][3][i] = i*nx_;
+
+  } // computeSideSets
+
+  ROL::Ptr<std::vector<std::vector<std::vector<int>>>> getSideSets(
+              const bool verbose = false,
+              std::ostream & outStream = std::cout) const { 
+    if (verbose) {
+      outStream << "Mesh_stoch_adv_diff: getSideSets called" << std::endl;
+      outStream << "Mesh_stoch_adv_diff: numSideSets = "     << meshSideSets_->size() << std::endl;
+    }
+    return meshSideSets_;
+  }
+  
+}; // MeshManager_stoch_adv_diff
diff --git a/packages/rol/example/PDE-OPT/published/MPA_KouriSurowiec2019/obj_stoch_adv_diffK.hpp b/packages/rol/example/PDE-OPT/published/MPA_KouriSurowiec2019/obj_stoch_adv_diffK.hpp
new file mode 100644
index 000000000000..3a36d68469ab
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/published/MPA_KouriSurowiec2019/obj_stoch_adv_diffK.hpp
@@ -0,0 +1,317 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  obj.hpp
+    \brief Provides the interface for local (cell-based) objective function computations.
+*/
+
+#ifndef PDEOPT_QOI_STOCH_ADV_DIFFK_HPP
+#define PDEOPT_QOI_STOCH_ADV_DIFFK_HPP
+
+#include "../../TOOLS/qoiK.hpp"
+#include "pde_stoch_adv_diffK.hpp"
+
+template <class Real, class DeviceType>
+class QoI_State_Cost_stoch_adv_diff : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  ROL::Ptr<fe_type> fe_;
+
+public:
+  QoI_State_Cost_stoch_adv_diff(const ROL::Ptr<fe_type> &fe) : fe_(fe) {
+    const int d = fe_->cubPts().extent_int(2);
+    std::vector<Real> pt(d);
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c = fe_->gradN().extent_int(0);
+    const int p = fe_->gradN().extent_int(2);
+    // Initialize output val
+    val = scalar_view("val", c);
+    // Evaluate state on FE basis
+    scalar_view valU_eval("valU_eval", c, p);
+    fe_->evaluateValue(valU_eval, u_coeff);
+    // Compute squared L2-norm of diff
+    fe_->computeIntegral(val,valU_eval,valU_eval);
+    // Scale by one half
+    rst::scale(val,static_cast<Real>(0.5));
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c = fe_->gradN().extent_int(0);
+    const int f = fe_->gradN().extent_int(1);
+    const int p = fe_->gradN().extent_int(2);
+    // Initialize output grad
+    grad = scalar_view("grad", c, f);
+    // Evaluate state on FE basis
+    scalar_view valU_eval("valU_eval", c, p);
+    fe_->evaluateValue(valU_eval, u_coeff);
+    // Compute gradient of squared L2-norm of diff
+    fst::integrate(grad,valU_eval,fe_->NdetJ(),false);
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_State_Cost_stoch_adv_diff::gradient_2 is zero.");
+  }
+
+  std::vector<Real> gradient_3(std::vector<scalar_view> & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_State_Cost_stoch_adv_diff::gradient_3 is zero.");
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    const int c = fe_->gradN().extent_int(0);
+    const int f = fe_->gradN().extent_int(1);
+    const int p = fe_->gradN().extent_int(2);
+    scalar_view valV_eval("valV_eval", c, p);
+    hess = scalar_view("hess", c, f);
+    fe_->evaluateValue(valV_eval, v_coeff);
+    fst::integrate(hess,valV_eval,fe_->NdetJ(),false);
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_State_Cost_stoch_adv_diff::HessVec_12 is zero.");
+  }
+
+  void HessVec_13(scalar_view & hess,
+                  const ROL::Ptr<const std::vector<Real>> & v_param,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_State_Cost_stoch_adv_diff::HessVec_13 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_State_Cost_stoch_adv_diff::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_State_Cost_stoch_adv_diff::HessVec_22 is zero.");
+  }
+
+  void HessVec_23(scalar_view & hess,
+                  const ROL::Ptr<const std::vector<Real>> & v_param,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_State_Cost_stoch_adv_diff::HessVec_23 is zero.");
+  }
+
+  std::vector<Real> HessVec_31(std::vector<scalar_view> & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_State_Cost_stoch_adv_diff::HessVec_31 is zero.");
+  }
+
+  std::vector<Real> HessVec_32(std::vector<scalar_view> & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_State_Cost_stoch_adv_diff::HessVec_32 is zero.");
+  }
+
+  std::vector<Real> HessVec_33(std::vector<scalar_view> & hess,
+                  const ROL::Ptr<const std::vector<Real>> & v_param,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_State_Cost_stoch_adv_diff::HessVec_33 is zero.");
+  }
+
+}; // QoI_State_Cost
+
+template <class Real, class DeviceType>
+class QoI_Control_Cost_stoch_adv_diff : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+public:
+  QoI_Control_Cost_stoch_adv_diff(void) {}
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    Real sum(0);
+    for (const auto zi : *z_param) sum += zi;
+    val = scalar_view();
+    return sum;
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Control_Cost_stoch_adv_diff::gradient_1 is zero.");
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Control_Cost_stoch_adv_diff::gradient_2 is zero.");
+  }
+
+  std::vector<Real> gradient_3(std::vector<scalar_view> & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    const int size = z_param->size();
+    std::vector<Real> g(size,static_cast<Real>(1));
+    return g;
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Control_Cost_stoch_adv_diff::HessVec_11 is zero.");
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Control_Cost_stoch_adv_diff::HessVec_12 is zero.");
+  }
+
+  void HessVec_13(scalar_view & hess,
+                  const ROL::Ptr<const std::vector<Real>> & v_param,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Control_Cost_stoch_adv_diff::HessVec_13 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Control_Cost_stoch_adv_diff::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Control_Cost_stoch_adv_diff::HessVec_22 is zero.");
+  }
+
+  void HessVec_23(scalar_view & hess,
+                  const ROL::Ptr<const std::vector<Real>> & v_param,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Control_Cost_stoch_adv_diff::HessVec_23 is zero.");
+  }
+
+  std::vector<Real> HessVec_31(std::vector<scalar_view> & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Control_Cost_stoch_adv_diff::HessVec_31 is zero.");
+  }
+
+  std::vector<Real> HessVec_32(std::vector<scalar_view> & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Control_Cost_stoch_adv_diff::HessVec_32 is zero.");
+  }
+
+  std::vector<Real> HessVec_33(std::vector<scalar_view> & hess,
+                  const ROL::Ptr<const std::vector<Real>> & v_param,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Control_Cost_stoch_adv_diff::HessVec_33 is zero.");
+  }
+
+}; // QoI_Control_Cost
+
+template <class Real>
+class StdObjective_stoch_adv_diff : public ROL::StdObjective<Real> {
+private:
+  Real alpha1_, alpha2_;
+
+public:
+  StdObjective_stoch_adv_diff(ROL::ParameterList &parlist) {
+    alpha1_ = parlist.sublist("Problem").get("State Cost",1.e5);
+    alpha2_ = parlist.sublist("Problem").get("Control Cost",1.e0);
+  }
+
+  Real value(const std::vector<Real> &x, Real &tol) {
+    return alpha1_*x[0] + alpha2_*x[1];
+  }
+
+  void gradient(std::vector<Real> &g, const std::vector<Real> &x, Real &tol) {
+    g[0] = alpha1_;
+    g[1] = alpha2_;
+  }
+
+  void hessVec(std::vector<Real> &hv, const std::vector<Real> &v, const std::vector<Real> &x, Real &tol) {
+    const Real zero(0);
+    hv[0] = zero;
+    hv[1] = zero;
+  }
+
+}; // OBJ_SCALAR
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/published/MPA_KouriSurowiec2019/pde_stoch_adv_diffK.hpp b/packages/rol/example/PDE-OPT/published/MPA_KouriSurowiec2019/pde_stoch_adv_diffK.hpp
new file mode 100644
index 000000000000..f172e0eaaec4
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/published/MPA_KouriSurowiec2019/pde_stoch_adv_diffK.hpp
@@ -0,0 +1,471 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  pde.hpp
+    \brief Implements the local PDE interface for the Poisson control problem.
+*/
+
+#ifndef PDE_STOCH_ADV_DIFFK_HPP
+#define PDE_STOCH_ADV_DIFFK_HPP
+
+#include "../../TOOLS/pdeK.hpp"
+#include "../../TOOLS/feK.hpp"
+
+#include "Intrepid2_HGRAD_QUAD_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_QUAD_C2_FEM.hpp"
+#include "Intrepid2_DefaultCubatureFactory.hpp"
+#include "Intrepid2_FunctionSpaceTools.hpp"
+#include "Intrepid2_CellTools.hpp"
+
+#include "ROL_Ptr.hpp"
+
+template <class Real, class DeviceType>
+class PDE_stoch_adv_diff : public PDE<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  // Finite element basis information
+  basis_ptr basisPtr_;
+  std::vector<basis_ptr> basisPtrs_;
+  // Cell cubature information
+  ROL::Ptr<Intrepid2::Cubature<DeviceType,Real,Real>> cellCub_;
+  // Cell node information
+  scalar_view volCellNodes_;
+  std::vector<std::vector<scalar_view>> bdryCellNodes_;
+  std::vector<std::vector<std::vector<int>>> bdryCellLocIds_;
+  // Finite element definition
+  ROL::Ptr<fe_type> fe_vol_;
+  // Local degrees of freedom on boundary, for each side of the reference cell (first index).
+  std::vector<std::vector<int>> fidx_;
+  // Coordinates of degrees freedom on boundary cells.
+  // Indexing:  [sideset number][local side id](cell number, value at dof)
+  std::vector<std::vector<scalar_view>> bdryCellDofValues_;
+
+public:
+  PDE_stoch_adv_diff(ROL::ParameterList &parlist) {
+    // Finite element fields.
+    int basisOrder = parlist.sublist("PDE Poisson").get("Basis Order",1);
+    if (basisOrder == 1)
+      basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C1_FEM<DeviceType,Real,Real>>();
+    else if (basisOrder == 2)
+      basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C2_FEM<DeviceType,Real,Real>>();
+    basisPtrs_.clear(); basisPtrs_.push_back(basisPtr_);
+    // Quadrature rules.
+    shards::CellTopology cellType = basisPtr_->getBaseCellTopology();        // get the cell type from any basis
+    Intrepid2::DefaultCubatureFactory cubFactory;                            // create cubature factory
+    int cubDegree = parlist.sublist("PDE Poisson").get("Cubature Degree",2); // set cubature degree, e.g., 2
+    cellCub_ = cubFactory.create<DeviceType,Real,Real>(cellType, cubDegree); // create default cubature
+  }
+
+  void residual(scalar_view & res,
+                const scalar_view u_coeff,
+                const scalar_view z_coeff = scalar_view(),
+                const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // GET DIMENSIONS
+    int c = fe_vol_->gradN().extent_int(0);
+    int f = fe_vol_->gradN().extent_int(1);
+    int p = fe_vol_->gradN().extent_int(2);
+    int d = fe_vol_->gradN().extent_int(3);
+    // INITIALIZE RESIDUAL
+    res = scalar_view("res", c, f);
+    // COMPUTE PDE COEFFICIENTS
+    scalar_view kappa("kappa", c, p);
+    scalar_view V("V", c, p, d);
+    scalar_view rhs("rhs", c, p);
+    computeCoefficients(kappa,V,rhs);
+    // COMPUTE DIFFUSION TERM
+    // Compute grad(U)
+    scalar_view gradU_eval("gradU_eval", c, p, d);
+    fe_vol_->evaluateGradient(gradU_eval, u_coeff);
+    // Multiply kappa * grad(U)
+    scalar_view kappa_gradU("kappa_gradU", c, p, d);
+    fst::tensorMultiplyDataData(kappa_gradU,kappa,gradU_eval);
+    // Integrate (kappa * grad(U)) . grad(N)
+    fst::integrate(res,kappa_gradU,fe_vol_->gradNdetJ(),false);
+    // ADD ADVECTION TERM TO RESIDUAL
+    // Multiply V . grad(U)
+    scalar_view V_gradU("V_gradU", c, p);
+    fst::dotMultiplyDataData(V_gradU,V,gradU_eval);
+    // Integrate (V . grad(U)) * N
+    fst::integrate(res,V_gradU,fe_vol_->NdetJ(),true);
+    // ADD RHS TO RESIDUAL
+    fst::integrate(res,rhs,fe_vol_->NdetJ(),true);
+
+    // ADD CONTROL TERM TO RESIDUAL
+    int size = z_param->size();
+    scalar_view ctrl("ctrl", c, p);
+    for (int i = 0; i < size; ++i) {
+      computeControlOperator(ctrl,(*z_param)[i],i);
+      fst::integrate(res,ctrl,fe_vol_->NdetJ(),true);
+    }
+    // APPLY DIRICHLET CONDITIONS
+    int numLocalSideIds = bdryCellLocIds_[0].size();
+    for (int j = 0; j < numLocalSideIds; ++j) {
+      int numCellsSide = bdryCellLocIds_[0][j].size();
+      int numBdryDofs = fidx_[j].size();
+      for (int k = 0; k < numCellsSide; ++k) {
+        int cidx = bdryCellLocIds_[0][j][k];
+        for (int l = 0; l < numBdryDofs; ++l) {
+          res(cidx,fidx_[j][l])
+            = u_coeff(cidx,fidx_[j][l]) - (bdryCellDofValues_[0][j])(k,fidx_[j][l]);
+        }
+      }
+    }
+  }
+
+  void Jacobian_1(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // GET DIMENSIONS
+    int c = fe_vol_->gradN().extent_int(0);
+    int f = fe_vol_->gradN().extent_int(1);
+    int p = fe_vol_->gradN().extent_int(2);
+    int d = fe_vol_->gradN().extent_int(3);
+    // INITILAIZE JACOBIAN
+    jac = scalar_view("jac", c, f, f);
+    // COMPUTE PDE COEFFICIENTS
+    scalar_view kappa("kappa", c, p);
+    scalar_view V("V", c, p, d);
+    scalar_view rhs("rhs", c, p);
+    computeCoefficients(kappa,V,rhs);
+    // COMPUTE DIFFUSION TERM
+    // Multiply kappa * grad(N)
+    scalar_view kappa_gradN("kappa_gradN", c, f, p, d);
+    fst::tensorMultiplyDataField(kappa_gradN,kappa,fe_vol_->gradN());
+    // Integrate (kappa * grad(N)) . grad(N)
+    fst::integrate(jac,kappa_gradN,fe_vol_->gradNdetJ(),false);
+    // ADD ADVECTION TERM TO JACOBIAN
+    // Multiply V . grad(N)
+    scalar_view V_gradN("V_gradN", c, f, p);
+    fst::dotMultiplyDataField(V_gradN,V,fe_vol_->gradN());
+    // Integrate (V . grad(U)) * N
+    fst::integrate(jac,fe_vol_->NdetJ(),V_gradN,true);
+    // APPLY DIRICHLET CONDITIONS
+    int numLocalSideIds = bdryCellLocIds_[0].size();
+    for (int j = 0; j < numLocalSideIds; ++j) {
+      int numCellsSide = bdryCellLocIds_[0][j].size();
+      int numBdryDofs = fidx_[j].size();
+      for (int k = 0; k < numCellsSide; ++k) {
+        int cidx = bdryCellLocIds_[0][j][k];
+        for (int l = 0; l < numBdryDofs; ++l) {
+          //std::cout << "\n   j=" << j << "  l=" << l << "  " << fidx[j][l];
+          for (int m = 0; m < f; ++m) {
+            jac(cidx,fidx_[j][l],m) = static_cast<Real>(0);
+          }
+          jac(cidx,fidx_[j][l],fidx_[j][l]) = static_cast<Real>(1);
+        }
+      }
+    }
+  }
+
+  void Jacobian_2(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_stoch_adv_diff::Jacobian_2): Jacobian is zero.");
+  }
+
+  void Jacobian_3(std::vector<scalar_view> & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // GET DIMENSIONS
+    int c = fe_vol_->gradN().extent_int(0);
+    int f = fe_vol_->gradN().extent_int(1);
+    int p = fe_vol_->gradN().extent_int(2);
+    // ADD CONTROL TERM TO RESIDUAL
+    int size = z_param->size();
+    scalar_view ctrl("ctrl", c, p);
+    for (int i = 0; i < size; ++i) {
+      jac[i] = scalar_view("jac", c, f);
+      computeControlOperator(ctrl,static_cast<Real>(1),i);
+      fst::integrate(jac[i],ctrl,fe_vol_->NdetJ(),false);
+      // APPLY DIRICHLET CONDITIONS
+      int numLocalSideIds = bdryCellLocIds_[0].size();
+      for (int j = 0; j < numLocalSideIds; ++j) {
+        int numCellsSide = bdryCellLocIds_[0][j].size();
+        int numBdryDofs = fidx_[j].size();
+        for (int k = 0; k < numCellsSide; ++k) {
+          int cidx = bdryCellLocIds_[0][j][k];
+          for (int l = 0; l < numBdryDofs; ++l) {
+            (jac[i])(cidx,fidx_[j][l]) = static_cast<Real>(0);
+          }
+        }
+      }
+    }
+  }
+
+  void Hessian_11(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_stoch_adv_diff::Hessian_11): Hessian is zero.");
+  }
+
+  void Hessian_12(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_stoch_adv_diff::Hessian_12): Hessian is zero.");
+  }
+
+  void Hessian_13(std::vector<scalar_view> & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_stoch_adv_diff::Hessian_13): Hessian is zero.");
+  }
+
+  void Hessian_21(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_stoch_adv_diff::Hessian_21): Hessian is zero.");
+  }
+
+  void Hessian_22(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_stoch_adv_diff::Hessian_22): Hessian is zero.");
+  }
+
+  void Hessian_23(std::vector<scalar_view> & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_stoch_adv_diff::Hessian_23): Hessian is zero.");
+  }
+
+  void Hessian_31(std::vector<scalar_view> & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_stoch_adv_diff::Hessian_31): Hessian is zero.");
+  }
+
+  void Hessian_32(std::vector<scalar_view> & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_stoch_adv_diff::Hessian_32): Hessian is zero.");
+  }
+
+  void Hessian_33(std::vector<std::vector<scalar_view>> & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_stoch_adv_diff::Hessian_33): Hessian is zero.");
+  }
+
+  void RieszMap_1(scalar_view & riesz) override {
+    // GET DIMENSIONS
+    int c = fe_vol_->N().extent_int(0);
+    int f = fe_vol_->N().extent_int(1);
+    // INITIALIZE RIESZ
+    riesz = scalar_view("riesz1",c, f, f);
+    Kokkos::deep_copy(riesz,fe_vol_->stiffMat());
+    rst::add(riesz,fe_vol_->massMat());
+  }
+
+  void RieszMap_2(scalar_view & riesz) override {
+    // GET DIMENSIONS
+    int c = fe_vol_->N().extent_int(0);
+    int f = fe_vol_->N().extent_int(1);
+    // INITIALIZE RIESZ
+    riesz = scalar_view("riesz2",c, f, f);
+    Kokkos::deep_copy(riesz,fe_vol_->massMat());
+  }
+
+  std::vector<basis_ptr> getFields() override {
+    return basisPtrs_;
+  }
+
+  void setCellNodes(const scalar_view &volCellNodes,
+                    const std::vector<std::vector<scalar_view>> &bdryCellNodes,
+                    const std::vector<std::vector<std::vector<int>>> &bdryCellLocIds) override {
+    volCellNodes_ = volCellNodes;
+    bdryCellNodes_ = bdryCellNodes;
+    bdryCellLocIds_ = bdryCellLocIds;
+    // Finite element definition.
+    fe_vol_ = ROL::makePtr<fe_type>(volCellNodes_,basisPtr_,cellCub_);
+    // Set local boundary DOFs.
+    fidx_ = fe_vol_->getBoundaryDofs();
+    // Compute Dirichlet values at DOFs.
+    int d = basisPtr_->getBaseCellTopology().getDimension();
+    int numSidesets = bdryCellLocIds_.size();
+    bdryCellDofValues_.resize(numSidesets);
+    for (int i=0; i<numSidesets; ++i) {
+      int numLocSides = bdryCellLocIds_[i].size();
+      bdryCellDofValues_[i].resize(numLocSides);
+      for (int j=0; j<numLocSides; ++j) {
+        int c = bdryCellLocIds_[i][j].size();
+        int f = basisPtr_->getCardinality();
+        bdryCellDofValues_[i][j] = scalar_view("bdryCellDofValues", c, f);
+        scalar_view coords = scalar_view("coords", c, f, d);
+        if (c > 0)
+          fe_vol_->computeDofCoords(coords, bdryCellNodes_[i][j]);
+        for (int k=0; k<c; ++k) {
+          for (int l=0; l<f; ++l) {
+            std::vector<Real> dofpoint(d);
+            for (int m=0; m<d; ++m) dofpoint[m] = coords(k, l, m);
+            (bdryCellDofValues_[i][j])(k, l) = evaluateDirichlet(dofpoint, i, j);
+          }
+        }
+      }
+    }
+  }
+
+  const ROL::Ptr<fe_type> getFE(void) const {
+    return fe_vol_;
+  }
+
+private:
+
+  Real evaluateDirichlet(const std::vector<Real> & coords, int sideset, int locSideId) const {
+    return static_cast<Real>(0);
+  }
+
+  Real evaluateDiffusivity(const std::vector<Real> &x) const {
+    // random diffusion coefficient from i. babuska, f. nobile, r. tempone 2010.
+    // simplified model for random stratified media.
+    const int ns = 10;
+    const Real one(1), two(2), three(3), eight(8), sixteen(16), half(0.5);
+    const Real lc = one/sixteen, sqrtpi = std::sqrt(M_PI);
+    const Real xi = std::sqrt(sqrtpi*lc), sqrt3 = std::sqrt(three);
+    const std::vector<Real> param = PDE<Real,DeviceType>::getParameter();
+    Real f(0), phi(0);                                          
+    Real p25 = (param.size() > 25 ? param[25] : static_cast<Real>(0)); 
+    Real val = one + sqrt3*p25*std::sqrt(sqrtpi*lc*half);
+    Real arg = one + sqrt3*std::sqrt(sqrtpi*lc*half);
+    for (int i = 1; i < ns; ++i) {
+      f = floor(half*static_cast<Real>(i+1));     
+      phi = ((i+1)%2 ? std::sin(f*M_PI*x[0]) : std::cos(f*M_PI*x[0]));
+      Real pi25 = (static_cast<int>(param.size()) > i+25 ? param[i+25] : static_cast<Real>(0));
+      val += xi*std::exp(-std::pow(f*M_PI*lc,two)/eight)*phi*sqrt3*pi25;
+      arg += xi*std::exp(-std::pow(f*M_PI*lc,two)/eight)*std::abs(phi)*sqrt3;
+    }
+    return half + two*std::exp(val)/std::exp(arg);
+  }
+
+  void evaluateVelocity(std::vector<Real> &adv, const std::vector<Real> &x) const {
+    const Real half(0.5), one(1), five(5), ten(10);
+    const std::vector<Real> param = PDE<Real,DeviceType>::getParameter();
+    Real p35 = (param.size() > 35 ? param[35] : static_cast<Real>(0));
+    Real p36 = (param.size() > 36 ? param[36] : static_cast<Real>(0));
+    const Real a = five*half*(p36+one);
+    const Real b = five + (ten-five)*half*(p35+one);
+    adv[0] = b - a*x[0];
+    adv[1] =     a*x[1];
+  }
+
+  Real evaluateRHS(const std::vector<Real> &x) const {
+    const int ns = 5;             
+    const Real half(0.5), one(1), two(2);
+    const std::vector<Real> param = PDE<Real,DeviceType>::getParameter();
+    Real source(0), arg1(0), arg2(0), mag(0), x0(0), y0(0), sx(0), sy(0);
+    // Upper and lower bounds on source magintudes
+    const std::vector<Real> ml = {1.5, 1.2, 1.5, 1.2, 1.1};
+    const std::vector<Real> mu = {2.5, 1.8, 1.9, 2.6, 1.5};
+    // Upper and lower bounds on source locations
+    const std::vector<Real> xl = {0.45, 0.75, 0.40, 0.05, 0.85};
+    const std::vector<Real> xu = {0.55, 0.85, 0.60, 0.35, 0.95};
+    const std::vector<Real> yl = {0.25, 0.55, 0.50, 0.45, 0.45};
+    const std::vector<Real> yu = {0.35, 0.65, 0.70, 0.65, 0.55};
+    // Upper and lower bounds on source widths
+    const std::vector<Real> sxl = {0.03, 0.02, 0.01, 0.02, 0.015};
+    const std::vector<Real> sxu = {0.07, 0.04, 0.05, 0.04, 0.025};
+    const std::vector<Real> syl = {0.04, 0.01, 0.02, 0.02, 0.01};
+    const std::vector<Real> syu = {0.12, 0.05, 0.04, 0.04, 0.03};
+    for (int i=0; i<ns; ++i) {
+      Real pi  = (static_cast<int>(param.size()) > i      ? param[i]      : static_cast<Real>(0));
+      Real pi1 = (static_cast<int>(param.size()) > i+1*ns ? param[i+1*ns] : static_cast<Real>(0));
+      Real pi2 = (static_cast<int>(param.size()) > i+2*ns ? param[i+2*ns] : static_cast<Real>(0));
+      Real pi3 = (static_cast<int>(param.size()) > i+3*ns ? param[i+3*ns] : static_cast<Real>(0));
+      Real pi4 = (static_cast<int>(param.size()) > i+4*ns ? param[i+4*ns] : static_cast<Real>(0));
+
+      mag  = ml[i] + (mu[i]-ml[i])*half*(pi+one);
+      x0   = xl[i] + (xu[i]-xl[i])*half*(pi1+one);
+      y0   = yl[i] + (yu[i]-yl[i])*half*(pi3+one);
+      sx   = sxl[i] + (sxu[i]-sxl[i])*half*(pi2+one);
+      sy   = syl[i] + (syu[i]-syl[i])*half*(pi4+one);
+      arg1 = std::pow((x[0]-x0)/sx, two);
+      arg2 = std::pow((x[1]-y0)/sy, two);
+      source += mag*std::exp(-half*(arg1+arg2));
+    }
+    return source;
+  }
+
+  void computeCoefficients(scalar_view &kappa,
+                           scalar_view &V,
+                           scalar_view &rhs) const {
+    // GET DIMENSIONS
+    int c = fe_vol_->gradN().extent_int(0);
+    int p = fe_vol_->gradN().extent_int(2);
+    int d = fe_vol_->gradN().extent_int(3);
+    std::vector<Real> pt(d), adv(d);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for ( int k = 0; k < d; ++k)
+          pt[k] = (fe_vol_->cubPts())(i,j,k);
+        // Compute diffusivity kappa
+        kappa(i,j) = evaluateDiffusivity(pt);
+        // Compute advection velocity field V
+        evaluateVelocity(adv,pt);
+        for (int k = 0; k < d; ++k)
+          V(i,j,k) = adv[k];
+        // Compute forcing term f
+        rhs(i,j) = -evaluateRHS(pt);
+      }
+    }
+  }
+
+  Real evaluateControlOperator(const std::vector<Real> &x, int i) const {
+    const Real sx(0.05), sy(0.05), half(0.5);
+    const std::vector<Real> xl = {0.25, 0.50, 0.75, 0.25, 0.50, 0.75, 0.25, 0.50, 0.75};
+    const std::vector<Real> yl = {0.25, 0.25, 0.25, 0.50, 0.50, 0.50, 0.75, 0.75, 0.75};
+    return -std::exp(- half*(x[0]-xl[i])*(x[0]-xl[i]) / (sx*sx)
+                     - half*(x[1]-yl[i])*(x[1]-yl[i]) / (sy*sy));
+  }
+  
+  void computeControlOperator(scalar_view &ctrl,Real z, int I) const {
+    // GET DIMENSIONS
+    int c = fe_vol_->gradN().extent_int(0);
+    int p = fe_vol_->gradN().extent_int(2);
+    int d = fe_vol_->gradN().extent_int(3);
+    std::vector<Real> pt(d), adv(d);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for ( int k = 0; k < d; ++k)
+          pt[k] = (fe_vol_->cubPts())(i,j,k);
+        // Compute control operator
+        ctrl(i,j) = -z*evaluateControlOperator(pt,I);
+      }
+    }
+  }
+
+}; // PDE_stoch_adv_diff
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/CMakeLists.txt b/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/CMakeLists.txt
index defcd9155218..d19765cd6d98 100644
--- a/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/CMakeLists.txt
@@ -11,13 +11,13 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_01
     SOURCES example_01.cpp
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     NonsmoothTR_KouriBaraldi2022_DataCopy
     SOURCE_FILES
       input_ex01.xml
@@ -27,5 +27,34 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
   )
 
+ENDIF()
+
+IF(${PROJECT_NAME}_ENABLE_Intrepid2 AND 
+   ${PROJECT_NAME}_ENABLE_Amesos2   AND
+   ${PROJECT_NAME}_ENABLE_Ifpack2   AND
+   ${PROJECT_NAME}_ENABLE_MueLu     AND
+   ${PROJECT_NAME}_ENABLE_Tpetra )
+
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/teuchos/src/sol)
+
+  # Need ROL_TpetraMultiVector.hpp
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
+
+  ROL_ADD_EXECUTABLE(
+    example_01_Kokkos
+    SOURCES example_01K.cpp
+    ADD_DIR_TO_NAME
+  )
+
+  ROL_COPY_FILES_TO_BINARY_DIR(
+    NonsmoothTR_KouriBaraldi2022_DataCopyK
+    SOURCE_FILES
+      input_ex01.xml
+      process3D.sh
+      meshfiles/truss3d.jou
+    SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}"
+    DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
+  )
 
 ENDIF()
diff --git a/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/example_01.cpp b/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/example_01.cpp
index a9823af36221..5b36374f1a0e 100644
--- a/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/example_01.cpp
+++ b/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/example_01.cpp
@@ -15,8 +15,8 @@
 #include "Teuchos_Comm.hpp"
 #include "Teuchos_Time.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
+#include "ROL_ParameterList.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -48,7 +48,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int>> comm
     = Tpetra::getDefaultComm();
   const int myRank = comm->getRank();
diff --git a/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/example_01K.cpp b/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/example_01K.cpp
new file mode 100644
index 000000000000..0b12c97b2d3e
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/example_01K.cpp
@@ -0,0 +1,182 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  example_01.cpp
+    \brief Shows how to solve the stuctural topology optimization problem.
+
+*/
+
+#include "Teuchos_Comm.hpp"
+#include "Teuchos_Time.hpp"
+#include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
+#include "ROL_ParameterList.hpp"
+
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Version.hpp"
+
+#include <iostream>
+#include <algorithm>
+
+#include "ROL_Solver.hpp"
+#include "ROL_Bounds.hpp"
+#include "ROL_ConstraintFromObjective.hpp"
+#include "ROL_LinearCombinationObjective.hpp"
+
+#include "../../TOOLS/pdevectorK.hpp"
+#include "../../TOOLS/pdeobjectiveK.hpp"
+#include "../../TOOLS/meshreaderK.hpp"
+
+#include "src/pde_elasticityK.hpp"
+#include "src/pde_filterK.hpp"
+#include "src/filtered_compliance_robjK.hpp"
+#include "src/volume_conK.hpp"
+#include "src/obj_volumeK.hpp"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+
+int main(int argc, char *argv[]) {
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+
+  /*** Initialize communicator. ***/
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  Kokkos::ScopeGuard kokkosScope (argc, argv);
+  auto comm = Tpetra::getDefaultComm();
+  const int myRank = comm->getRank();
+  if ((iprint > 0) && (myRank == 0))
+    outStream = ROL::makePtrFromRef(std::cout);
+  else
+    outStream = ROL::makePtrFromRef(bhs);
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+    RealT tol(1e-8);
+
+    /*** Read in XML input ***/
+    std::string filename = "input_ex01.xml";
+    auto parlist = ROL::getParametersFromXmlFile(filename);
+
+    // Retrieve parameters.
+    int probDim                = 3;
+    const RealT volFraction    = parlist->sublist("Problem").get("Volume Fraction", 0.4);
+    const RealT initDens       = volFraction;
+    const std::string minType  = "Compliance";
+    const std::string hessAppr = "None";
+    parlist->sublist("Problem").set("Problem Dimension",probDim);
+    parlist->sublist("Mesh").set("File Name","meshfiles/truss3d.txt");
+    parlist->sublist("Problem").set("Use Inexact Linear Solves",true);
+    parlist->sublist("General").set("Inexact Objective Function",true);
+    parlist->sublist("General").set("Inexact Gradient",true);
+
+    /*** Initialize main data structure. ***/
+    int nProcs = comm->getSize();
+    if (nProcs == 1) nProcs = 0;
+    auto meshMgr = ROL::makePtr<MeshReader<RealT,DeviceT>>(*parlist,nProcs);
+    // Initialize PDE describing elasticity equations.
+    auto pde = ROL::makePtr<PDE_Elasticity<RealT,DeviceT>>(*parlist);
+
+    // Initialize the filter PDE.
+    auto pdeFilter = ROL::makePtr<PDE_Filter<RealT,DeviceT>>(*parlist);
+    pde->setDensityFields(pdeFilter->getFields());
+
+    // Initialize reduced compliance objective function.
+    auto robj_com = ROL::makePtr<TopOptFilteredComplianceObjective<RealT,DeviceT>>(
+                    pde,pdeFilter,meshMgr,comm,*parlist,*outStream);
+    auto assembler = robj_com->getAssembler();
+    auto assemblerFilter = robj_com->getFilterAssembler();
+
+    // Create vectors.
+    auto u_ptr = assembler->createStateVector();         u_ptr->putScalar(0.0);
+    auto p_ptr = assembler->createStateVector();         p_ptr->putScalar(0.0);
+    auto r_ptr = assembler->createResidualVector();      r_ptr->putScalar(0.0);
+    auto z_ptr = assemblerFilter->createControlVector(); z_ptr->putScalar(1.0);
+    auto up = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(u_ptr,pde,assembler,*parlist);
+    auto pp = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(p_ptr,pde,assembler,*parlist);
+    auto rp = ROL::makePtr<PDE_DualSimVector<RealT,DeviceT>>(r_ptr,pde,assembler,*parlist);
+    auto zp = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(z_ptr,pdeFilter,assemblerFilter,*parlist);
+    auto imul = ROL::makePtr<ROL::SingletonVector<RealT>>(0);
+
+    // Build volume objective function.
+    auto qoi_vol = ROL::makePtr<QoI_Volume_TopoOpt<RealT,DeviceT>>(pdeFilter->getDensityFE(),volFraction);
+    auto con_vol = ROL::makePtr<TopOptVolumeConstraint<RealT,DeviceT>>(qoi_vol,assemblerFilter,zp);
+
+    // Normalize compliance objective function
+    zp->setScalar(initDens);
+    RealT cs = ROL::dynamicPtrCast<TopOptFilteredComplianceObjective<RealT,DeviceT>>(robj_com)->normalize(*zp,tol);
+
+    // Output problem details
+    *outStream << std::endl;
+    *outStream << "Problem Data"          << std::endl;
+    *outStream << "  Dimension:         " << probDim << std::endl;
+    *outStream << "  SIMP Power:        "
+               << parlist->sublist("Problem").get("SIMP Power",3.0) << std::endl;
+    *outStream << "  Volume Fraction:   " << volFraction << std::endl;
+    *outStream << "  Initial Density:   " << initDens    << std::endl;
+    *outStream << "  Compliance Scale:  " << cs  << std::endl;
+    *outStream << std::endl;
+
+    // Create objective, constraint, multiplier and bounds
+
+    // Initialize bound constraints.
+    RealT lval = 0.0, uval = 1.0;
+    auto lop = zp->clone(); lop->setScalar(lval);
+    auto hip = zp->clone(); hip->setScalar(uval);
+    auto bnd = ROL::makePtr<ROL::Bounds<RealT>>(lop,hip);
+
+    // Set up optimization problem.
+    auto prob = ROL::makePtr<ROL::Problem<RealT>>(robj_com,zp);
+    prob->addBoundConstraint(bnd);
+    prob->addLinearConstraint("Volume",con_vol,imul);
+    prob->setProjectionAlgorithm(*parlist);
+    prob->finalize(false,true,*outStream);
+
+    // Check derivatives.
+    bool derivCheck = parlist->sublist("Problem").get("Check derivatives",false);
+    if (derivCheck) prob->check(true,*outStream);
+
+    // Solve optimization problem.
+    ROL::Ptr<ROL::Solver<RealT>> solver;
+
+    // TRSPG
+    zp->setScalar(initDens);
+    imul->zero();
+    robj_com->reset();
+    con_vol->reset();
+    parlist->sublist("Step").set("Type","Trust Region");
+    parlist->sublist("Step").sublist("Trust Region").set("Subproblem Model","SPG");
+    Teuchos::Time trspgTimer("TRSPG Time", true);
+    solver = ROL::makePtr<ROL::Solver<RealT>>(prob,*parlist);
+    solver->solve(*outStream);
+    trspgTimer.stop();
+    *outStream << "Total optimization time = " << trspgTimer.totalElapsedTime() << " seconds.\n";
+    // Output.
+    robj_com->summarize(*outStream);
+    con_vol->summarize(*outStream);
+    robj_com->printToFile(*zp,*outStream);
+
+    // Get a summary from the time monitor.
+    Teuchos::TimeMonitor::summarize();
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/input_ex01.xml b/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/input_ex01.xml
index 437583e99351..d3afb862b0a4 100644
--- a/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/input_ex01.xml
+++ b/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/input_ex01.xml
@@ -123,7 +123,9 @@
       <Parameter name="Output Frequency"      type="int"    value="10"    />
       <Parameter name="Verbosity"             type="int"    value="100"   />
       <!-- Pseudo Block CG -->
+      <!--
       <Parameter name="Assert Positive Definiteness" type="bool" value="false" />
+      -->
       <!-- Block GMRES -->
       <!--
       <Parameter name="Num Blocks"            type="int"    value="40"    />
diff --git a/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/compliance_robjK.hpp b/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/compliance_robjK.hpp
new file mode 100644
index 000000000000..be4733cc9c7f
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/compliance_robjK.hpp
@@ -0,0 +1,636 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+#ifndef TOPOT_COMPLIANCE_ROBJK_H
+#define TOPOT_COMPLIANCE_ROBJK_H
+
+#include "ROL_Objective.hpp"
+#include "ROL_VectorController.hpp"
+#include "ROL_SecantFactory.hpp"
+#include "../../../TOOLS/pdeK.hpp"
+#include "../../../TOOLS/assemblerK.hpp"
+#include "../../../TOOLS/solver.hpp"
+#include "../../../TOOLS/pdevectorK.hpp"
+
+template <class Real, class DeviceType>
+class TopOptComplianceObjective : public ROL::Objective<Real> {
+private:
+  // Elasticity PDE
+  const ROL::Ptr<PDE<Real,DeviceType>> pde_;
+  ROL::Ptr<Assembler<Real,DeviceType>> assembler_;
+  ROL::Ptr<Solver<Real>> solver_, solver_cache_, solver_tmp_;
+  ROL::Ptr<Tpetra::CrsMatrix<>> matJ1_, matJ1_cache_, matJ1_tmp_;
+  ROL::Ptr<Tpetra::CrsMatrix<>> matJ2_, matJ2_cache_, matJ2_tmp_;
+  ROL::Ptr<Tpetra::CrsMatrix<>> matH22_;
+  bool assembleJ1_, assembleJ2_, assembleH22_, assembleF_;
+  ROL::ParameterList solverList_;
+  Real tol_, maxtol_;
+  bool useInexact_;
+
+  // Output information
+  const std::string uname_, dname_;
+
+  // Vector Storage
+  ROL::Ptr<Tpetra::MultiVector<>> F_data_, state_sens_data_;
+  ROL::Ptr<Tpetra::MultiVector<>> dstat_data_, dctrl_data_;
+  ROL::Ptr<ROL::Vector<Real>> state_, dctrl_;
+  ROL::Ptr<ROL::VectorController<Real>> stateStore_;
+
+  bool storage_;
+  Real cmpScale_;
+  bool nuke_;
+  int printFreq_;
+
+  unsigned nstat_, nsens_;
+  unsigned nupda_, nfval_, ngrad_, nhess_, nprec_;
+  unsigned napJ1_, nasJ1_, napJ2_, nasJ2_, napH2_, nasH2_, nasLo_;
+
+  // Hessian Approximation Data
+  std::string hessAppr_;
+  Real H2scale_;
+  ROL::Ptr<ROL::Secant<Real>> secant_;
+  ROL::Ptr<Tpetra::MultiVector<>> s_data_;
+  ROL::Ptr<ROL::Vector<Real>> g_new_, g_old_, s_, g_;
+  bool computeGrad_;
+
+public:
+  TopOptComplianceObjective(
+      const ROL::Ptr<PDE<Real,DeviceType>>                    &pde,
+      const ROL::Ptr<MeshManager<Real,DeviceType>>            &mesh,
+      const ROL::Ptr<const Teuchos::Comm<int>>     &comm,
+      ROL::ParameterList                           &list,
+      std::ostream                                 &stream = std::cout,
+      std::string                                   uname  = "state",
+      std::string                                   dname  = "density")
+    : pde_(pde), assembleJ1_(true), assembleJ2_(true),
+      assembleH22_(true), assembleF_(true), tol_(1e-2), maxtol_(1e-2),
+      uname_(uname), dname_(dname),
+      nstat_(0), nsens_(0),
+      nupda_(0), nfval_(0), ngrad_(0), nhess_(0), nprec_(0),
+      napJ1_(0), nasJ1_(0), napJ2_(0), nasJ2_(0), napH2_(0), nasH2_(0), nasLo_(0), H2scale_(1) {
+    // Elasticity PDE
+    assembler_ = ROL::makePtr<Assembler<Real,DeviceType>>(pde_->getFields(),
+                                               pde_->getFields2(),
+                                               mesh,comm,list,stream);
+    assembler_->setCellNodes(*pde_);
+    solver_       = ROL::makePtr<Solver<Real>>(list.sublist("Solver"));
+    solver_cache_ = ROL::makePtr<Solver<Real>>(list.sublist("Solver"));
+    solverList_   = list.sublist("Solver");
+    useInexact_   = list.sublist("Problem").get("Use Inexact Linear Solves",false);
+
+    // Vector storage
+    F_data_           = assembler_->createResidualVector();
+    state_sens_data_  = assembler_->createStateVector();
+    dstat_data_       = assembler_->createResidualVector();
+    dctrl_data_       = assembler_->createControlVector();
+    state_            = ROL::makePtr<PDE_PrimalSimVector<Real,DeviceType>>(assembler_->createStateVector(),pde_,assembler_,list);
+    dctrl_            = ROL::makePtr<PDE_DualOptVector<Real,DeviceType>>(dctrl_data_,pde_,assembler_,list);
+    stateStore_       = ROL::makePtr<ROL::VectorController<Real>>();
+
+    storage_   = list.sublist("Problem").get("Use state storage", true);
+    cmpScale_  = list.sublist("Problem").get("Compliance Scaling", 1.0);
+    hessAppr_  = list.sublist("Problem").get("Hessian Approximation", "None");
+    nuke_      = list.sublist("Problem").get("Use Basic Update",false);
+    printFreq_ = list.sublist("Problem").get("Output Frequency",0);
+
+    computeGrad_ = true;
+    if (hessAppr_ == "Secant") {
+      s_data_ = assembler_->createControlVector();
+      s_      = ROL::makePtr<PDE_PrimalOptVector<Real,DeviceType>>(s_data_,pde_,assembler_,list);
+      g_      = ROL::makePtr<PDE_DualOptVector<Real,DeviceType>>(assembler_->createControlVector(),pde_,assembler_,list);
+      g_old_  = ROL::makePtr<PDE_DualOptVector<Real,DeviceType>>(assembler_->createControlVector(),pde_,assembler_,list);
+      g_new_  = ROL::makePtr<PDE_DualOptVector<Real,DeviceType>>(assembler_->createControlVector(),pde_,assembler_,list);
+      secant_ = ROL::SecantFactory<Real>(list);
+    }
+
+    initialize();
+  }
+
+  const ROL::Ptr<Assembler<Real,DeviceType>> getAssembler(void) const {
+    return assembler_;
+  }
+
+  void update(const ROL::Vector<Real> &z, ROL::UpdateType type, int iter = -1) {
+    nupda_++;
+    stateStore_->objectiveUpdate(type);
+    if (nuke_) {
+      update_temp(z,iter);
+    }
+    else {
+      switch (type) {
+        case ROL::UpdateType::Initial: update_initial(z,iter); break;
+        case ROL::UpdateType::Accept:  update_accept(z,iter);  break;
+        case ROL::UpdateType::Revert:  update_revert(z,iter);  break;
+        case ROL::UpdateType::Trial:   update_trial(z,iter);   break;
+        case ROL::UpdateType::Temp:    update_temp(z,iter);    break;
+      }
+    }
+    // Print
+    if (printFreq_ > 0 && iter%printFreq_ == 0) {
+      std::stringstream ufile, zfile;
+      ufile  << uname_ << "_" << iter << ".txt";
+      zfile  << dname_ << "_" << iter << ".txt";
+      ROL::Ptr<const Tpetra::MultiVector<>> z_data = getConstField(z);
+      ROL::Ptr<const Tpetra::MultiVector<>> u_data = getConstField(*state_);
+      assembler_->outputTpetraVector(u_data, ufile.str());
+      assembler_->outputTpetraVector(z_data, zfile.str());
+    }
+  }
+
+  void update( const ROL::Vector<Real> &z, bool flag = true, int iter = -1 ) {
+    nupda_++;
+    stateStore_->objectiveUpdate(true);
+    if (nuke_) {
+      update_temp(z,iter);
+    }
+    else {
+      // Trial:    flag = false, iter = -1
+      // Check:    flag = true,  iter = -1
+      // Reject:   flag = false, iter > -1
+      // Accept:   flag = true,  iter > -1
+      if (flag) {
+        if (iter > -1) {
+          update_accept(z,iter);
+        }
+        else {
+          update_temp(z,iter);
+        }
+      }
+      else {
+        if (iter > -1) {
+          update_revert(z,iter);
+        }
+        else {
+          update_trial(z,iter);
+        }
+      }
+    }
+  }
+
+  Real normalize(const ROL::Vector<Real> &z, Real &tol) {
+    update(z,ROL::UpdateType::Temp);
+    Real val = value(z,tol);
+    cmpScale_ /= val;
+    return cmpScale_;
+  }
+
+  void printToFile(const ROL::Vector<Real> &z, std::ostream &stream = std::cout,
+                   const std::string ufile = "state.txt", const std::string dfile = "density.txt") {
+    Real tol(1e-2*std::sqrt(ROL::ROL_EPSILON<Real>()));
+    update(z,ROL::UpdateType::Temp);
+    ROL::Ptr<const Tpetra::MultiVector<>> z_data = getConstField(z);
+    ROL::Ptr<Tpetra::MultiVector<>>       u_data = getField(*state_);
+    solve_state_equation(u_data, z_data, tol);
+    assembler_->outputTpetraVector(u_data, ufile);
+    assembler_->outputTpetraVector(z_data, dfile);
+    assembler_->printMeshData(stream);
+  }
+
+  void solveState(ROL::Vector<Real> &u, const ROL::Vector<Real> &z) {
+    Real tol(1e-2*std::sqrt(ROL::ROL_EPSILON<Real>()));
+    update(z,ROL::UpdateType::Temp);
+    ROL::Ptr<const Tpetra::MultiVector<>> z_data = getConstField(z);
+    ROL::Ptr<Tpetra::MultiVector<>>       u_data = getField(u);
+    solve_state_equation(u_data, z_data, tol);
+  }
+
+  Real value( const ROL::Vector<Real> &z, Real &tol ) {
+    std::cout << "Value: " << tol << std::endl;
+    nfval_++;
+    ROL::Ptr<const Tpetra::MultiVector<>> z_data = getConstField(z);
+    ROL::Ptr<Tpetra::MultiVector<>>       u_data = getField(*state_);
+    // Solve state equation
+    solve_state_equation(*state_,z,tol);
+    // Compute compliance
+    Teuchos::Array<Real> cmp(1,0);
+    F_data_->dot(*u_data, cmp.view(0,1));
+    return cmpScale_*cmp[0];
+  }
+
+  void gradient( ROL::Vector<Real> &g, const ROL::Vector<Real> &z, Real &tol ) {
+    ngrad_++;
+    std::cout << "Gradient: " << tol << std::endl;
+    if (hessAppr_ == "Secant") {
+      // Will not work for stochastic problems
+      if (computeGrad_) {
+        ROL::Ptr<Tpetra::MultiVector<>>       g_data = getField(g);
+        ROL::Ptr<const Tpetra::MultiVector<>> z_data = getConstField(z);
+        ROL::Ptr<Tpetra::MultiVector<>>       u_data = getField(*state_);
+        // Solve state equation
+        solve_state_equation(*state_,z,tol);
+        // Apply adjoint density jacobian
+        assembleJ2(u_data, z_data);
+        applyJacobian2(g_data, u_data, true);
+        g_data->scale(-cmpScale_);
+
+        g_->set(g);
+        computeGrad_ = false;
+      }
+      else {
+        g.set(*g_);
+      }
+    }
+    else {
+      ROL::Ptr<Tpetra::MultiVector<>>       g_data = getField(g);
+      ROL::Ptr<const Tpetra::MultiVector<>> z_data = getConstField(z);
+      ROL::Ptr<Tpetra::MultiVector<>>       u_data = getField(*state_);
+      // Solve state equation
+      solve_state_equation(*state_,z,tol);
+      // Apply adjoint density jacobian
+      assembleJ2(u_data, z_data);
+      applyJacobian2(g_data, u_data, true);
+      g_data->scale(-cmpScale_);
+    }
+  }
+
+  void hessVec( ROL::Vector<Real> &hv, const ROL::Vector<Real> &v, const ROL::Vector<Real> &z, Real &tol ) {
+    nhess_++;
+    if (hessAppr_ == "Zero") {
+      hv.zero();
+    }
+    else if (hessAppr_ == "Positive Riesz") {
+      hv.set(v.dual());
+    }
+    else if (hessAppr_ == "Negative Riesz") {
+      hv.set(v.dual());
+      hv.scale(static_cast<Real>(-1));
+    }
+    else if (hessAppr_ == "Negative") {
+      ROL::Ptr<Tpetra::MultiVector<>>       h_data = getField(hv);
+      ROL::Ptr<const Tpetra::MultiVector<>> v_data = getConstField(v);
+      ROL::Ptr<const Tpetra::MultiVector<>> z_data = getConstField(z);
+      ROL::Ptr<Tpetra::MultiVector<>>       u_data = getField(*state_);
+      // Solve state equation
+      solve_state_equation(*state_,z,tol_);
+      // Apply density hessian to direction
+      assembleH22(u_data, z_data);
+      applyHessian22(h_data, v_data);
+      h_data->scale(-cmpScale_);
+    }
+    else if (hessAppr_ == "Positive") {
+      ROL::Ptr<Tpetra::MultiVector<>>       h_data = getField(hv);
+      ROL::Ptr<const Tpetra::MultiVector<>> v_data = getConstField(v);
+      ROL::Ptr<const Tpetra::MultiVector<>> z_data = getConstField(z);
+      ROL::Ptr<Tpetra::MultiVector<>>       u_data = getField(*state_);
+      // Solve state equation
+      solve_state_equation(*state_,z,tol_);
+      // Solve state sensitivity equation
+      assembleJ2(u_data, z_data);
+      solve_state_sensitivity(state_sens_data_, v_data, tol_);
+      // Apply adjoint density jacobian to state sensitivity
+      applyJacobian2(h_data, state_sens_data_, true);
+      h_data->scale(static_cast<Real>(2)*cmpScale_);
+    }
+    else if (hessAppr_ == "No Solve") {
+      ROL::Ptr<Tpetra::MultiVector<>>       h_data = getField(hv);
+      ROL::Ptr<const Tpetra::MultiVector<>> v_data = getConstField(v);
+      ROL::Ptr<const Tpetra::MultiVector<>> z_data = getConstField(z);
+      ROL::Ptr<Tpetra::MultiVector<>>       u_data = getField(*state_);
+      // Solve state equation
+      solve_state_equation(*state_,z,tol_);
+      // Apply density hessian to direction
+      assembleH22(u_data, z_data);
+      applyHessian22(h_data, v_data);
+      // Approximate positive term
+      assembleJ2(u_data, z_data);
+      applyJacobian2(dstat_data_, v_data, false);
+      applyJacobian2(dctrl_data_, dstat_data_, true);
+      // Approximation of inv(K) by aI where a solves min||aF-U||
+      // or b=1/a solves min||F-bU||
+      Teuchos::Array<Real> UdotF(1,0), UdotU(1,0);
+      u_data->dot(*F_data_,UdotF.view(0,1));
+      F_data_->dot(*F_data_,UdotU.view(0,1));
+      H2scale_ = UdotF[0]/UdotU[0];
+      //u_data->dot(*u_data,UdotU.view(0,1));
+      //H2scale_ = UdotU[0]/UdotF[0];
+      h_data->update(static_cast<Real>(2)*cmpScale_*H2scale_,*dctrl_data_,-cmpScale_);
+    }
+    else if (hessAppr_ == "Secant") {
+      // Will not work for stochastic problems
+      ROL::Ptr<Tpetra::MultiVector<>>       h_data = getField(hv);
+      ROL::Ptr<const Tpetra::MultiVector<>> v_data = getConstField(v);
+      ROL::Ptr<const Tpetra::MultiVector<>> z_data = getConstField(z);
+      ROL::Ptr<Tpetra::MultiVector<>>       u_data = getField(*state_);
+      // Solve state equation
+      solve_state_equation(*state_,z,tol_);
+      // Apply density hessian to direction
+      assembleH22(u_data, z_data);
+      applyHessian22(h_data, v_data);
+      // Apply secant approximation of positive term
+      secant_->applyB(*dctrl_, v);
+      h_data->update(static_cast<Real>(2)*cmpScale_,*dctrl_data_,-cmpScale_);
+    }
+    else {
+      ROL::Ptr<Tpetra::MultiVector<>>       h_data = getField(hv);
+      ROL::Ptr<const Tpetra::MultiVector<>> v_data = getConstField(v);
+      ROL::Ptr<const Tpetra::MultiVector<>> z_data = getConstField(z);
+      ROL::Ptr<Tpetra::MultiVector<>>       u_data = getField(*state_);
+      // Solve state equation
+      std::cout << "HessVec 1: " << tol_ << std::endl;
+      solve_state_equation(*state_,z,tol_);
+      // Solve state sensitivity equation
+      assembleJ2(u_data, z_data);
+      std::cout << "HessVec 2: " << tol_ << std::endl;
+      solve_state_sensitivity(state_sens_data_, v_data, tol_);
+      // Apply density hessian to direction
+      assembleH22(u_data, z_data);
+      applyHessian22(h_data, v_data);
+      // Apply adjoint density jacobian to state sensitivity
+      applyJacobian2(dctrl_data_, state_sens_data_, true);
+      h_data->update(static_cast<Real>(2)*cmpScale_,*dctrl_data_,-cmpScale_);
+    }
+  }
+
+  /** \brief Apply a reduced Hessian preconditioner.
+  */
+  virtual void precond( ROL::Vector<Real> &Pv, const ROL::Vector<Real> &v, const ROL::Vector<Real> &z, Real &tol ) {
+    nprec_++;
+    Pv.set(v.dual());
+  }
+
+  void summarize(std::ostream &stream) const {
+    stream << std::endl;
+    stream << std::string(114,'=') << std::endl;
+    stream << "  Objective_Compliance::summarize" << std::endl;
+    stream << "    Number of calls to update:          " << nupda_ << std::endl;
+    stream << "    Number of calls to value:           " << nfval_ << std::endl;
+    stream << "    Number of calls to gradient:        " << ngrad_ << std::endl;
+    stream << "    Number of calls to hessVec:         " << nhess_ << std::endl;
+    stream << "    Number of calls to precond:         " << nprec_ << std::endl;
+    stream << "    Number of state solves:             " << nstat_ << std::endl;
+    stream << "    Number of state sensitivity solves: " << nsens_ << std::endl;
+    stream << "    Number of load assemblies:          " << nasLo_ << std::endl;
+    stream << "    Number of Jacobian_1 assemblies:    " << nasJ1_ << std::endl;
+    stream << "    Number of Jacobian_1 sovles:        " << napJ1_ << std::endl;
+    stream << "    Number of Jacobian_2 assemblies:    " << nasJ2_ << std::endl;
+    stream << "    Number of Jacobian_2 applies:       " << napJ2_ << std::endl;
+    stream << "    Number of Hessian_22 assemblies:    " << nasH2_ << std::endl;
+    stream << "    Number of Hessian_22 applies:       " << napH2_ << std::endl;
+    stream << std::string(114,'=') << std::endl;
+    stream << std::endl;
+  }
+
+  void reset(void) {
+    nupda_ = 0; nfval_ = 0; ngrad_ = 0; nhess_ = 0; nprec_ = 0; nstat_ = 0; nsens_ = 0;
+    nasLo_ = 0; nasJ1_ = 0; napJ1_ = 0; nasJ2_ = 0; napJ2_ = 0; nasH2_ = 0; napH2_ = 0;
+  }
+
+// For parametrized (stochastic) objective functions and constraints
+public:
+  void setParameter(const std::vector<Real> &param) {
+    ROL::Objective<Real>::setParameter(param);
+    pde_->setParameter(param);
+    assembleF_ = true;
+  }
+
+private:
+
+  ROL::Ptr<const Tpetra::MultiVector<>> getConstField(const ROL::Vector<Real> &x) const {
+    ROL::Ptr<const Tpetra::MultiVector<>> xp;
+    try {
+      xp = dynamic_cast<const ROL::TpetraMultiVector<Real>&>(x).getVector();
+    }
+    catch (std::exception &e) {
+      try {
+        ROL::Ptr<const ROL::TpetraMultiVector<Real>> xvec
+          = dynamic_cast<const PDE_OptVector<Real>&>(x).getField();
+        if (xvec == ROL::nullPtr) {
+          xp = ROL::nullPtr;
+        }
+        else {
+          xp = xvec->getVector();
+        }
+      }
+      catch (std::exception &ee) {
+        xp = ROL::nullPtr;
+      }
+    }
+    return xp;
+  }
+
+  ROL::Ptr<Tpetra::MultiVector<>> getField(ROL::Vector<Real> &x) const {
+    ROL::Ptr<Tpetra::MultiVector<>> xp;
+    try {
+      xp = dynamic_cast<ROL::TpetraMultiVector<Real>&>(x).getVector();
+    }
+    catch (std::exception &e) {
+      try {
+        ROL::Ptr<ROL::TpetraMultiVector<Real>> xvec
+          = dynamic_cast<PDE_OptVector<Real>&>(x).getField();
+        if ( xvec == ROL::nullPtr ) {
+          xp = ROL::nullPtr;
+        }
+        else {
+          xp = xvec->getVector();
+        }
+      }
+      catch (std::exception &ee) {
+        xp = ROL::nullPtr;
+      }
+    }
+    return xp;
+  }
+
+  void initialize(void) {
+    ROL::Ptr<Tpetra::MultiVector<>> u_data = getField(*state_);
+    u_data->putScalar(0.0);
+    dctrl_data_->putScalar(1.0);
+    assembleJ1_     = true;
+    assembleJ2_     = true;
+    assembleJ1(u_data, dctrl_data_,false);
+    assembleJ2(u_data, dctrl_data_);
+    matJ1_tmp_      = matJ1_;
+    matJ2_tmp_      = matJ2_;
+    matJ1_          = matJ1_cache_;
+    matJ2_          = matJ2_cache_;
+    assembleJ1_     = true;
+    assembleJ2_     = true;
+    assembleJ1(u_data, dctrl_data_,false);
+    assembleJ2(u_data, dctrl_data_);
+    matJ1_cache_    = matJ1_;
+    matJ2_cache_    = matJ2_;
+    solver_tmp_     = solver_;
+    solver_         = solver_cache_;
+    assembleJ1_     = true;
+    assembleJ2_     = true;
+    // Upon exit, matJ1, matJ2 and solver are set to their cached versions
+  }
+
+  // Assemble load vector
+  void assembleF(void) {
+    if (assembleF_) {
+      nasLo_++;
+      ROL::Ptr<Tpetra::MultiVector<>> u0 = assembler_->createStateVector();
+      ROL::Ptr<Tpetra::MultiVector<>> z1 = assembler_->createControlVector();
+      u0->putScalar(0.0);
+      z1->putScalar(1.0);
+      assembler_->assemblePDEResidual(F_data_,pde_,u0,z1);
+      F_data_->scale(static_cast<Real>(-1));
+      assembleF_ = false;
+    }
+  }
+
+  void assembleJ2(const ROL::Ptr<const Tpetra::MultiVector<>> &u, const ROL::Ptr<const Tpetra::MultiVector<>> &z) {
+    if (assembleJ2_) {
+      nasJ2_++;
+      assembler_->assemblePDEJacobian2(matJ2_,pde_,u,z);
+      assembleJ2_ = false;
+    }
+  }
+
+  void applyJacobian2(ROL::Ptr<Tpetra::MultiVector<>> &Jv, const ROL::Ptr<const Tpetra::MultiVector<>> &v,
+                      const bool transpose) {
+    napJ2_++;
+    if (transpose) {
+      matJ2_->apply(*v,*Jv,Teuchos::TRANS);
+    }
+    else {
+      matJ2_->apply(*v,*Jv);
+    }
+  }
+
+  void assembleH22(const ROL::Ptr<const Tpetra::MultiVector<>> &u, const ROL::Ptr<const Tpetra::MultiVector<>> &z) {
+    if (assembleH22_) {
+      nasH2_++;
+      assembler_->assemblePDEHessian22(matH22_,pde_,u,u,z);
+      assembleH22_ = false;
+    }
+  }
+
+  void applyHessian22(ROL::Ptr<Tpetra::MultiVector<>> &Hv, const ROL::Ptr<const Tpetra::MultiVector<>> &v) {
+    matH22_->apply(*v,*Hv);
+    napH2_++;
+  }
+
+  void assembleJ1(const ROL::Ptr<const Tpetra::MultiVector<>> &u, const ROL::Ptr<const Tpetra::MultiVector<>> &z, bool setSolve = true) {
+    if (assembleJ1_) {
+      nasJ1_++;
+      assembler_->assemblePDEJacobian1(matJ1_,pde_,u,z);
+      if (setSolve) {
+        solver_->setA(matJ1_);
+      }
+      assembleJ1_ = false;
+    }
+  }
+
+  void solve(ROL::Ptr<Tpetra::MultiVector<>> &x, const ROL::Ptr<const Tpetra::MultiVector<>> &b, Real &tol) {
+    napJ1_++;
+    if (useInexact_) {
+      solverList_.sublist("Belos").set("Convergence Tolerance",tol);
+      solver_->setParameters(solverList_);
+    }
+    solver_->solve(x,b,false);
+  }
+
+  void solve_state_equation(ROL::Vector<Real> &u, const ROL::Vector<Real> &z, Real &tol) {
+    bool isComputed = false;
+    Real mytol = std::min(maxtol_,tol);
+    std::cout << "Tolerance for state: " << mytol << std::endl;
+    if (storage_ && tol_ <= mytol) {
+      isComputed = stateStore_->get(u, ROL::Objective<Real>::getParameter());
+    }
+    if (!isComputed || !storage_ || tol_ > mytol) {
+      ROL::Ptr<const Tpetra::MultiVector<>> z_data = getConstField(z);
+      ROL::Ptr<Tpetra::MultiVector<>>       u_data = getField(u);
+      solve_state_equation(u_data,z_data,mytol);
+      tol_ = mytol;
+      if (storage_) {
+        stateStore_->set(u, ROL::Objective<Real>::getParameter());
+      }
+    }
+  }
+
+  void solve_state_equation(ROL::Ptr<Tpetra::MultiVector<>> &u, const ROL::Ptr<const Tpetra::MultiVector<>> &z, Real &tol) {
+    nstat_++;
+    assembleF(); assembleJ1(u,z,true);
+    solve(u,F_data_,tol);
+  }
+
+  void solve_state_sensitivity(ROL::Ptr<Tpetra::MultiVector<>> &s, const ROL::Ptr<const Tpetra::MultiVector<>> &v, Real &tol) {
+    // This assumes that assembleJ1 and assemble J2 were called
+    nsens_++;
+    applyJacobian2(dstat_data_, v, false);
+    solve(s, dstat_data_, tol);
+  }
+
+  void update_initial( const ROL::Vector<Real> &z, int iter ) {
+    assembleJ1_   = true;
+    assembleJ2_   = true;
+    assembleH22_  = true;
+
+    // For secant approximation
+    if (hessAppr_ == "Secant") {
+      computeGrad_ = true;
+      const Real half(0.5), one(1);
+      Real tol(std::sqrt(ROL::ROL_EPSILON<Real>()));
+      gradient(*g_old_,z,tol);       // g_old = J'(z)
+      g_old_->scale(half/cmpScale_); // g_old = J'(z)/2c
+      s_->set(z); s_->scale(-one);   // s     = -z
+    }
+  }
+
+  void update_accept( const ROL::Vector<Real> &z, int iter ) {
+    // Push trial state storage into base state storage.
+    solver_tmp_       = solver_cache_;
+    solver_cache_     = solver_;
+    matJ1_tmp_        = matJ1_cache_;
+    matJ1_cache_      = matJ1_;
+    matJ2_tmp_        = matJ2_cache_;
+    matJ2_cache_      = matJ2_;
+    assembleJ1_       = false;
+    // If assembleJ2 was not called, then assembleJ2_ = true
+    assembleH22_      = true;
+
+    // For secant update
+    // Secant Equation: B s = 1/2c(J'(z) - (J'(z_old) + H s))
+    if (hessAppr_ == "Secant") {
+      computeGrad_ = true;
+      const Real half(0.5), one(1);
+      Real tol(std::sqrt(ROL::ROL_EPSILON<Real>()));
+      s_->plus(z);                         // s     = z - z_old
+      applyHessian22(dctrl_data_,s_data_); // dctrl = -H s
+      g_old_->axpy(-half,*dctrl_);         // g_old = (J'(z_old) + cH s)/2c
+      gradient(*g_new_,z,tol);             // gnew  = J'(z)
+      g_new_->scale(half/cmpScale_);       // gnew  = J'(z)/2c
+      secant_->updateStorage(z,*g_new_,*g_old_,*s_,s_->norm(),iter);
+      s_->set(z); s_->scale(-one);         // s     = -z
+      g_old_->set(*g_new_);                // g_old = J'(z)/2c
+    }
+  }
+
+  void update_temp( const ROL::Vector<Real> &z, int iter ) {
+    assembleJ1_     = true;
+    assembleJ2_     = true;
+    assembleH22_    = true;
+    if (hessAppr_ == "Secant") {
+      computeGrad_    = true;
+    }
+  }
+
+  void update_trial( const ROL::Vector<Real> &z, int iter ) {
+    solver_       = solver_tmp_;
+    matJ1_        = matJ1_tmp_;
+    matJ2_        = matJ2_tmp_;
+    assembleJ1_   = true;
+    assembleJ2_   = true;
+    assembleH22_  = false;
+  }
+
+  void update_revert( const ROL::Vector<Real> &z, int iter ) {
+    solver_       = solver_cache_;
+    matJ1_        = matJ1_cache_;
+    matJ2_        = matJ2_cache_;
+    assembleJ1_   = false;
+    // If assembleJ2 was not called, then assembleJ2_ = true
+    assembleH22_  = false;
+    // If gradient was not called, then computeGrad_ = true
+  }
+}; // class TopOptComplianceObjective
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/dirichletK.hpp b/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/dirichletK.hpp
new file mode 100644
index 000000000000..87a43e825336
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/dirichletK.hpp
@@ -0,0 +1,191 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  dirichlet.hpp
+    \brief Implements Dirichlet boundary conditions for the structural
+           topology optimization problem.
+*/
+
+#ifndef ROL_PDEOPT_ELASTICITY_DIRICHLETK_HPP
+#define ROL_PDEOPT_ELASTICITY_DIRICHLETK_HPP
+
+#include "ROL_Ptr.hpp"
+#include "ROL_ParameterList.hpp"
+#include "../../../TOOLS/feK.hpp"
+#include <vector>
+
+template<class Real, class DeviceType>
+class Dirichlet {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  std::vector<int> sidesets_, types_;
+  std::vector<std::vector<scalar_view>> bdryCellNodes_;
+  std::vector<std::vector<std::vector<int>>> bdryCellLocIds_;
+  std::vector<std::vector<int>> fidx_;
+
+  std::vector<int> getConstrainedDimensions(int type, int dim) const {
+    std::vector<int> conDim;
+    if ( type == 1 ) {
+      conDim.push_back(0);
+    }
+    else if ( type == 2 ) {
+      conDim.push_back(1);
+    }
+    else if ( type == 3 && dim > 2 ) {
+      conDim.push_back(2);
+    }
+    else if ( type == 4 && dim > 2 ) {
+      conDim.push_back(0);
+      conDim.push_back(1);
+    }
+    else if ( type == 5 && dim > 2 ) {
+      conDim.push_back(0);
+      conDim.push_back(2);
+    }
+    else if ( type == 6 && dim > 2 ) {
+      conDim.push_back(1);
+      conDim.push_back(2);
+    }
+    else {
+      for (int i = 0; i < dim; ++i) {
+        conDim.push_back(i);
+      }
+    }
+    return conDim;
+  }
+
+public:
+  Dirichlet(void) {
+    sidesets_.push_back(0);
+    types_.push_back(0);
+  }
+
+  void setCellNodes(const std::vector<std::vector<scalar_view>> &bdryCellNodes,
+                    const std::vector<std::vector<std::vector<int>>> &bdryCellLocIds,
+                    const std::vector<std::vector<int>> fidx) {
+    bdryCellNodes_  = bdryCellNodes;
+    bdryCellLocIds_ = bdryCellLocIds;
+    fidx_           = fidx;
+  }
+
+  void applyResidual(std::vector<scalar_view> &R,
+                     const std::vector<scalar_view> &U) const {
+    const int d = R.size();
+    const int numSideSets = sidesets_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        const std::vector<int> conDim = getConstrainedDimensions(types_[i],d);
+        const int numConDim = conDim.size();
+        const int numLocalSideIds = bdryCellLocIds_[sidesets_[i]].size();
+        for (int j = 0; j < numLocalSideIds; ++j) {
+          const int numCellsSide = bdryCellLocIds_[sidesets_[i]][j].size();
+          const int numBdryDofs = fidx_[j].size();
+          for (int k = 0; k < numCellsSide; ++k) {
+            const int cidx = bdryCellLocIds_[sidesets_[i]][j][k];
+            for (int l = 0; l < numBdryDofs; ++l) {
+              for (int m=0; m < numConDim; ++m) {
+                (R[conDim[m]])(cidx,fidx_[j][l]) = (U[conDim[m]])(cidx,fidx_[j][l]);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  void applyJacobian1(std::vector<std::vector<scalar_view>> &J) const {
+    const int d = J.size();
+    const int f = J[0][0].extent_int(1);
+    const int numSideSets = sidesets_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        const std::vector<int> conDim = getConstrainedDimensions(types_[i],d);
+        const int numConDim = conDim.size();
+        const int numLocalSideIds = bdryCellLocIds_[sidesets_[i]].size();
+        for (int j = 0; j < numLocalSideIds; ++j) {
+          const int numCellsSide = bdryCellLocIds_[sidesets_[i]][j].size();
+          const int numBdryDofs = fidx_[j].size();
+          for (int k = 0; k < numCellsSide; ++k) {
+            const int cidx = bdryCellLocIds_[sidesets_[i]][j][k];
+            for (int l = 0; l < numBdryDofs; ++l) {
+              for (int m=0; m < f; ++m) {
+                for (int n=0; n < numConDim; ++n) {
+                  for (int p=0; p < d; ++p) {
+                    (J[conDim[n]][p])(cidx,fidx_[j][l],m) = static_cast<Real>(0);
+                  }
+                  (J[conDim[n]][conDim[n]])(cidx,fidx_[j][l],fidx_[j][l]) = static_cast<Real>(1);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  void applyJacobian2(std::vector<std::vector<scalar_view>> &J) const {
+    const int d = J.size();
+    const int f = J[0][0].extent_int(2);
+    const int numSideSets = sidesets_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        const std::vector<int> conDim = getConstrainedDimensions(types_[i],d);
+        const int numConDim = conDim.size();
+        const int numLocalSideIds = bdryCellLocIds_[sidesets_[i]].size();
+        for (int j = 0; j < numLocalSideIds; ++j) {
+          const int numCellsSide = bdryCellLocIds_[sidesets_[i]][j].size();
+          const int numBdryDofs = fidx_[j].size();
+          for (int k = 0; k < numCellsSide; ++k) {
+            const int cidx = bdryCellLocIds_[sidesets_[i]][j][k];
+            for (int l = 0; l < numBdryDofs; ++l) {
+              for (int m=0; m < f; ++m) {
+                for (int n=0; n < numConDim; ++n) {
+                  (J[conDim[n]][0])(cidx,fidx_[j][l],m) = static_cast<Real>(0);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  void applyMultiplier(std::vector<scalar_view> &L) const {
+    const int d = L.size();
+    const int numSideSets = sidesets_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        const std::vector<int> conDim = getConstrainedDimensions(types_[i],d);
+        const int numConDim = conDim.size();
+        const int numLocalSideIds = bdryCellLocIds_[sidesets_[i]].size();
+        for (int j = 0; j < numLocalSideIds; ++j) {
+          const int numCellsSide = bdryCellLocIds_[sidesets_[i]][j].size();
+          const int numBdryDofs = fidx_[j].size();
+          for (int k = 0; k < numCellsSide; ++k) {
+            const int cidx = bdryCellLocIds_[sidesets_[i]][j][k];
+            for (int l = 0; l < numBdryDofs; ++l) {
+              for (int m=0; m < numConDim; ++m) {
+                (L[conDim[m]])(cidx,fidx_[j][l]) = static_cast<Real>(0);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/filtered_compliance_robjK.hpp b/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/filtered_compliance_robjK.hpp
new file mode 100644
index 000000000000..b62c8037ebed
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/filtered_compliance_robjK.hpp
@@ -0,0 +1,309 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+#ifndef TOPOPT_FILTERED_COMPLIANCE_ROBJK_H
+#define TOPOPT_FILTERED_COMPLIANCE_ROBJK_H
+
+#include "ROL_Objective.hpp"
+#include "../../../TOOLS/pdeK.hpp"
+#include "../../../TOOLS/assemblerK.hpp"
+#include "../../../TOOLS/solver.hpp"
+#include "../../../TOOLS/pdevectorK.hpp"
+#include "compliance_robjK.hpp"
+
+template <class Real, class DeviceType>
+class TopOptFilteredComplianceObjective : public ROL::Objective<Real> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  // Compliance objective function
+  ROL::Ptr<TopOptComplianceObjective<Real,DeviceType>> comp_;
+
+  // FIlter PDE
+  const ROL::Ptr<PDE<Real,DeviceType>> pde_filter_;
+  ROL::Ptr<Assembler<Real,DeviceType>> assembler_filter_;
+  ROL::Ptr<Solver<Real>> solver_filter_;
+  ROL::Ptr<Tpetra::CrsMatrix<>> matF1_, matF2_;
+
+  // Vector Storage
+  ROL::Ptr<Tpetra::MultiVector<>> f_state_, f_res_;
+  ROL::Ptr<ROL::Vector<Real>> Fz_, Fzcache_, Fv_, dctrl_;
+
+  bool nuke_;
+  int printFreq_;
+
+  unsigned nupda_, nfval_, ngrad_, nhess_, nprec_;
+  unsigned nasFi_, napFi_;
+
+public:
+  TopOptFilteredComplianceObjective(
+      const ROL::Ptr<PDE<Real,DeviceType>>         &pde,
+      const ROL::Ptr<PDE<Real,DeviceType>>         &filter,
+      const ROL::Ptr<MeshManager<Real,DeviceType>> &mesh,
+      const ROL::Ptr<const Teuchos::Comm<int>>     &comm,
+      ROL::ParameterList                           &list,
+      std::ostream                                 &stream = std::cout)
+    : pde_filter_(filter),
+      nupda_(0), nfval_(0), ngrad_(0), nhess_(0), nprec_(0),
+      nasFi_(0), napFi_(0) {
+    // Compliance objective function
+    comp_ = ROL::makePtr<TopOptComplianceObjective<Real,DeviceType>>(pde,mesh,comm,list,stream,"state","filtered_density");
+
+    // Filter PDE
+    assembler_filter_ = ROL::makePtr<Assembler<Real,DeviceType>>(pde_filter_->getFields(),
+                                                      pde_filter_->getFields2(),
+                                                      mesh,comm,list,stream);
+    assembler_filter_->setCellNodes(*pde_filter_);
+    ROL::ParameterList solverList = list.sublist("Solver");
+    solverList.set("Preconditioner","MueLu");
+    solverList.sublist("MueLu").set("number of equations",1);
+    solverList.sublist("MueLu").set("problem: symmetric",true);
+    solverList.sublist("Belos").set("Convergence Tolerance",1e-10);
+    solver_filter_    = ROL::makePtr<Solver<Real>>(solverList);
+
+    // Vector storage
+    f_state_ = assembler_filter_->createStateVector();
+    f_res_   = assembler_filter_->createResidualVector();
+    Fz_      = ROL::makePtr<PDE_PrimalSimVector<Real,DeviceType>>(assembler_filter_->createStateVector(),pde_filter_,assembler_filter_,list);
+    Fzcache_ = ROL::makePtr<PDE_PrimalSimVector<Real,DeviceType>>(assembler_filter_->createStateVector(),pde_filter_,assembler_filter_,list);
+    Fv_      = ROL::makePtr<PDE_PrimalSimVector<Real,DeviceType>>(assembler_filter_->createStateVector(),pde_filter_,assembler_filter_,list);
+    dctrl_   = ROL::makePtr<PDE_DualSimVector<Real,DeviceType>>(assembler_filter_->createStateVector(),pde_filter_,assembler_filter_,list);
+
+    nuke_      = list.sublist("Problem").get("Use Basic Update",false);
+    printFreq_ = list.sublist("Problem").get("Output Frequency",0);
+
+    assembleFilter();
+  }
+
+  const ROL::Ptr<Assembler<Real,DeviceType>> getAssembler(void) const {
+    return comp_->getAssembler();
+  }
+
+  const ROL::Ptr<Assembler<Real,DeviceType>> getFilterAssembler(void) const {
+    return assembler_filter_;
+  }
+
+  void update(const ROL::Vector<Real> &z, ROL::UpdateType type, int iter = -1) {
+    nupda_++;
+    if (nuke_) {
+      update_temp(z,iter);
+    }
+    else {
+      switch (type) {
+        case ROL::UpdateType::Initial: update_initial(z,iter); break;
+        case ROL::UpdateType::Accept:  update_accept(z,iter);  break;
+        case ROL::UpdateType::Revert:  update_revert(z,iter);  break;
+        case ROL::UpdateType::Trial:   update_trial(z,iter);   break;
+        case ROL::UpdateType::Temp:    update_temp(z,iter);   break;
+      }
+    }
+    comp_->update(*Fz_,type,iter);
+    // Print
+    if (printFreq_ > 0 && iter%printFreq_ == 0) {
+      std::stringstream dfile;
+      dfile  << "density_"          << iter << ".txt";
+      ROL::Ptr<const Tpetra::MultiVector<>> d_data = getConstField(z);
+      assembler_filter_->outputTpetraVector(d_data, dfile.str());
+    }
+  }
+
+  void update( const ROL::Vector<Real> &z, bool flag = true, int iter = -1 ) {
+    nupda_++;
+    if (nuke_) {
+      update_temp(z,iter);
+    }
+    else {
+      // Trial:    flag = false, iter = -1
+      // Check:    flag = true,  iter = -1
+      // Reject:   flag = false, iter > -1
+      // Accept:   flag = true,  iter > -1
+      if (flag) {
+        if (iter > -1) update_accept(z,iter);
+        else           update_temp(z,iter);
+      }
+      else {
+        if (iter > -1) update_revert(z,iter);
+        else           update_trial(z,iter);
+      }
+    }
+    comp_->update(*Fz_,flag,iter);
+  }
+
+  Real normalize(const ROL::Vector<Real> &z, Real &tol) {
+    applyFilter(*Fz_,z,false);
+    return comp_->normalize(*Fz_, tol);
+  }
+
+  void printToFile(const ROL::Vector<Real> &z, std::ostream &stream = std::cout,
+                   std::string ufile = "state.txt", std::string dfile = "density.txt",
+                   std::string ffile = "filtered_density.txt") {
+    update(z,ROL::UpdateType::Temp);
+    ROL::Ptr<const Tpetra::MultiVector<>> dens_data = getConstField(z);
+    assembler_filter_->outputTpetraVector(dens_data,  dfile);
+    comp_->printToFile(*Fz_, stream, ufile, ffile);
+  }
+
+  void solveState(ROL::Vector<Real> &u, const ROL::Vector<Real> &z) {
+    update(z,ROL::UpdateType::Temp);
+    comp_->solveState(u,*Fz_);
+  }
+
+  Real value( const ROL::Vector<Real> &z, Real &tol ) {
+    nfval_++;
+    return comp_->value(*Fz_,tol);
+  }
+
+  void gradient( ROL::Vector<Real> &g, const ROL::Vector<Real> &z, Real &tol ) {
+    ngrad_++;
+    comp_->gradient(*dctrl_, *Fz_, tol);
+    applyFilter(g, *dctrl_, true);
+  }
+
+  void hessVec( ROL::Vector<Real> &hv, const ROL::Vector<Real> &v, const ROL::Vector<Real> &z, Real &tol ) {
+    nhess_++;
+    applyFilter(*Fv_, v, false);
+    comp_->hessVec(*dctrl_, *Fv_, *Fz_, tol);
+    applyFilter(hv, *dctrl_, true);
+  }
+
+  /** \brief Apply a reduced Hessian preconditioner.
+  */
+  virtual void precond( ROL::Vector<Real> &Pv, const ROL::Vector<Real> &v, const ROL::Vector<Real> &z, Real &tol ) {
+    nprec_++;
+    Pv.set(v.dual());
+  }
+
+  void summarize(std::ostream &stream) const {
+    stream << std::endl;
+    stream << std::string(114,'=') << std::endl;
+    stream << "  Filtered_Objective_Compliance::summarize" << std::endl;
+    stream << "    Number of calls to update:          " << nupda_ << std::endl;
+    stream << "    Number of calls to value:           " << nfval_ << std::endl;
+    stream << "    Number of calls to gradient:        " << ngrad_ << std::endl;
+    stream << "    Number of calls to hessVec:         " << nhess_ << std::endl;
+    stream << "    Number of calls to precond:         " << nprec_ << std::endl;
+    stream << "    Number of filter assemblies:        " << nasFi_ << std::endl;
+    stream << "    Number of filter applies:           " << napFi_ << std::endl;
+    stream << std::string(114,'=') << std::endl;
+    stream << std::endl;
+    comp_->summarize(stream);
+  }
+
+  void reset(void) {
+    nupda_ = 0; nfval_ = 0; ngrad_ = 0; nhess_ = 0; nprec_ = 0; nasFi_ = 0; napFi_ = 0;
+    comp_->reset();
+  }
+
+// For parametrized (stochastic) objective functions and constraints
+public:
+  void setParameter(const std::vector<Real> &param) {
+    ROL::Objective<Real>::setParameter(param);
+    comp_->setParameter(param);
+  }
+
+private:
+
+  ROL::Ptr<const Tpetra::MultiVector<>> getConstField(const ROL::Vector<Real> &x) const {
+    ROL::Ptr<const Tpetra::MultiVector<>> xp;
+    try {
+      xp = dynamic_cast<const ROL::TpetraMultiVector<Real>&>(x).getVector();
+    }
+    catch (std::exception &e) {
+      try {
+        ROL::Ptr<const ROL::TpetraMultiVector<Real>> xvec
+          = dynamic_cast<const PDE_OptVector<Real>&>(x).getField();
+        if (xvec == ROL::nullPtr) {
+          xp = ROL::nullPtr;
+        }
+        else {
+          xp = xvec->getVector();
+        }
+      }
+      catch (std::exception &ee) {
+        xp = ROL::nullPtr;
+      }
+    }
+    return xp;
+  }
+
+  ROL::Ptr<Tpetra::MultiVector<>> getField(ROL::Vector<Real> &x) const {
+    ROL::Ptr<Tpetra::MultiVector<>> xp;
+    try {
+      xp = dynamic_cast<ROL::TpetraMultiVector<Real>&>(x).getVector();
+    }
+    catch (std::exception &e) {
+      try {
+        ROL::Ptr<ROL::TpetraMultiVector<Real>> xvec
+          = dynamic_cast<PDE_OptVector<Real>&>(x).getField();
+        if ( xvec == ROL::nullPtr ) {
+          xp = ROL::nullPtr;
+        }
+        else {
+          xp = xvec->getVector();
+        }
+      }
+      catch (std::exception &ee) {
+        xp = ROL::nullPtr;
+      }
+    }
+    return xp;
+  }
+
+  void assembleFilter(void) {
+    nasFi_++;
+    ROL::Ptr<Tpetra::MultiVector<>> f_ctrl = assembler_filter_->createControlVector();
+    assembler_filter_->assemblePDEJacobian1(matF1_,pde_filter_,f_state_,f_ctrl);
+    assembler_filter_->assemblePDEJacobian2(matF2_,pde_filter_,f_state_,f_ctrl);
+    solver_filter_->setA(matF1_);
+  }
+
+  void applyFilter(ROL::Vector<Real> &Fx, const ROL::Vector<Real> &x, const bool transpose) {
+    napFi_++;
+    ROL::Ptr<Tpetra::MultiVector<>>      Fx_data = getField(Fx);
+    ROL::Ptr<const Tpetra::MultiVector<>> x_data = getConstField(x);
+    if (transpose) {
+      solver_filter_->solve(f_state_,x_data,false);
+      matF2_->apply(*f_state_,*Fx_data,Teuchos::TRANS);
+    }
+    else {
+      matF2_->apply(*x_data,*f_res_);
+      solver_filter_->solve(Fx_data,f_res_,false);
+    }
+    Fx.scale(static_cast<Real>(-1));
+  }
+
+  void update_initial( const ROL::Vector<Real> &z, int iter ) {
+    applyFilter(*Fz_,z,false);
+    Fzcache_->set(*Fz_);
+  }
+
+  void update_accept( const ROL::Vector<Real> &z, int iter ) {
+    Fzcache_->set(*Fz_);
+  }
+
+  void update_temp( const ROL::Vector<Real> &z, int iter ) {
+    applyFilter(*Fz_,z,false);
+  }
+
+  void update_trial( const ROL::Vector<Real> &z, int iter ) {
+    applyFilter(*Fz_,z,false);
+  }
+
+  void update_revert( const ROL::Vector<Real> &z, int iter ) {
+    Fz_->set(*Fzcache_);
+  }
+}; // class TopOptComplianceObjective
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/materialtensorK.hpp b/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/materialtensorK.hpp
new file mode 100644
index 000000000000..4d83e594db3e
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/materialtensorK.hpp
@@ -0,0 +1,338 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  obj.hpp
+    \brief Provides the interface for local (cell-based) objective function computations.
+*/
+
+#ifndef PDEOPT_MATERIALTENSORK_HPP
+#define PDEOPT_MATERIALTENSORK_HPP
+
+#include "../../../TOOLS/feK.hpp"
+
+template <class Real, class DeviceType>
+class MaterialTensor {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  ROL::Ptr<fe_type> fe_;
+
+  // Problem parameters.
+  Real youngsModulus_;
+  Real poissonRatio_;
+  bool isPlainStress_;
+  Real minDensity_;
+  Real maxDensity_;
+  Real powerSIMP_;
+  bool usePhaseField_;
+  bool useProjection_;
+  Real beta_, eta_;
+  std::vector<std::vector<Real>> materialMat_;
+
+  // Precomputed quantities.
+  std::vector<scalar_view> BMat_;
+  std::vector<scalar_view> BdetJMat_;
+  std::vector<scalar_view> NMat_;
+  std::vector<scalar_view> NdetJMat_;
+  std::vector<scalar_view> CBdetJMat_;
+
+  void computeMaterialTensor(const int d) {
+    int matd = (d*(d+1))/2;
+    std::vector<Real> tmpvec(matd);
+    materialMat_.resize(matd, tmpvec);
+    if ((d==2) && (isPlainStress_)) {
+      Real one(1), half(0.5);
+      Real factor1 = youngsModulus_ /(one-poissonRatio_*poissonRatio_);
+      materialMat_[0][0] = factor1;
+      materialMat_[0][1] = factor1 * poissonRatio_;
+      materialMat_[1][0] = factor1 * poissonRatio_;
+      materialMat_[1][1] = factor1;
+      materialMat_[2][2] = factor1 * half * (one-poissonRatio_);
+    }
+    else if ((d==2) && (!isPlainStress_)) {
+      Real one(1), two(2), half(0.5);
+      Real factor2 = youngsModulus_ /(one+poissonRatio_)/(one-two*poissonRatio_);
+      materialMat_[0][0] = factor2 * (one-poissonRatio_);
+      materialMat_[0][1] = factor2 * poissonRatio_;
+      materialMat_[1][0] = factor2 * poissonRatio_;
+      materialMat_[1][1] = factor2 * (one-poissonRatio_);
+      materialMat_[2][2] = factor2 * half * (one-two*poissonRatio_);
+    }
+    else {
+      Real one(1), two(2), half(0.5);
+      Real lam = youngsModulus_*poissonRatio_/(one+poissonRatio_)/(one-two*poissonRatio_);
+      Real mu = half*youngsModulus_/(one+poissonRatio_);
+      materialMat_[0][0] = lam + two*mu;
+      materialMat_[0][1] = lam;
+      materialMat_[0][2] = lam;
+      materialMat_[1][0] = lam;
+      materialMat_[1][1] = lam + two*mu;
+      materialMat_[1][2] = lam;
+      materialMat_[2][0] = lam;
+      materialMat_[2][1] = lam;
+      materialMat_[2][2] = lam + two*mu;
+      materialMat_[3][3] = mu;
+      materialMat_[4][4] = mu;
+      materialMat_[5][5] = mu;
+    }
+  }
+
+  void computeNBmats(void) {
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    int p = fe_->gradN().extent_int(2);
+    int d = fe_->gradN().extent_int(3);
+    int matd = materialMat_.size();
+
+    for (int i=0; i<d; ++i) {
+      BMat_.push_back(scalar_view("BMat",c,f,p,matd));
+      BdetJMat_.push_back(scalar_view("BdetJMat",c,f,p,matd));
+      CBdetJMat_.push_back(scalar_view("CBdetJMat",c,f,p,matd));
+      NMat_.push_back(scalar_view("NMat",c,f,p,d));
+      NdetJMat_.push_back(scalar_view("NdetJMat",c,f,p,d));
+    }
+
+    if (d==2) {
+      for (int i=0; i<c; ++i) {
+        for (int j=0; j<f; ++j) {
+      	  for (int k=0; k<p; ++k) {
+            (NMat_[0])(i, j, k, 0) = (fe_->N())(i, j, k);
+            (NMat_[1])(i, j, k, 1) = (fe_->N())(i, j, k);
+            (NdetJMat_[0])(i, j, k, 0) = (fe_->NdetJ())(i, j, k);
+            (NdetJMat_[1])(i, j, k, 1) = (fe_->NdetJ())(i, j, k);
+                
+            (BMat_[0])(i, j, k, 0) = (fe_->gradN())(i, j, k, 0);
+            (BMat_[1])(i, j, k, 1) = (fe_->gradN())(i, j, k, 1);
+            (BMat_[0])(i, j, k, 2) = (fe_->gradN())(i, j, k, 1);
+            (BMat_[1])(i, j, k, 2) = (fe_->gradN())(i, j, k, 0);
+
+            (BdetJMat_[0])(i, j, k, 0) = (fe_->gradNdetJ())(i, j, k, 0);
+            (BdetJMat_[1])(i, j, k, 1) = (fe_->gradNdetJ())(i, j, k, 1);
+            (BdetJMat_[0])(i, j, k, 2) = (fe_->gradNdetJ())(i, j, k, 1);
+            (BdetJMat_[1])(i, j, k, 2) = (fe_->gradNdetJ())(i, j, k, 0);
+          }
+        }
+      }
+    }
+
+    if(d==3) {
+      for (int i=0; i<c; ++i) {
+        for (int j=0; j<f; ++j) {
+          for (int k=0; k<p; ++k) {
+            (NMat_[0])(i, j, k, 0) = (fe_->N())(i, j, k);
+            (NMat_[1])(i, j, k, 1) = (fe_->N())(i, j, k);
+            (NMat_[2])(i, j, k, 2) = (fe_->N())(i, j, k);
+            (NdetJMat_[0])(i, j, k, 0) = (fe_->NdetJ())(i, j, k);
+            (NdetJMat_[1])(i, j, k, 1) = (fe_->NdetJ())(i, j, k);
+            (NdetJMat_[2])(i, j, k, 2) = (fe_->NdetJ())(i, j, k);
+            
+            (BMat_[0])(i, j, k, 0) = (fe_->gradN())(i, j, k, 0);
+            (BMat_[1])(i, j, k, 1) = (fe_->gradN())(i, j, k, 1);
+            (BMat_[2])(i, j, k, 2) = (fe_->gradN())(i, j, k, 2);
+            (BMat_[1])(i, j, k, 3) = (fe_->gradN())(i, j, k, 2);
+            (BMat_[2])(i, j, k, 3) = (fe_->gradN())(i, j, k, 1);
+            (BMat_[0])(i, j, k, 4) = (fe_->gradN())(i, j, k, 2);
+            (BMat_[2])(i, j, k, 4) = (fe_->gradN())(i, j, k, 0);
+            (BMat_[0])(i, j, k, 5) = (fe_->gradN())(i, j, k, 1);
+            (BMat_[1])(i, j, k, 5) = (fe_->gradN())(i, j, k, 0);
+                
+            (BdetJMat_[0])(i, j, k, 0) = (fe_->gradNdetJ())(i, j, k, 0);
+            (BdetJMat_[1])(i, j, k, 1) = (fe_->gradNdetJ())(i, j, k, 1);
+            (BdetJMat_[2])(i, j, k, 2) = (fe_->gradNdetJ())(i, j, k, 2);
+            (BdetJMat_[1])(i, j, k, 3) = (fe_->gradNdetJ())(i, j, k, 2);
+            (BdetJMat_[2])(i, j, k, 3) = (fe_->gradNdetJ())(i, j, k, 1);
+            (BdetJMat_[0])(i, j, k, 4) = (fe_->gradNdetJ())(i, j, k, 2);
+            (BdetJMat_[2])(i, j, k, 4) = (fe_->gradNdetJ())(i, j, k, 0);
+            (BdetJMat_[0])(i, j, k, 5) = (fe_->gradNdetJ())(i, j, k, 1);
+            (BdetJMat_[1])(i, j, k, 5) = (fe_->gradNdetJ())(i, j, k, 0);
+          }
+        }
+      }
+    } 
+
+    for (int i=0; i<c; ++i) {
+      for (int j=0; j<f; ++j) {
+        for (int k=0; k<p; ++k) {
+          for (int l=0; l<d; ++l) {
+            for (int m=0; m<matd; ++m) {
+              for (int n=0; n<matd; ++n) {
+                (CBdetJMat_[l])(i,j,k,m) += materialMat_[m][n] * (BdetJMat_[l])(i,j,k,n);
+              }
+            }
+          }
+        }
+      }
+    }
+
+  }
+
+public:
+  MaterialTensor(ROL::ParameterList &parlist) {
+    // Problem parameters.
+    youngsModulus_  = parlist.get("Young's Modulus",      1.0);
+    poissonRatio_   = parlist.get("Poisson Ratio",        0.3);
+    isPlainStress_  = parlist.get("Use Plain Stress",     true);
+    minDensity_     = parlist.get("Minimum Density",      1e-4);
+    maxDensity_     = parlist.get("Maximum Density",      1.0);
+    powerSIMP_      = parlist.get("SIMP Power",           3.0);
+    usePhaseField_  = parlist.get("Use Phase Field",      false);
+    useProjection_  = parlist.get("Use Projection",       false);
+    beta_           = parlist.get("Projection Parameter", 1.0);
+    eta_            = parlist.get("Projection Threshold", 0.5);
+  }
+
+  void setFE(ROL::Ptr<fe_type> &fe) {
+    fe_ = fe;
+    int d = fe_->gradN().extent_int(3);
+    computeMaterialTensor(d);
+    computeNBmats();
+  }
+
+  void applyTensor(scalar_view & out, const scalar_view & inData) const {
+    int c = fe_->gradN().extent_int(0);
+    int p = fe_->gradN().extent_int(2);
+    int matd = materialMat_.size();
+
+    for (int i=0; i<c; ++i) {
+      for (int j=0; j<p; ++j) {
+        for (int k=0; k<matd; ++k) {
+          for (int l=0; l<matd; ++l) {
+            out(i,j,k) += materialMat_[k][l] * inData(i,j,l);
+          }
+        }
+      }
+    }
+  }
+
+  void computeUmat(scalar_view & UMat, std::vector<scalar_view > & gradU) const {
+    int c = fe_->gradN().extent_int(0);
+    int p = fe_->gradN().extent_int(2);
+    int d = fe_->gradN().extent_int(3);
+    int matd = materialMat_.size();
+
+    UMat = scalar_view("UMat",c,p,matd);
+
+    if (d==2) {
+      for (int i=0; i<c; ++i) {
+        for (int j=0; j<p; ++j) {
+          UMat(i, j, 0) = (gradU[0])(i, j, 0);
+          UMat(i, j, 1) = (gradU[1])(i, j, 1);
+          UMat(i, j, 2) = (gradU[0])(i, j, 1) + (gradU[1])(i, j, 0);
+        }
+      }
+    }
+
+    if(d==3) {
+      for (int i=0; i<c; ++i) {
+        for (int j=0; j<p; ++j) {
+          (UMat)(i, j, 0) = (gradU[0])(i, j, 0);
+          (UMat)(i, j, 1) = (gradU[1])(i, j, 1);
+          (UMat)(i, j, 2) = (gradU[2])(i, j, 2);
+          (UMat)(i, j, 3) = (gradU[1])(i, j, 2) + (gradU[2])(i, j, 1);
+          (UMat)(i, j, 4) = (gradU[0])(i, j, 2) + (gradU[2])(i, j, 0);
+          (UMat)(i, j, 5) = (gradU[0])(i, j, 1) + (gradU[1])(i, j, 0);
+        }
+      }
+    } 
+
+  }
+
+  void computeDensity(scalar_view & rho,
+                      const scalar_view & Z,
+                      const int deriv = 0) const {
+    // Retrieve dimensions.
+    int c = rho.extent_int(0);
+    int p = rho.extent_int(1);
+
+    Real z(0);
+    const Real half(0.5), one(1), two(2), diff = maxDensity_-minDensity_;
+    for (int i=0; i<c; ++i) {
+      for (int j=0; j<p; ++j) {
+        z = Z(i,j);
+        if (!usePhaseField_) {
+          if (!useProjection_) {
+            if (deriv==0)      rho(i,j) = minDensity_ + diff*std::pow(z, powerSIMP_);
+            else if (deriv==1) rho(i,j) = powerSIMP_*diff*std::pow(z, powerSIMP_-one);
+            else if (deriv==2) rho(i,j) = powerSIMP_*(powerSIMP_-one)*diff*std::pow(z, powerSIMP_-two);
+          }
+          else {
+            if (deriv==0) {
+              Real pz = project(z,0);
+              rho(i,j) = minDensity_ + diff*std::pow(pz, powerSIMP_);
+            }
+            else if (deriv==1) {
+              Real pz = project(z,0), pz1 = project(z,1);
+              rho(i,j) = powerSIMP_*diff*std::pow(pz, powerSIMP_-one)*pz1;
+            }
+            else if (deriv==2) {
+              Real pz = project(z,0), pz1 = project(z,1), pz2 = project(z,2);
+              rho(i,j) = powerSIMP_*(powerSIMP_-one)*diff*std::pow(pz, powerSIMP_-two)*pz1*pz1
+                         +powerSIMP_*diff*std::pow(pz, powerSIMP_-one)*pz2;
+            }
+          }
+        }
+        else {
+          z += one;
+          z *= half;
+          if (deriv==0)      rho(i,j) = minDensity_ + diff*std::pow(z, 2);
+          else if (deriv==1) rho(i,j) = diff*z;
+          else if (deriv==2) rho(i,j) = half*diff;
+        }
+      }
+    }
+  }
+
+  const scalar_view B(const int i) const {
+    return BMat_[i];
+  }
+
+  const scalar_view BdetJ(const int i) const {
+    return BdetJMat_[i];
+  }
+
+  const scalar_view N(const int i) const {
+    return NMat_[i];
+  }
+
+  const scalar_view NdetJ(const int i) const{
+    return NdetJMat_[i];
+  }
+
+  const scalar_view CBdetJ(const int i) const {
+    return CBdetJMat_[i];
+  }
+
+  int getMatrixDim(void) const {
+    return materialMat_.size();
+  }
+
+private:
+  Real project(Real rho, int deriv) const {
+    if (deriv==1) {
+      const Real one(1), tbe(std::tanh(beta_*eta_)), tbe1(std::tanh(beta_*(one-eta_)));
+      const Real tber(std::tanh(beta_*(rho-eta_)));
+      return (one-tber*tber)/(tbe+tbe1)*beta_;
+    }
+    else if (deriv==2) {
+      const Real one(1), tbe(std::tanh(beta_*eta_)), tbe1(std::tanh(beta_*(one-eta_)));
+      const Real tber(std::tanh(beta_*(rho-eta_))), two(2);
+      return -two*tber*(one-tber*tber)/(tbe+tbe1)*beta_*beta_;
+    }
+    const Real one(1), tbe(std::tanh(beta_*eta_)), tbe1(std::tanh(beta_*(one-eta_)));
+    return (tbe+std::tanh(beta_*(rho-eta_)))/(tbe+tbe1);
+  }
+};
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/obj_volumeK.hpp b/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/obj_volumeK.hpp
new file mode 100644
index 000000000000..46aeee0408b5
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/obj_volumeK.hpp
@@ -0,0 +1,123 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  obj.hpp
+    \brief Provides the interface for local (cell-based) objective function computations.
+*/
+
+#ifndef PDEOPT_QOI_VOLUMEK_HPP
+#define PDEOPT_QOI_VOLUMEK_HPP
+
+#include "../../../TOOLS/qoiK.hpp"
+#include "../../../TOOLS/feK.hpp"
+
+template <class Real, class DeviceType>
+class QoI_Volume_TopoOpt : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type> fe_;
+  scalar_view ones_, volFrac_;
+
+public:
+  QoI_Volume_TopoOpt(const ROL::Ptr<fe_type> &fe, Real v0 = Real(0))
+  : fe_(fe) {
+    // Get relevant dimensions
+    int c = fe_->cubPts().extent_int(0);
+    int p = fe_->cubPts().extent_int(1);
+    ones_ = scalar_view("ones",c,p);
+    volFrac_ = scalar_view("volFrac",c,p);
+    Kokkos::deep_copy(ones_,static_cast<Real>(1));
+    Kokkos::deep_copy(volFrac_,v0);
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = fe_->gradN().extent_int(0);
+    int p = fe_->gradN().extent_int(2);
+
+    // Initialize output val
+    val = scalar_view("val",c);
+
+    // Evaluate on FE basis
+    scalar_view valZ_eval("valZ_eval",c, p);
+    fe_->evaluateValue(valZ_eval, z_coeff);
+    rst::subtract(valZ_eval,volFrac_);
+
+    // Compute energy
+    fe_->computeIntegral(val,valZ_eval,ones_);
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Volume_TopoOpt::gradient_1 is zero.");
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+
+    // Initialize output grad
+    grad = scalar_view("grad", c, f);
+
+    // Compute gradient of energy
+    fst::integrate(grad,ones_,fe_->NdetJ(),false);
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Volume_TopoOpt::HessVec_11 is zero.");
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Volume_TopoOpt::HessVec_12 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Volume_TopoOpt::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Volume_TopoOpt::HessVec_22 is zero.");
+  }
+
+}; // QoI_VolumeObj
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/pde_elasticityK.hpp b/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/pde_elasticityK.hpp
new file mode 100644
index 000000000000..c1e4e2276e88
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/pde_elasticityK.hpp
@@ -0,0 +1,619 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  pde_elasticity.hpp
+    \brief Implements the local PDE interface for the structural topology
+           optimization problem.
+*/
+
+#ifndef PDE_TOPO_OPT_ELASTICITYK_HPP
+#define PDE_TOPO_OPT_ELASTICITYK_HPP
+
+#include "../../../TOOLS/pdeK.hpp"
+#include "../../../TOOLS/feK.hpp"
+#include "../../../TOOLS/fieldhelperK.hpp"
+
+#include "dirichletK.hpp"
+#include "tractionK.hpp"
+#include "materialtensorK.hpp"
+
+#include "Intrepid2_HVOL_C0_FEM.hpp"
+#include "Intrepid2_HGRAD_QUAD_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_QUAD_C2_FEM.hpp"
+#include "Intrepid2_HGRAD_HEX_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_HEX_C2_FEM.hpp"
+#include "Intrepid2_DefaultCubatureFactory.hpp"
+#include "Intrepid2_FunctionSpaceTools.hpp"
+#include "Intrepid2_CellTools.hpp"
+
+#include "ROL_Ptr.hpp"
+
+template <class Real, class DeviceType>
+class PDE_Elasticity : public PDE<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  // Finite element basis information
+  basis_ptr basisPtr_, basisPtrDens_;
+  std::vector<basis_ptr> basisPtrs_, basisPtrsDens_;
+  // Cell cubature information
+  ROL::Ptr<Intrepid2::Cubature<DeviceType,Real,Real>> cellCub_, bdryCub_;
+  // Cell node information
+  scalar_view volCellNodes_;
+  std::vector<std::vector<scalar_view>> bdryCellNodes_;
+  std::vector<std::vector<std::vector<int>>> bdryCellLocIds_;
+  // Finite element definition
+  ROL::Ptr<fe_type> fe_, feDens_;
+  std::vector<std::vector<ROL::Ptr<fe_type>>> feBdry_;
+  // Local degrees of freedom on boundary, for each side of the reference cell (first index).
+  std::vector<std::vector<int>> fidx_;
+  // Coordinates of degrees freedom on boundary cells.
+  // Indexing:  [sideset number][local side id](cell number, value at dof)
+  std::vector<std::vector<scalar_view>> bdryCellDofValues_;
+  // Field pattern, offsets, etc.
+  std::vector<std::vector<int>> fieldPattern_;   // local Field/DOF pattern; set from DOF manager 
+  int numFields_;                                // number of fields (equations in the PDE)
+  int numDofs_;                                  // total number of degrees of freedom for all (local) fields
+  std::vector<int> offset_;                      // for each field, a counting offset
+  std::vector<int> numFieldDofs_;                // for each field, number of degrees of freedom
+  std::vector<std::vector<int>> fieldPatternDens_;   // local Field/DOF pattern; set from DOF manager 
+  int numFieldsDens_;                                // number of fields (equations in the PDE)
+  int numDofsDens_;                                  // total number of degrees of freedom for all (local) fields
+  std::vector<int> offsetDens_;                      // for each field, a counting offset
+  std::vector<int> numFieldDofsDens_;                // for each field, number of degrees of freedom
+
+  ROL::Ptr<MaterialTensor<Real,DeviceType>>  matTensor_;
+  ROL::Ptr<Dirichlet<Real,DeviceType>>       dirichlet_;
+  ROL::Ptr<Traction<Real,DeviceType>>        traction_;
+  ROL::Ptr<FieldUtils::FieldInfo> fieldInfo_, fieldInfoDens_;
+
+  bool getFields2called_;
+
+public:
+  PDE_Elasticity(ROL::ParameterList &parlist) : getFields2called_(false) {
+    // Finite element fields.
+    int basisOrder     = parlist.sublist("Problem").get("Basis Order",1);
+    int basisOrderDens = parlist.sublist("Problem").get("Density Basis Order",0);
+    int cubDegree      = parlist.sublist("Problem").get("Cubature Degree",4);
+    int bdryCubDegree  = parlist.sublist("Problem").get("Boundary Cubature Degree",2);
+    int probDim        = parlist.sublist("Problem").get("Problem Dimension",3);
+    TEUCHOS_TEST_FOR_EXCEPTION(probDim > 3 || probDim < 2, std::invalid_argument,
+      ">>> PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/pde_elasticity.hpp: Problem dimension is not 2 or 3!");
+    TEUCHOS_TEST_FOR_EXCEPTION(basisOrder > 2 || basisOrder < 1, std::invalid_argument,
+      ">>> PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/pde_elasticity.hpp: Basis order is not 1 or 2!");
+    if (probDim == 2) {
+      if (basisOrder == 1)
+        basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C1_FEM<DeviceType,Real,Real>>();
+      else if (basisOrder == 2)
+        basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C2_FEM<DeviceType,Real,Real>>();
+    }
+    else if (probDim == 3) {
+      if (basisOrder == 1)
+        basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_HEX_C1_FEM<DeviceType,Real,Real>>();
+      else if (basisOrder == 2)
+        basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_HEX_C2_FEM<DeviceType,Real,Real>>();
+    }
+    basisPtrs_.clear(); basisPtrs_.resize(probDim,basisPtr_);  // Displacement component
+    shards::CellTopology cellType = basisPtr_->getBaseCellTopology();        // get cell type from basis
+    if (basisOrderDens == 1) {
+      if (probDim == 2)
+        basisPtrDens_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C1_FEM<DeviceType,Real,Real>>();
+      else if (probDim == 3)
+        basisPtrDens_ = ROL::makePtr<Intrepid2::Basis_HGRAD_HEX_C1_FEM<DeviceType,Real,Real>>();
+    }
+    else
+      basisPtrDens_ = ROL::makePtr<Intrepid2::Basis_HVOL_C0_FEM<DeviceType,Real,Real>>(cellType);
+    basisPtrsDens_.clear(); basisPtrsDens_.push_back(basisPtrDens_); // Density component
+
+    // Quadrature rules.
+    Intrepid2::DefaultCubatureFactory cubFactory;                            // create cubature factory
+    cellCub_ = cubFactory.create<DeviceType,Real,Real>(cellType, cubDegree); // create default cubature
+
+    shards::CellTopology bdryCellType = cellType.getCellTopologyData(probDim-1, 0);
+    bdryCub_ = cubFactory.create<DeviceType,Real,Real>(bdryCellType, bdryCubDegree);
+
+    matTensor_ = ROL::makePtr<MaterialTensor<Real,DeviceType>>(parlist.sublist("Problem"));
+    traction_  = ROL::makePtr<Traction<Real,DeviceType>>(probDim);
+    dirichlet_ = ROL::makePtr<Dirichlet<Real,DeviceType>>();
+
+    numDofs_ = 0;
+    numFields_ = basisPtrs_.size();
+    offset_.resize(numFields_);
+    numFieldDofs_.resize(numFields_);
+    for (int i=0; i<numFields_; ++i) {
+      if (i==0)
+        offset_[i]  = 0;
+      else
+        offset_[i]  = offset_[i-1] + basisPtrs_[i-1]->getCardinality();
+      numFieldDofs_[i] = basisPtrs_[i]->getCardinality();
+      numDofs_ += numFieldDofs_[i];
+    }
+
+    numDofsDens_ = 0;
+    numFieldsDens_ = basisPtrsDens_.size();
+    offsetDens_.resize(numFieldsDens_);
+    numFieldDofsDens_.resize(numFieldsDens_);
+    for (int i=0; i<numFieldsDens_; ++i) {
+      if (i==0)
+        offsetDens_[i]  = 0;
+      else
+        offsetDens_[i]  = offsetDens_[i-1] + basisPtrsDens_[i-1]->getCardinality();
+      numFieldDofsDens_[i] = basisPtrsDens_[i]->getCardinality();
+      numDofsDens_ += numFieldDofsDens_[i];
+    }
+  }
+
+  void residual(scalar_view &res,
+                const scalar_view u_coeff,
+                const scalar_view z_coeff = scalar_view(),
+                const ROL::Ptr<const std::vector<Real>> &z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    int p = fe_->gradN().extent_int(2);
+    int d = fe_->gradN().extent_int(3);
+    int matd = matTensor_->getMatrixDim();
+ 
+    // Initialize residuals.
+    std::vector<scalar_view> R(d);
+    for (int i=0; i<d; ++i)
+      R[i] = scalar_view("res",c,f);
+
+    // Split u_coeff into components.
+    std::vector<scalar_view> U, Z;
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoDens_);
+
+    // Evaluate/interpolate finite element fields on cells.
+    std::vector<scalar_view> gradDisp_eval(d);
+    scalar_view UMat;
+    scalar_view rho("rho", c, p);
+    scalar_view valZ_eval("valZ_eval", c, p);
+    scalar_view rhoUMat("rhoUMat", c, p, matd);
+    for (int i=0; i<d; ++i) {
+      gradDisp_eval[i] =  scalar_view("gradDisp_eval", c, p, d);
+      fe_->evaluateGradient(gradDisp_eval[i], U[i]);
+    }
+    feDens_->evaluateValue(valZ_eval, Z[0]);
+
+    // EVALUATE MATERIAL TENSOR
+    matTensor_->computeUmat(UMat, gradDisp_eval);
+    matTensor_->computeDensity(rho, valZ_eval);
+    fst::scalarMultiplyDataData(rhoUMat, rho, UMat);
+    for (int i=0; i<d; ++i)
+      fst::integrate(R[i],rhoUMat,matTensor_->CBdetJ(i),false);
+
+    // APPLY TRACTION CONDITIONS
+    if (!traction_->isNull())
+      traction_->apply(R, feBdry_, PDE<Real,DeviceType>::getParameter(), static_cast<Real>(-1));
+
+    // APPLY DIRICHLET CONDITIONS
+    dirichlet_->applyResidual(R,U);
+
+    // Combine the residuals.
+    FieldUtils::combineFieldCoeff<Real>(res, R, fieldInfo_);
+  }
+
+  void Jacobian_1(scalar_view &jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> &z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    int p = fe_->gradN().extent_int(2);
+    int d = fe_->gradN().extent_int(3);
+    int matd = matTensor_->getMatrixDim();
+ 
+    // Initialize Jacobians.
+    std::vector<std::vector<scalar_view>> J(d);
+    for (int i=0; i<d; ++i) {
+      for (int j=0; j<d; ++j)
+        J[i].push_back(scalar_view("jac",c,f,f));
+    }
+
+    // Split z_coeff into components.
+    std::vector<scalar_view> Z;
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoDens_);
+
+    std::vector<scalar_view> rhoBMat(d);
+    scalar_view rho("rho", c, p);
+    scalar_view valZ_eval("valZ_eval", c, p);
+
+    feDens_->evaluateValue(valZ_eval, Z[0]);
+    matTensor_->computeDensity(rho, valZ_eval);
+    for (int i=0; i<d; ++i) {
+      rhoBMat[i] =  scalar_view("rhoBMat", c, f, p, matd);
+      fst::scalarMultiplyDataField(rhoBMat[i], rho, matTensor_->B(i));
+    }
+
+    /*** Evaluate weak form of the Jacobian. ***/
+    for (int i=0; i<d; ++i) {
+      for (int j=0; j<d; ++j)
+        fst::integrate(J[i][j],rhoBMat[i],matTensor_->CBdetJ(j),false);
+    }
+
+    // APPLY DIRICHLET CONDITIONS
+    dirichlet_->applyJacobian1(J);
+
+    // Combine the jacobians.
+    FieldUtils::combineFieldCoeff<Real>(jac, J, fieldInfo_, fieldInfo_);
+  }
+
+  void Jacobian_2(scalar_view &jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> &z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    int c  = fe_->gradN().extent_int(0);
+    int f  = fe_->gradN().extent_int(1);
+    int fd = feDens_->gradN().extent_int(1);
+    int p  = fe_->gradN().extent_int(2);
+    int d  = fe_->gradN().extent_int(3);
+    int matd = matTensor_->getMatrixDim();
+
+    // Initialize Jacobians.
+    std::vector<std::vector<scalar_view>> J(d);
+    for (int i=0; i<d; ++i)
+      J[i].push_back(scalar_view("jac",c,f,fd));
+
+    // Split u_coeff into components.
+    std::vector<scalar_view> U, Z;
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoDens_);
+
+    // Evaluate/interpolate finite element fields on cells.
+    std::vector<scalar_view> gradDisp_eval(d), CBrhoUMat(d);
+    scalar_view UMat;
+    scalar_view rho("rho", c, p);
+    scalar_view valZ_eval("valZ_eval", c, p);
+    scalar_view rhoUMat("rhoUMat", c, p, matd);
+    for (int i=0; i<d; ++i) {
+      CBrhoUMat[i]     = scalar_view("CBrhoUMat", c, f, p);
+      gradDisp_eval[i] = scalar_view("gradDisp_eval", c, p, d);
+      fe_->evaluateGradient(gradDisp_eval[i], U[i]);
+    }
+    feDens_->evaluateValue(valZ_eval, Z[0]);
+    matTensor_->computeUmat(UMat, gradDisp_eval);
+    matTensor_->computeDensity(rho, valZ_eval, 1);  // first derivative
+    fst::scalarMultiplyDataData(rhoUMat, rho, UMat);
+
+    for (int i=0; i<d; ++i)
+      fst::dotMultiplyDataField(CBrhoUMat[i], rhoUMat, matTensor_->CBdetJ(i));
+
+    /*** Evaluate weak form of the residual. ***/
+    for (int i=0; i<d; ++i)
+      fst::integrate(J[i][0],CBrhoUMat[i],feDens_->N(),false);
+
+    // APPLY DIRICHLET CONDITIONS
+    dirichlet_->applyJacobian2(J);
+
+    // Combine the jacobians.
+    FieldUtils::combineFieldCoeff<Real>(jac, J, fieldInfo_, fieldInfoDens_);
+  }
+
+  void Hessian_11(scalar_view &hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> &z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_TopoOpt::Hessian_11): Hessian is zero.");
+  }
+
+  void Hessian_12(scalar_view &hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> &z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    int c  = fe_->gradN().extent_int(0);
+    int f  = fe_->gradN().extent_int(1);
+    int fd = feDens_->gradN().extent_int(1);
+    int p  = fe_->gradN().extent_int(2);
+    int d  = fe_->gradN().extent_int(3);
+    int matd = matTensor_->getMatrixDim();
+
+    // Initialize Hessians.
+    std::vector<std::vector<scalar_view>> J(1);
+    for (int i=0; i<d; ++i)
+      J[0].push_back(scalar_view("jac",c,fd,f));
+
+    // Split u_coeff into components.
+    std::vector<scalar_view> L, Z;
+    FieldUtils::splitFieldCoeff<Real>(L, l_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoDens_);
+
+    // Apply Dirichlet conditions to the multipliers.
+    dirichlet_->applyMultiplier(L);
+
+    // Evaluate/interpolate finite element fields on cells.
+    std::vector<scalar_view> gradDisp_eval(d), CBrhoLMat(d);
+    scalar_view LMat;
+    scalar_view rho("rho", c, p);
+    scalar_view valZ_eval("valZ_eval", c, p);
+    scalar_view rhoLMat("rhoLMat", c, p, matd);
+    for (int i=0; i<d; ++i) {
+      CBrhoLMat[i]     = scalar_view("CBrhoLMat", c, f, p);
+      gradDisp_eval[i] = scalar_view("gradDisp_eval", c, p, d);
+      fe_->evaluateGradient(gradDisp_eval[i], L[i]);
+    }
+    feDens_->evaluateValue(valZ_eval, Z[0]);
+    matTensor_->computeUmat(LMat, gradDisp_eval);
+    matTensor_->computeDensity(rho, valZ_eval, 1);  // first derivative
+    fst::scalarMultiplyDataData(rhoLMat, rho, LMat);
+
+    for (int i=0; i<d; ++i)
+      fst::dotMultiplyDataField(CBrhoLMat[i], rhoLMat, matTensor_->CBdetJ(i));
+
+    /*** Evaluate weak form of the residual. ***/
+    for (int i=0; i<d; ++i)
+      fst::integrate(J[0][i],feDens_->N(),CBrhoLMat[i],false);
+
+    // Combine the Hessians.
+    FieldUtils::combineFieldCoeff<Real>(hess, J, fieldInfoDens_, fieldInfo_);
+  }
+
+  void Hessian_21(scalar_view &hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> &z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    int c  = fe_->gradN().extent_int(0);
+    int f  = fe_->gradN().extent_int(1);
+    int fd = feDens_->gradN().extent_int(1);
+    int p  = fe_->gradN().extent_int(2);
+    int d  = fe_->gradN().extent_int(3);
+    int matd = matTensor_->getMatrixDim();
+
+    // Initialize Hessians.
+    std::vector<std::vector<scalar_view>> J(d);
+    for (int i=0; i<d; ++i)
+      J[i].push_back(scalar_view("jac",c,f,fd));
+
+    // Split u_coeff into components.
+    std::vector<scalar_view> L, Z;
+    FieldUtils::splitFieldCoeff<Real>(L, l_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoDens_);
+
+    // Apply Dirichlet conditions to the multipliers.
+    dirichlet_->applyMultiplier(L);
+
+    // Evaluate/interpolate finite element fields on cells.
+    std::vector<scalar_view> gradDisp_eval(d), CBrhoLMat(d);
+    scalar_view LMat;
+    scalar_view rho("rho", c, p);
+    scalar_view valZ_eval("valZ_eval", c, p);
+    scalar_view rhoLMat("rhoLMat", c, p, matd);
+    for (int i=0; i<d; ++i) {
+      CBrhoLMat[i]     = scalar_view("CBrhoLMat", c, f, p);
+      gradDisp_eval[i] = scalar_view("gradDisp_eval", c, p, d);
+      fe_->evaluateGradient(gradDisp_eval[i], L[i]);
+    }
+    feDens_->evaluateValue(valZ_eval, Z[0]);
+    matTensor_->computeUmat(LMat, gradDisp_eval);
+    matTensor_->computeDensity(rho, valZ_eval, 1);  // first derivative
+    fst::scalarMultiplyDataData(rhoLMat, rho, LMat);
+
+    for (int i=0; i<d; ++i)
+      fst::dotMultiplyDataField(CBrhoLMat[i], rhoLMat, matTensor_->CBdetJ(i));
+
+    /*** Evaluate weak form of the residual. ***/
+    for (int i=0; i<d; ++i)
+      fst::integrate(J[i][0],CBrhoLMat[i],feDens_->N(),false);
+
+    // Combine the Hessians.
+    FieldUtils::combineFieldCoeff(hess, J, fieldInfo_, fieldInfoDens_);
+  }
+
+  void Hessian_22(scalar_view &hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> &z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    int c  = fe_->gradN().extent_int(0);
+    int fd = feDens_->gradN().extent_int(1);
+    int p  = fe_->gradN().extent_int(2);
+    int d  = fe_->gradN().extent_int(3);
+    int matd = matTensor_->getMatrixDim();
+
+    // Initialize Hessians.
+    std::vector<std::vector<scalar_view>> J(1);
+    J[0].push_back(scalar_view("jac",c,fd,fd));
+
+    // Split u_coeff into components.
+    std::vector<scalar_view> U, L, Z;
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(L, l_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoDens_);
+
+    // Apply Dirichlet conditions to the multipliers.
+    dirichlet_->applyMultiplier(L);
+
+    // Evaluate/interpolate finite element fields on cells.
+    std::vector<scalar_view> gradDispU_eval(d), gradDispL_eval(d);
+    scalar_view UMat, LMat;
+    scalar_view rho("rho", c, p);
+    scalar_view valZ_eval("valZ_eval", c, p);
+    scalar_view rhoLMat("rhoLMat", c, p, matd);
+    scalar_view CUMat("CUMat", c, p, matd);
+    scalar_view CUrhoLMat("CUrhoLMat", c, p);
+    scalar_view NCUrhoLMat("NCUrhoLMat", c, fd, p);
+    for (int i=0; i<d; ++i) {
+      gradDispU_eval[i] = scalar_view("gradDispU_eval", c, p, d);
+      fe_->evaluateGradient(gradDispU_eval[i], U[i]);
+      gradDispL_eval[i] = scalar_view("gradDispL_eval", c, p, d);
+      fe_->evaluateGradient(gradDispL_eval[i], L[i]);
+    }
+    feDens_->evaluateValue(valZ_eval, Z[0]);
+    matTensor_->computeUmat(UMat, gradDispU_eval);
+    matTensor_->computeUmat(LMat, gradDispL_eval);
+    matTensor_->computeDensity(rho, valZ_eval, 2);  // second derivative
+    fst::scalarMultiplyDataData(rhoLMat, rho, LMat);
+    matTensor_->applyTensor(CUMat, UMat);
+
+    fst::dotMultiplyDataData(CUrhoLMat, rhoLMat, CUMat);
+    fst::scalarMultiplyDataField(NCUrhoLMat, CUrhoLMat, feDens_->N());
+
+    /*** Evaluate weak form of the residual. ***/
+    fst::integrate(J[0][0],NCUrhoLMat,feDens_->NdetJ(),false);
+
+    // Combine the Hessians.
+    FieldUtils::combineFieldCoeff(hess, J, fieldInfoDens_, fieldInfoDens_);
+  }
+
+  void RieszMap_1(scalar_view & riesz) override {
+    //throw Exception::NotImplemented(">>> (PDE_TopoOpt::RieszMap_1): Not implemented.");
+
+    // Retrieve dimensions.
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    int d = fe_->gradN().extent_int(3);
+ 
+    // Initialize Jacobians.
+    std::vector<std::vector<scalar_view>> J(d);
+    for (int i=0; i<d; ++i) {
+      for (int j=0; j<d; ++j)
+        J[i].push_back(scalar_view("riesz1",c,f,f));
+    }
+
+    for (int i=0; i<d; ++i) {
+      Kokkos::deep_copy(J[i][i],fe_->stiffMat());
+      rst::add(J[i][i],fe_->massMat());
+    }
+
+    // Combine the jacobians.
+    FieldUtils::combineFieldCoeff<Real>(riesz, J, fieldInfo_, fieldInfo_);
+  }
+
+  void RieszMap_2(scalar_view & riesz) {
+    //throw Exception::NotImplemented(">>> (PDE_TopoOpt::RieszMap_2): Not implemented.");
+
+    // Retrieve dimensions.
+    int c = feDens_->gradN().extent_int(0);
+    int f = feDens_->gradN().extent_int(1);
+
+    riesz = scalar_view("riesz2",c,f,f);
+    Kokkos::deep_copy(riesz,feDens_->massMat());
+  }
+
+  // This must be called before getFields2
+  void setDensityFields(const std::vector<basis_ptr> &basisPtrs) {
+    if (getFields2called_) {
+      TEUCHOS_TEST_FOR_EXCEPTION(getFields2called_, std::invalid_argument,
+        ">>> PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/pde_elasticity.hpp: Must call before getFields2!");
+    }
+    else {
+      basisPtrDens_  = basisPtrs[0];
+      basisPtrsDens_ = basisPtrs;
+
+      numDofsDens_ = 0;
+      numFieldsDens_ = basisPtrsDens_.size();
+      offsetDens_.resize(numFieldsDens_);
+      numFieldDofsDens_.resize(numFieldsDens_);
+      for (int i=0; i<numFieldsDens_; ++i) {
+        if (i==0)
+          offsetDens_[i]  = 0;
+        else
+          offsetDens_[i]  = offsetDens_[i-1] + basisPtrsDens_[i-1]->getCardinality();
+        numFieldDofsDens_[i] = basisPtrsDens_[i]->getCardinality();
+        numDofsDens_ += numFieldDofsDens_[i];
+      }
+    }
+  }
+
+  std::vector<basis_ptr> getFields() override {
+    return basisPtrs_;
+  }
+
+  std::vector<basis_ptr> getFields2() override {
+    getFields2called_ = true;
+    return basisPtrsDens_;
+  }
+
+  void setCellNodes(const scalar_view &volCellNodes,
+                    const std::vector<std::vector<scalar_view>> &bdryCellNodes,
+                    const std::vector<std::vector<std::vector<int>>> &bdryCellLocIds) override {
+    volCellNodes_ = volCellNodes;
+    bdryCellNodes_ = bdryCellNodes;
+    bdryCellLocIds_ = bdryCellLocIds;
+    // Finite element definition.
+    fe_ = ROL::makePtr<fe_type>(volCellNodes_,basisPtr_,cellCub_);
+    feDens_ = ROL::makePtr<fe_type>(volCellNodes_,basisPtrDens_,cellCub_,false);
+    fidx_ = fe_->getBoundaryDofs();
+    if (!traction_->isNull())
+      traction_->setCellNodes(bdryCellNodes_,bdryCellLocIds_);
+    dirichlet_->setCellNodes(bdryCellNodes_,bdryCellLocIds_,fidx_);
+    matTensor_->setFE(fe_);
+    // Construct boundary FE
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      feBdry_.resize(numSideSets);
+      for (int i = 0; i < numSideSets; ++i) {
+        int numLocSides = bdryCellNodes[i].size();
+        feBdry_[i].resize(numLocSides);
+        for (int j = 0; j < numLocSides; ++j) {
+          if (bdryCellNodes[i][j] != scalar_view()) {
+            feBdry_[i][j] = ROL::makePtr<fe_type>(bdryCellNodes[i][j],basisPtr_,bdryCub_,j);
+          }
+        }
+      }
+    }
+  }
+
+  void setFieldPattern(const std::vector<std::vector<int>> &fieldPattern,
+                       const std::vector<std::vector<int>> &fieldPattern2) override {
+    fieldPattern_     = fieldPattern;
+    fieldPatternDens_ = fieldPattern2;
+    fieldInfo_     = ROL::makePtr<FieldUtils::FieldInfo>(numFields_, numDofs_, numFieldDofs_, fieldPattern_);
+    fieldInfoDens_ = ROL::makePtr<FieldUtils::FieldInfo>(numFieldsDens_, numDofsDens_, numFieldDofsDens_, fieldPatternDens_);
+  }
+
+  const ROL::Ptr<fe_type> getStateFE(void) const {
+    return fe_;
+  }
+
+  const ROL::Ptr<fe_type> getDensityFE(void) const {
+    return feDens_;
+  }
+
+  const std::vector<std::vector<ROL::Ptr<fe_type>>> getBdryFE(void) const {
+    return feBdry_;
+  }
+
+  const std::vector<std::vector<std::vector<int>>> getBdryCellLocIds(void) const {
+    return bdryCellLocIds_;
+  }
+
+  const ROL::Ptr<FieldUtils::FieldInfo> getStateFieldInfo(void) const {
+    return fieldInfo_;
+  }
+
+  const ROL::Ptr<FieldUtils::FieldInfo> getDensityFieldInfo(void) const {
+    return fieldInfoDens_;
+  }
+
+  const ROL::Ptr<Traction<Real,DeviceType>> getTraction(void) const {
+    return traction_;
+  }
+
+  const ROL::Ptr<MaterialTensor<Real,DeviceType>> getMaterialTensor(void) const {
+    return matTensor_;
+  }
+
+}; // PDE_TopoOpt
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/pde_filterK.hpp b/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/pde_filterK.hpp
new file mode 100644
index 000000000000..a832c71a656f
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/pde_filterK.hpp
@@ -0,0 +1,253 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  pde_filter.hpp
+    \brief Implements the local PDE interface for the structural topology
+           optimization problem.
+*/
+
+#ifndef PDE_TOPO_OPT_FILTERK_HPP
+#define PDE_TOPO_OPT_FILTERK_HPP
+
+#include "../../../TOOLS/pdeK.hpp"
+#include "../../../TOOLS/feK.hpp"
+
+#include "Intrepid2_HVOL_C0_FEM.hpp"
+#include "Intrepid2_HGRAD_QUAD_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_QUAD_C2_FEM.hpp"
+#include "Intrepid2_HGRAD_HEX_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_HEX_C2_FEM.hpp"
+#include "Intrepid2_DefaultCubatureFactory.hpp"
+#include "Intrepid2_FunctionSpaceTools.hpp"
+#include "Intrepid2_CellTools.hpp"
+
+#include "ROL_Ptr.hpp"
+
+
+template <class Real, class DeviceType>
+class PDE_Filter : public PDE<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  // Finite element basis information
+  basis_ptr basisPtr_, basisPtrDens_;
+  std::vector<basis_ptr> basisPtrs_, basisPtrsDens_;
+  // Cell cubature information
+  ROL::Ptr<Intrepid2::Cubature<DeviceType,Real,Real>> cellCub_;
+  // Cell node information
+  scalar_view volCellNodes_;
+  // Finite element definition
+  ROL::Ptr<fe_type> fe_, feDens_;
+  // Problem parameters.
+  Real lengthScale_;
+
+public:
+  PDE_Filter(ROL::ParameterList &parlist) {
+    // Finite element fields.
+    int basisOrder     = parlist.sublist("Problem").get("Filter Basis Order",1);
+    int basisOrderDens = parlist.sublist("Problem").get("Density Basis Order",0);
+    int cubDegree      = parlist.sublist("Problem").get("Cubature Degree",4);
+//    int bdryCubDegree = parlist.sublist("Problem").get("Boundary Cubature Degree",2);
+    int probDim        = parlist.sublist("Problem").get("Problem Dimension",3);
+    TEUCHOS_TEST_FOR_EXCEPTION(probDim>3||probDim<2, std::invalid_argument,
+      ">>> PDE-OPT/poisson/pde_poisson.hpp: Problem dimension is not 2 or 3!");
+    TEUCHOS_TEST_FOR_EXCEPTION(basisOrder>2||basisOrder<1, std::invalid_argument,
+      ">>> PDE-OPT/poisson/pde_poisson.hpp: Basis order is not 1 or 2!");
+    if (probDim == 2) {
+      if (basisOrder == 1)
+        basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C1_FEM<DeviceType,Real,Real>>();
+      else if (basisOrder == 2)
+        basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C2_FEM<DeviceType,Real,Real>>();
+    }
+    else if (probDim == 3) {
+      if (basisOrder == 1)
+        basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_HEX_C1_FEM<DeviceType,Real,Real>>();
+      else if (basisOrder == 2)
+        basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_HEX_C2_FEM<DeviceType,Real,Real>>();
+    }
+    basisPtrs_.clear();
+    basisPtrs_.push_back(basisPtr_);  // Filtered Density component
+    shards::CellTopology cellType = basisPtr_->getBaseCellTopology(); // get cell type from basis
+    if (basisOrderDens == 1) {
+      if (probDim == 2)
+        basisPtrDens_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C1_FEM<DeviceType,Real,Real>>();
+      else if (probDim == 3)
+        basisPtrDens_ = ROL::makePtr<Intrepid2::Basis_HGRAD_HEX_C1_FEM<DeviceType,Real,Real>>();
+    }
+    else
+      basisPtrDens_ = ROL::makePtr<Intrepid2::Basis_HVOL_C0_FEM<DeviceType,Real,Real>>(cellType);
+    basisPtrsDens_.clear();
+    basisPtrsDens_.push_back(basisPtrDens_); // Density component
+
+    // Quadrature rules.
+    Intrepid2::DefaultCubatureFactory cubFactory;                            // create cubature factory
+    cellCub_ = cubFactory.create<DeviceType,Real,Real>(cellType, cubDegree); // create default cubature
+
+    // Other problem parameters.
+    Real filterRadius = parlist.sublist("Problem").get("Filter Radius",  0.1);
+    lengthScale_ = std::pow(filterRadius, 2)/static_cast<Real>(12);
+  }
+
+  void residual(scalar_view & res,
+                const scalar_view u_coeff,
+                const scalar_view z_coeff = scalar_view(),
+                const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c = fe_->gradN().extent_int(0);
+    const int f = fe_->gradN().extent_int(1);
+    const int p = fe_->gradN().extent_int(2);
+    const int d = fe_->gradN().extent_int(3);
+ 
+    // Initialize residuals.
+    res = scalar_view("res",c,f);
+
+    // Evaluate/interpolate finite element fields on cells.
+    scalar_view valU_eval("valU_eval",c, p);
+    scalar_view valZ_eval("valZ_eval",c, p);
+    scalar_view gradU_eval("gradU_eval",c, p, d);
+    fe_->evaluateValue(valU_eval, u_coeff);
+    feDens_->evaluateValue(valZ_eval, z_coeff);
+    fe_->evaluateGradient(gradU_eval, u_coeff);
+
+    rst::scale(gradU_eval, lengthScale_);
+    rst::scale(valZ_eval, static_cast<Real>(-1));
+
+    /*** Evaluate weak form of the residual. ***/
+    fst::integrate(res,gradU_eval,fe_->gradNdetJ(),false);
+    fst::integrate(res,valU_eval,fe_->NdetJ(),true);
+    fst::integrate(res,valZ_eval,fe_->NdetJ(),true);
+  }
+
+  void Jacobian_1(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c = fe_->gradN().extent_int(0);
+    const int f = fe_->gradN().extent_int(1);
+ 
+    // Initialize Jacobians.
+    jac = scalar_view("jac",c,f,f);
+
+    /*** Evaluate weak form of the Jacobian. ***/
+    Kokkos::deep_copy(jac,fe_->stiffMat());
+    rst::scale(jac, lengthScale_); // ls*gradN1 . gradN2
+    rst::add(jac,fe_->massMat());  // + N1 * N2
+  }
+
+
+  void Jacobian_2(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c  = fe_->gradN().extent_int(0);
+    const int f  = fe_->gradN().extent_int(1);
+    const int fd = feDens_->gradN().extent_int(1);
+ 
+    // Initialize Jacobians.
+    jac = scalar_view("jac",c,f,fd);
+
+    /*** Evaluate weak form of the Jacobian. ***/
+    fst::integrate(jac,fe_->NdetJ(),feDens_->N(),false);
+    rst::scale(jac, static_cast<Real>(-1));  // -N1 * N2
+  }
+
+  void Hessian_11(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Filter::Hessian_11): Hessian is zero.");
+  }
+
+  void Hessian_12(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Filter::Hessian_12): Hessian is zero.");
+  }
+
+  void Hessian_21(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Filter::Hessian_21): Hessian is zero.");
+  }
+
+  void Hessian_22(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Filter::Hessian_22): Hessian is zero.");
+  }
+
+  void RieszMap_1(scalar_view & riesz) override {
+    //throw Exception::NotImplemented(">>> (PDE_Filter::RieszMap_1): Not implemented.");
+    // Retrieve dimensions.
+    const int c = fe_->gradN().extent_int(0);
+    const int f = fe_->gradN().extent_int(1);
+ 
+    // Initialize Jacobians.
+    riesz = scalar_view("riesz1",c,f,f);
+    Kokkos::deep_copy(riesz,fe_->stiffMat());
+    rst::add(riesz,fe_->massMat());
+  }
+
+  void RieszMap_2(scalar_view & riesz) override {
+    //throw Exception::NotImplemented(">>> (PDE_Filter::RieszMap_2): Not implemented.");
+    // Retrieve dimensions.
+    const int c = feDens_->gradN().extent_int(0);
+    const int f = feDens_->gradN().extent_int(1);
+ 
+    // Initialize Jacobians.
+    riesz = scalar_view("riesz2",c,f,f);
+    Kokkos::deep_copy(riesz,feDens_->massMat());
+  }
+
+  std::vector<basis_ptr> getFields() override {
+    return basisPtrs_;
+  }
+
+  std::vector<basis_ptr> getFields2() override {
+    return basisPtrsDens_;
+  }
+
+  void setCellNodes(const scalar_view &volCellNodes,
+                    const std::vector<std::vector<scalar_view>> &bdryCellNodes,
+                    const std::vector<std::vector<std::vector<int>>> &bdryCellLocIds) override {
+    volCellNodes_ = volCellNodes;
+    // Finite element definition.
+    fe_     = ROL::makePtr<fe_type>(volCellNodes_,basisPtr_,cellCub_);
+    feDens_ = ROL::makePtr<fe_type>(volCellNodes_,basisPtrDens_,cellCub_,false);
+  }
+
+  void setFieldPattern(const std::vector<std::vector<int>> &fieldPattern,
+                       const std::vector<std::vector<int>> &fieldPattern2) override {}
+
+  const ROL::Ptr<fe_type> getStateFE(void) const {
+    return fe_;
+  }
+
+  const ROL::Ptr<fe_type> getDensityFE(void) const {
+    return feDens_;
+  }
+
+}; // PDE_Filter
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/tractionK.hpp b/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/tractionK.hpp
new file mode 100644
index 000000000000..a1d878c0b678
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/tractionK.hpp
@@ -0,0 +1,191 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  traction.hpp
+    \brief Implements traction loads for the structural topology
+           optimization problem.
+*/
+
+#ifndef ROL_PDEOPT_ELASTICITY_TRACTIONK_HPP
+#define ROL_PDEOPT_ELASTICITY_TRACTIONK_HPP
+
+#include <vector>
+#include "ROL_Ptr.hpp"
+#include "ROL_ParameterList.hpp"
+#include "../../../TOOLS/feK.hpp"
+
+template<class Real, class DeviceType>
+class Traction {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  std::vector<int> sidesets_, sideids_;
+  std::vector<Real> loadMagnitude_;
+  std::vector<Real> loadPolar_, loadAzimuth_;
+  std::vector<std::vector<scalar_view>> bdryCellNodes_;
+  std::vector<std::vector<std::vector<int>>> bdryCellLocIds_;
+  int dim_, offset_;
+
+protected:
+  Real DegreesToRadians(Real deg) const {
+    return deg * static_cast<Real>(M_PI) / static_cast<Real>(180);
+  }
+
+public:
+  virtual ~Traction() {}
+
+  Traction(void) {};
+
+  Traction(int dim) : dim_(dim) {
+    TEUCHOS_TEST_FOR_EXCEPTION(dim > 3 || dim < 2, std::invalid_argument,
+      ">>> PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/traction.hpp: Problem dimension is not 2 or 3!");
+    if (dim==2) {
+      offset_ = 0;
+      sidesets_.push_back(2);
+      sideids_.push_back(0);
+      loadMagnitude_.push_back(static_cast<Real>(1));
+      loadPolar_.push_back(static_cast<Real>(270));
+      loadAzimuth_.push_back(static_cast<Real>(0));
+    }
+    else {
+      offset_ = 0;
+      sidesets_.push_back(2);
+      sideids_.push_back(1);
+      loadMagnitude_.push_back(static_cast<Real>(1));
+      loadPolar_.push_back(static_cast<Real>(270));
+      loadAzimuth_.push_back(static_cast<Real>(90));
+    }
+
+    int polarSize = loadPolar_.size();
+    for (int i=0; i<polarSize; ++i)
+      loadPolar_[i] = DegreesToRadians(loadPolar_[i]);
+
+    int azimuthSize = loadAzimuth_.size();
+    for (int i=0; i<azimuthSize; ++i)
+      loadAzimuth_[i] = DegreesToRadians(loadAzimuth_[i]);
+  }
+
+  bool isNull(void) const {
+    return (sidesets_.size()==0);
+  }
+
+  void setCellNodes(const std::vector<std::vector<scalar_view>> &bdryCellNodes,
+                    const std::vector<std::vector<std::vector<int>>> &bdryCellLocIds) {
+    bdryCellNodes_  = bdryCellNodes;
+    bdryCellLocIds_ = bdryCellLocIds;
+  }
+
+  virtual Real evaluate(const int                dir,
+                        const int                sideset,
+                        const std::vector<Real> &param) const {
+//    const int numSideSets = sidesets_.size();
+    const int paramSize   = param.size();
+    Real val(0);
+    Real loadMagNoise(0), loadAngNoise0(0);
+    if (paramSize > 0 + dim_*sideset)
+      loadMagNoise  = param[offset_ + 0 + dim_*sideset];
+    if (paramSize > 1 + dim_*sideset)
+      loadAngNoise0 = DegreesToRadians(param[offset_ + 1 + dim_*sideset]);
+    const Real loadMagnitude = loadMagnitude_[sideset] + loadMagNoise;
+    const Real loadAngle0    = loadPolar_[sideset]     + loadAngNoise0;
+
+    if (dim_==2) {
+      if (dir==0)
+        val = loadMagnitude*std::cos(loadAngle0);
+      if (dir==1)
+        val = loadMagnitude*std::sin(loadAngle0);
+    }
+    if (dim_==3) {
+      Real loadAngNoise1(0);
+      if (paramSize > 2 + dim_*sideset) {
+        loadAngNoise1 = DegreesToRadians(param[offset_ + 2 + dim_*sideset]);
+      }
+      const Real loadAngle1 = loadAzimuth_[sideset] + loadAngNoise1;
+
+      if (dir==0)
+        val = loadMagnitude*std::sin(loadAngle0)*std::cos(loadAngle1);
+      if (dir==1)
+        val = loadMagnitude*std::sin(loadAngle0)*std::sin(loadAngle1);
+      if (dir==2)
+        val = loadMagnitude*std::cos(loadAngle0);
+    }
+    return val;
+  }
+
+  void compute(std::vector<std::vector<std::vector<scalar_view>>> &traction,
+               const std::vector<std::vector<ROL::Ptr<fe_type>>> &fe,
+               const std::vector<Real> &param,
+               Real scale = Real(1)) const {
+    traction.clear();
+    traction.resize(bdryCellLocIds_.size());
+    const int numSideSets = sidesets_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        const int numLocalSideIds = bdryCellLocIds_[sidesets_[i]].size();
+        traction[sidesets_[i]].resize(numLocalSideIds);
+        for (int j = 0; j < numLocalSideIds; ++j) {
+          const int numCellsSide = bdryCellLocIds_[sidesets_[i]][j].size();
+          if (numCellsSide > 0) {
+            const int numCubPerSide = fe[sidesets_[i]][j]->cubPts().extent_int(1);
+            traction[sidesets_[i]][j].resize(dim_);
+            for (int k = 0; k < dim_; ++k) {
+              traction[sidesets_[i]][j][k] = scalar_view("traction", numCellsSide, numCubPerSide);
+              for (int c = 0; c < numCellsSide; ++c) {
+                for (int p = 0; p < numCubPerSide; ++p)
+                  (traction[sidesets_[i]][j][k])(c,p) = scale*evaluate(k,i,param);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  void apply(std::vector<scalar_view> &R,
+             const std::vector<std::vector<ROL::Ptr<fe_type>>> &fe,
+             const std::vector<Real> &param,
+             Real scale = Real(1)) const {
+    const int numSideSets = sidesets_.size();
+    const int nf = R[0].extent_int(1);
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        const int numLocalSideIds = bdryCellLocIds_[sidesets_[i]].size();
+        for (int j = 0; j < numLocalSideIds; ++j) {
+          const int numCellsSide = bdryCellLocIds_[sidesets_[i]][j].size();
+          if (numCellsSide > 0) {
+            const int numCubPerSide = fe[sidesets_[i]][j]->cubPts().extent_int(1);
+            for (int k = 0; k < dim_; ++k) {
+              scalar_view traction("traction", numCellsSide, numCubPerSide);
+              for (int c = 0; c < numCellsSide; ++c) {
+                for (int p = 0; p < numCubPerSide; ++p)
+                  traction(c,p) = scale*evaluate(k,i,param);
+              }
+              scalar_view tractionResidual("tractionResidual", numCellsSide, nf);
+              fst::integrate(tractionResidual,traction,fe[sidesets_[i]][j]->NdetJ(),false);
+              // Add traction residual to volume residual
+              for (int l = 0; l < numCellsSide; ++l) {
+                int cidx = bdryCellLocIds_[sidesets_[i]][j][l];
+                for (int m = 0; m < nf; ++m)
+                  (R[k])(cidx,m) += tractionResidual(l,m);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/volume_conK.hpp b/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/volume_conK.hpp
new file mode 100644
index 000000000000..b94b2d03e238
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/published/NonsmoothTR_BaraldiKouri2022/src/volume_conK.hpp
@@ -0,0 +1,139 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+#ifndef TOPOPT_VOLUME_CONK_H
+#define TOPOPT_VOLUME_CONK_H
+
+#include "ROL_Constraint.hpp"
+#include "ROL_SingletonVector.hpp"
+#include "../../../TOOLS/qoiK.hpp"
+#include "../../../TOOLS/assemblerK.hpp"
+#include "../../../TOOLS/pdevectorK.hpp"
+
+template <class Real, class DeviceType>
+class TopOptVolumeConstraint : public ROL::Constraint<Real> {
+private:
+  // Elasticity PDE
+  const ROL::Ptr<QoI<Real,DeviceType>> qoi_;
+  const ROL::Ptr<Assembler<Real,DeviceType>> assembler_;
+  bool assemble_;
+  ROL::Ptr<Tpetra::MultiVector<>> controlZero_;
+  ROL::Ptr<ROL::Vector<Real>> V_, Vdual_;
+  Real V0_;
+
+  unsigned nupda_;
+  unsigned nfval_;
+  unsigned njaco_;
+  unsigned najac_;
+  unsigned nhess_;
+  unsigned nasse_;
+
+public:
+  TopOptVolumeConstraint(
+      const ROL::Ptr<QoI<Real,DeviceType>>       &qoi,
+      const ROL::Ptr<Assembler<Real,DeviceType>> &assembler,
+      const ROL::Ptr<const ROL::Vector<Real>>    &z)
+    : qoi_(qoi), assembler_(assembler), assemble_(true),
+      nupda_(0), nfval_(0), njaco_(0), najac_(0), nhess_(0), nasse_(0) {
+    controlZero_ = assembler_->createControlVector(); controlZero_->putScalar(0.0);
+    V_     = z->dual().clone();
+    Vdual_ = z->clone();
+    assemble();
+  }
+
+  void update( const ROL::Vector<Real> &z, bool flag = true, int iter = -1 ) {
+    nupda_++;
+  }
+
+  void value( ROL::Vector<Real> &c, const ROL::Vector<Real> &z, Real &tol ) override {
+    nfval_++;
+    dynamic_cast<ROL::SingletonVector<Real>&>(c).setValue(V0_ + Vdual_->dot(z));
+  }
+
+  void applyJacobian(ROL::Vector<Real> &jv, const ROL::Vector<Real> &v,
+                     const ROL::Vector<Real> &z, Real &tol) override {
+    njaco_++;
+    dynamic_cast<ROL::SingletonVector<Real>&>(jv).setValue(Vdual_->dot(v));
+  }
+
+  void applyAdjointJacobian( ROL::Vector<Real> &jv, const ROL::Vector<Real> &v, const ROL::Vector<Real> &z, Real &tol ) override {
+    najac_++;
+    Real vval = dynamic_cast<const ROL::SingletonVector<Real>&>(v).getValue();
+    jv.set(*V_);
+    jv.scale(vval);
+  }
+
+  void applyAdjointHessian( ROL::Vector<Real> &ahwv, const ROL::Vector<Real> &w, const ROL::Vector<Real> &v, const ROL::Vector<Real> &z, Real &tol ) override {
+    nhess_++;
+    ahwv.zero();
+  }
+
+  void summarize(std::ostream &stream) const {
+    stream << std::endl;
+    stream << std::string(114,'=') << std::endl;
+    stream << "  Volume_Constraint::summarize" << std::endl;
+    stream << "    Number of calls to update:               " << nupda_ << std::endl;
+    stream << "    Number of calls to value:                " << nfval_ << std::endl;
+    stream << "    Number of calls to applyJacobian:        " << njaco_ << std::endl;
+    stream << "    Number of calls to applyAdjointJacobian: " << najac_ << std::endl;
+    stream << "    Number of calls to applyAdjointHessian:  " << nhess_ << std::endl;
+    stream << "    Number of assemblies:                    " << nasse_ << std::endl;
+    stream << std::string(114,'=') << std::endl;
+    stream << std::endl;
+  }
+
+  void reset(void) {
+    nupda_ = 0; nfval_ = 0; njaco_ = 0; najac_ = 0; nhess_ = 0; nasse_ = 0;
+  }
+
+// For parametrized (stochastic) objective functions and constraints
+public:
+  void setParameter(const std::vector<Real> &param) override {
+    ROL::Constraint<Real>::setParameter(param);
+    qoi_->setParameter(param);
+  }
+
+private:
+
+  ROL::Ptr<Tpetra::MultiVector<>> getField(ROL::Vector<Real> &x) const {
+    ROL::Ptr<Tpetra::MultiVector<>> xp;
+    try {
+      xp = dynamic_cast<ROL::TpetraMultiVector<Real>&>(x).getVector();
+    }
+    catch (std::exception &e) {
+      try {
+        ROL::Ptr<ROL::TpetraMultiVector<Real>> xvec
+          = dynamic_cast<PDE_OptVector<Real>&>(x).getField();
+        if ( xvec == ROL::nullPtr ) {
+          xp = ROL::nullPtr;
+        }
+        else {
+          xp = xvec->getVector();
+        }
+      }
+      catch (std::exception &ee) {
+        xp = ROL::nullPtr;
+      }
+    }
+    return xp;
+  }
+
+  void assemble(void) {
+    nasse_++;
+    if (assemble_) {
+      ROL::Ptr<Tpetra::MultiVector<>> V = getField(*V_);
+      V0_ = assembler_->assembleQoIValue(qoi_,ROL::nullPtr,controlZero_);
+      assembler_->assembleQoIGradient2(V,qoi_,ROL::nullPtr,controlZero_);
+      Vdual_->set(V_->dual());
+      assemble_ = false;
+    }
+  }
+}; // class Volume_Constraint
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/dualksvm/CMakeLists.txt b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/dualksvm/CMakeLists.txt
index 7542e7a416b7..9d5153c5fc60 100644
--- a/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/dualksvm/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/dualksvm/CMakeLists.txt
@@ -11,13 +11,13 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_01
     SOURCES example_01.cpp
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     TRSPG_DualKSVMDataCopy
     SOURCE_FILES
       data_matrix.txt label_vector.txt info.txt input.xml
diff --git a/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/dualksvm/example_01.cpp b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/dualksvm/example_01.cpp
index 72d8d56d5ee2..b02ffcbd9ddd 100644
--- a/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/dualksvm/example_01.cpp
+++ b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/dualksvm/example_01.cpp
@@ -21,8 +21,7 @@
 #include "ROL_BLAS.hpp"
 
 #include "Teuchos_Time.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 #include <fstream>
@@ -146,7 +145,7 @@ class DualSVMConstraint : public ROL::StdConstraint<Real> {
 };
 
 int main(int argc, char *argv[]) {
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/dualsvm/CMakeLists.txt b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/dualsvm/CMakeLists.txt
index df55ae02c286..908da6e218ba 100644
--- a/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/dualsvm/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/dualsvm/CMakeLists.txt
@@ -11,13 +11,13 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_01
     SOURCES example_01.cpp
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     TRSPG_DualSVMDataCopy
     SOURCE_FILES
       input.xml
diff --git a/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/dualsvm/example_01.cpp b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/dualsvm/example_01.cpp
index e02722afd4d3..a6e35dc901d4 100644
--- a/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/dualsvm/example_01.cpp
+++ b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/dualsvm/example_01.cpp
@@ -18,8 +18,7 @@
 #include "ROL_Stream.hpp"
 
 #include "Teuchos_Time.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 #include <random>
@@ -132,7 +131,7 @@ class DualSVMConstraint : public ROL::StdConstraint<Real> {
 };
 
 int main(int argc, char *argv[]) {
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/CMakeLists.txt b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/CMakeLists.txt
index 3c16c24d36f9..269c26014611 100644
--- a/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/CMakeLists.txt
@@ -11,19 +11,19 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_01
     SOURCES example_01.cpp
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_02
     SOURCES example_02.cpp
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     TRSPG_TopoOptDataCopy
     SOURCE_FILES
       input_ex01.xml
@@ -39,5 +39,46 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
   )
 
+ENDIF()
+
+IF(${PROJECT_NAME}_ENABLE_Intrepid2 AND 
+   ${PROJECT_NAME}_ENABLE_Amesos2   AND
+   ${PROJECT_NAME}_ENABLE_Ifpack2   AND
+   ${PROJECT_NAME}_ENABLE_MueLu     AND
+   ${PROJECT_NAME}_ENABLE_Tpetra )
+
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/teuchos/src/sol)
+
+  # Need ROL_TpetraMultiVector.hpp
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
+
+  ROL_ADD_EXECUTABLE(
+    example_01_Kokkos
+    SOURCES example_01K.cpp
+    ADD_DIR_TO_NAME
+  )
+
+  ROL_ADD_EXECUTABLE(
+    example_02_Kokkos
+    SOURCES example_02K.cpp
+    ADD_DIR_TO_NAME
+  )
+
+  ROL_COPY_FILES_TO_BINARY_DIR(
+    TRSPG_TopoOptDataCopyK
+    SOURCE_FILES
+      input_ex01.xml
+      plotresults.m
+      bwr.m
+      plotdensity.m
+      processpics.m
+      picme
+      process3D.sh
+      meshfiles/truss2d.jou
+      meshfiles/truss3d.jou
+    SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}"
+    DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
+  )
 
 ENDIF()
diff --git a/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/example_01.cpp b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/example_01.cpp
index 8f0487bab2ee..7d5192d2ac51 100644
--- a/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/example_01.cpp
+++ b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/example_01.cpp
@@ -15,8 +15,7 @@
 #include "Teuchos_Comm.hpp"
 #include "Teuchos_Time.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -48,7 +47,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int>> comm
     = Tpetra::getDefaultComm();
   const int myRank = comm->getRank();
diff --git a/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/example_01K.cpp b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/example_01K.cpp
new file mode 100644
index 000000000000..7e643f0ab69e
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/example_01K.cpp
@@ -0,0 +1,288 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  example_01.cpp
+    \brief Shows how to solve the stuctural topology optimization problem.
+
+*/
+
+#include "Teuchos_Comm.hpp"
+#include "Teuchos_Time.hpp"
+#include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
+
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Version.hpp"
+
+#include <iostream>
+#include <algorithm>
+
+#include "ROL_Solver.hpp"
+#include "ROL_Bounds.hpp"
+#include "ROL_ConstraintFromObjective.hpp"
+#include "ROL_LinearCombinationObjective.hpp"
+
+#include "../../../TOOLS/pdevectorK.hpp"
+#include "../../../TOOLS/pdeobjectiveK.hpp"
+#include "../../../TOOLS/meshreaderK.hpp"
+
+#include "src/pde_elasticityK.hpp"
+#include "src/pde_filterK.hpp"
+#include "src/filtered_compliance_robjK.hpp"
+#include "src/volume_conK.hpp"
+#include "src/obj_volumeK.hpp"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+
+int main(int argc, char *argv[]) {
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+
+  /*** Initialize communicator. ***/
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  Kokkos::ScopeGuard kokkosScope (argc, argv);
+  auto comm = Tpetra::getDefaultComm();
+  const int myRank = comm->getRank();
+  if ((iprint > 0) && (myRank == 0))
+    outStream = ROL::makePtrFromRef(std::cout);
+  else
+    outStream = ROL::makePtrFromRef(bhs);
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+    RealT tol(1e-8);
+
+    /*** Read in XML input ***/
+    std::string filename = "input_ex01.xml";
+    auto parlist = ROL::getParametersFromXmlFile(filename);
+
+    // Retrieve parameters.
+    int probDim               = parlist->sublist("Problem").get("Problem Dimension", 2);
+    const RealT volFraction   = parlist->sublist("Problem").get("Volume Fraction", 0.4);
+    const RealT initDens      = volFraction;
+    const std::string minType  = "Compliance";
+    const std::string hessAppr = "None";
+    if (probDim==3) {
+      parlist->sublist("Mesh").set("File Name","meshfiles/truss3d.txt");
+    }
+    else {
+      probDim = 2;
+      parlist->sublist("Problem").set("Problem Dimension",probDim);
+      parlist->sublist("Mesh").set("File Name","meshfiles/truss2d.txt");
+    }
+    parlist->sublist("Problem").set("Use Inexact Linear Solves",false);
+    parlist->sublist("General").set("Inexact Objective Function",false);
+    parlist->sublist("General").set("Inexact Gradient",false);
+
+    /*** Initialize main data structure. ***/
+    int nProcs = comm->getSize();
+    if (nProcs == 1) nProcs = 0;
+    auto meshMgr = ROL::makePtr<MeshReader<RealT,DeviceT>>(*parlist,nProcs);
+    // Initialize PDE describing elasticity equations.
+    auto pde = ROL::makePtr<PDE_Elasticity<RealT,DeviceT>>(*parlist);
+
+    // Initialize the filter PDE.
+    auto pdeFilter = ROL::makePtr<PDE_Filter<RealT,DeviceT>>(*parlist);
+    pde->setDensityFields(pdeFilter->getFields());
+
+    // Initialize reduced compliance objective function.
+    auto robj_com = ROL::makePtr<TopOptFilteredComplianceObjective<RealT,DeviceT>>(
+                    pde,pdeFilter,meshMgr,comm,*parlist,*outStream);
+    auto assembler = robj_com->getAssembler();
+    auto assemblerFilter = robj_com->getFilterAssembler();
+
+    // Create vectors.
+    auto u_ptr = assembler->createStateVector();         u_ptr->putScalar(0.0);
+    auto p_ptr = assembler->createStateVector();         p_ptr->putScalar(0.0);
+    auto r_ptr = assembler->createResidualVector();      r_ptr->putScalar(0.0);
+    auto z_ptr = assemblerFilter->createControlVector(); z_ptr->putScalar(1.0);
+    auto up = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(u_ptr,pde,assembler,*parlist);
+    auto pp = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(p_ptr,pde,assembler,*parlist);
+    auto rp = ROL::makePtr<PDE_DualSimVector<RealT,DeviceT>>(r_ptr,pde,assembler,*parlist);
+    auto zp = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(z_ptr,pdeFilter,assemblerFilter,*parlist);
+    auto imul = ROL::makePtr<ROL::SingletonVector<RealT>>(0);
+
+    // Build volume objective function.
+    auto qoi_vol = ROL::makePtr<QoI_Volume_TopoOpt<RealT,DeviceT>>(pdeFilter->getDensityFE(),volFraction);
+    auto con_vol = ROL::makePtr<TopOptVolumeConstraint<RealT,DeviceT>>(qoi_vol,assemblerFilter,zp);
+
+    // Normalize compliance objective function
+    zp->setScalar(initDens);
+    RealT cs = ROL::dynamicPtrCast<TopOptFilteredComplianceObjective<RealT,DeviceT>>(robj_com)->normalize(*zp,tol);
+
+    // Output problem details
+    *outStream << std::endl;
+    *outStream << "Problem Data"          << std::endl;
+    *outStream << "  Dimension:         " << probDim << std::endl;
+    *outStream << "  SIMP Power:        "
+               << parlist->sublist("Problem").get("SIMP Power",3.0) << std::endl;
+    *outStream << "  Volume Fraction:   " << volFraction << std::endl;
+    *outStream << "  Initial Density:   " << initDens    << std::endl;
+    *outStream << "  Compliance Scale:  " << cs  << std::endl;
+    *outStream << std::endl;
+
+    // Create objective, constraint, multiplier and bounds
+
+    // Initialize bound constraints.
+    RealT lval = 0.0, uval = 1.0;
+    auto lop = zp->clone(); lop->setScalar(lval);
+    auto hip = zp->clone(); hip->setScalar(uval);
+    auto bnd = ROL::makePtr<ROL::Bounds<RealT>>(lop,hip);
+
+    // Set up optimization problem.
+    auto prob = ROL::makePtr<ROL::Problem<RealT>>(robj_com,zp);
+    prob->addBoundConstraint(bnd);
+    prob->addLinearConstraint("Volume",con_vol,imul);
+    prob->setProjectionAlgorithm(*parlist);
+
+    // Check derivatives.
+    bool derivCheck = parlist->sublist("Problem").get("Check derivatives",false);
+    if (derivCheck) prob->check(true,*outStream);
+
+    // Solve optimization problem.
+    ROL::Ptr<ROL::Solver<RealT>> solver;
+    std::vector<RealT> vals(6);
+
+    // SPG
+    zp->setScalar(initDens);
+    imul->zero();
+    robj_com->reset();
+    con_vol->reset();
+    prob->finalize(false,true,*outStream);
+    parlist->sublist("Step").set("Type","Spectral Gradient");
+    Teuchos::Time spgTimer("SPG Time", true);
+    solver = ROL::makePtr<ROL::Solver<RealT>>(prob,*parlist);
+    solver->solve(*outStream);
+    spgTimer.stop();
+    *outStream << "Total optimization time = " << spgTimer.totalElapsedTime() << " seconds.\n";
+    // Output.
+    robj_com->summarize(*outStream);
+    con_vol->summarize(*outStream);
+    vals[0] = solver->getAlgorithmState()->value;
+
+    // PQN
+    zp->setScalar(initDens);
+    imul->zero();
+    robj_com->reset();
+    con_vol->reset();
+    parlist->sublist("Step").set("Type","Line Search");
+    parlist->sublist("Step").sublist("Line Search").sublist("Descent Method").set("Type","Quasi-Newton Method");
+    Teuchos::Time pqnTimer("PQN Time", true);
+    solver = ROL::makePtr<ROL::Solver<RealT>>(prob,*parlist);
+    solver->solve(*outStream);
+    pqnTimer.stop();
+    *outStream << "Total optimization time = " << pqnTimer.totalElapsedTime() << " seconds.\n";
+    // Output.
+    robj_com->summarize(*outStream);
+    con_vol->summarize(*outStream);
+    vals[1] = solver->getAlgorithmState()->value;
+
+    // LMTR
+    zp->setScalar(initDens);
+    imul->zero();
+    robj_com->reset();
+    con_vol->reset();
+    parlist->sublist("Step").set("Type","Trust Region");
+    parlist->sublist("Step").sublist("Trust Region").set("Subproblem Model","Lin-More");
+    Teuchos::Time lmtrTimer("LMTR Time", true);
+    solver = ROL::makePtr<ROL::Solver<RealT>>(prob,*parlist);
+    solver->solve(*outStream);
+    lmtrTimer.stop();
+    *outStream << "Total optimization time = " << lmtrTimer.totalElapsedTime() << " seconds.\n";
+    // Output.
+    robj_com->summarize(*outStream);
+    con_vol->summarize(*outStream);
+    vals[2] = solver->getAlgorithmState()->value;
+
+    // TRSPG
+    zp->setScalar(initDens);
+    imul->zero();
+    robj_com->reset();
+    con_vol->reset();
+    parlist->sublist("Step").set("Type","Trust Region");
+    parlist->sublist("Step").sublist("Trust Region").set("Subproblem Model","SPG");
+    Teuchos::Time trspgTimer("TRSPG Time", true);
+    solver = ROL::makePtr<ROL::Solver<RealT>>(prob,*parlist);
+    solver->solve(*outStream);
+    trspgTimer.stop();
+    *outStream << "Total optimization time = " << trspgTimer.totalElapsedTime() << " seconds.\n";
+    // Output.
+    robj_com->summarize(*outStream);
+    con_vol->summarize(*outStream);
+    vals[3] = solver->getAlgorithmState()->value;
+
+    // AL-LMTR
+    zp->setScalar(initDens);
+    imul->zero();
+    robj_com->reset();
+    con_vol->reset();
+    prob->edit();
+    prob->finalize(true,true,*outStream);
+    parlist->sublist("Step").set("Type","Augmented Lagrangian");
+    parlist->sublist("Step").sublist("Augmented Lagrangian").set("Subproblem Step Type","Trust Region");
+    parlist->sublist("Step").sublist("Trust Region").set("Subproblem Model","Lin-More");
+    Teuchos::Time allmtrTimer("AL-LMTR Time", true);
+    solver = ROL::makePtr<ROL::Solver<RealT>>(prob,*parlist);
+    solver->solve(*outStream);
+    allmtrTimer.stop();
+    *outStream << "Total optimization time = " << allmtrTimer.totalElapsedTime() << " seconds.\n";
+    // Output.
+    robj_com->summarize(*outStream);
+    con_vol->summarize(*outStream);
+    vals[4] = solver->getAlgorithmState()->value;
+
+    // AL-LMTR
+    zp->setScalar(initDens);
+    imul->zero();
+    robj_com->reset();
+    con_vol->reset();
+    parlist->sublist("Step").set("Type","Augmented Lagrangian");
+    parlist->sublist("Step").sublist("Augmented Lagrangian").set("Subproblem Step Type","Trust Region");
+    parlist->sublist("Step").sublist("Trust Region").set("Subproblem Model","SPG");
+    Teuchos::Time altrspgTimer("AL-TRSPG Time", true);
+    solver = ROL::makePtr<ROL::Solver<RealT>>(prob,*parlist);
+    solver->solve(*outStream);
+    altrspgTimer.stop();
+    *outStream << "Total optimization time = " << altrspgTimer.totalElapsedTime() << " seconds.\n";
+    // Output.
+    robj_com->summarize(*outStream);
+    con_vol->summarize(*outStream);
+    vals[5] = solver->getAlgorithmState()->value;
+
+    RealT minval = std::min(vals[0],std::min(vals[1],std::min(vals[2],vals[3])));
+    *outStream << std::scientific << std::setprecision(16);
+    *outStream << "SPG      " << (vals[0]-minval)/minval << std::endl;
+    *outStream << "PQN      " << (vals[1]-minval)/minval << std::endl;
+    *outStream << "LMTR     " << (vals[2]-minval)/minval << std::endl;
+    *outStream << "TRSPG    " << (vals[3]-minval)/minval << std::endl;
+    *outStream << "AL-LMTR  " << (vals[4]-minval)/minval << std::endl;
+    *outStream << "AL-TRSPG " << (vals[5]-minval)/minval << std::endl;
+    *outStream << std::endl;
+
+    robj_com->printToFile(*zp,*outStream);
+
+    // Get a summary from the time monitor.
+    Teuchos::TimeMonitor::summarize();
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/example_02.cpp b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/example_02.cpp
index fff1ee30e424..e1f1b6197c05 100644
--- a/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/example_02.cpp
+++ b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/example_02.cpp
@@ -15,8 +15,7 @@
 #include "Teuchos_Comm.hpp"
 #include "Teuchos_Time.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -48,7 +47,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int>> comm
     = Tpetra::getDefaultComm();
   const int myRank = comm->getRank();
diff --git a/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/example_02K.cpp b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/example_02K.cpp
new file mode 100644
index 000000000000..7c7aebbe0803
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/example_02K.cpp
@@ -0,0 +1,187 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  example_01.cpp
+    \brief Shows how to solve the stuctural topology optimization problem.
+
+*/
+
+#include "Teuchos_Comm.hpp"
+#include "Teuchos_Time.hpp"
+#include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
+
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Version.hpp"
+
+#include <iostream>
+#include <algorithm>
+
+#include "ROL_Solver.hpp"
+#include "ROL_Bounds.hpp"
+#include "ROL_ConstraintFromObjective.hpp"
+#include "ROL_LinearCombinationObjective.hpp"
+
+#include "../../../TOOLS/pdevectorK.hpp"
+#include "../../../TOOLS/pdeobjectiveK.hpp"
+#include "../../../TOOLS/meshreaderK.hpp"
+
+#include "src/pde_elasticityK.hpp"
+#include "src/pde_filterK.hpp"
+#include "src/filtered_compliance_robjK.hpp"
+#include "src/volume_conK.hpp"
+#include "src/obj_volumeK.hpp"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+
+int main(int argc, char *argv[]) {
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+
+  /*** Initialize communicator. ***/
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  Kokkos::ScopeGuard kokkosScope (argc, argv);
+  auto comm = Tpetra::getDefaultComm();
+  const int myRank = comm->getRank();
+  if ((iprint > 0) && (myRank == 0))
+    outStream = ROL::makePtrFromRef(std::cout);
+  else
+    outStream = ROL::makePtrFromRef(bhs);
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+    RealT tol(1e-8);
+
+    /*** Read in XML input ***/
+    std::string filename = "input_ex01.xml";
+    auto parlist = ROL::getParametersFromXmlFile(filename);
+
+    // Retrieve parameters.
+    int probDim               = parlist->sublist("Problem").get("Problem Dimension", 2);
+    const RealT volFraction   = parlist->sublist("Problem").get("Volume Fraction", 0.4);
+    const RealT initDens      = volFraction;
+    const std::string minType  = "Compliance";
+    const std::string hessAppr = "None";
+    if (probDim==3) {
+      parlist->sublist("Mesh").set("File Name","meshfiles/truss3d.txt");
+    }
+    else {
+      probDim = 2;
+      parlist->sublist("Problem").set("Problem Dimension",probDim);
+      parlist->sublist("Mesh").set("File Name","meshfiles/truss2d.txt");
+    }
+    parlist->sublist("Problem").set("Use Inexact Linear Solves",true);
+    parlist->sublist("General").set("Inexact Objective Function",true);
+    parlist->sublist("General").set("Inexact Gradient",true);
+
+    /*** Initialize main data structure. ***/
+    int nProcs = comm->getSize();
+    if (nProcs == 1) nProcs = 0;
+    auto meshMgr = ROL::makePtr<MeshReader<RealT,DeviceT>>(*parlist,nProcs);
+    // Initialize PDE describing elasticity equations.
+    auto pde = ROL::makePtr<PDE_Elasticity<RealT,DeviceT>>(*parlist);
+
+    // Initialize the filter PDE.
+    auto pdeFilter = ROL::makePtr<PDE_Filter<RealT,DeviceT>>(*parlist);
+    pde->setDensityFields(pdeFilter->getFields());
+
+    // Initialize reduced compliance objective function.
+    auto robj_com = ROL::makePtr<TopOptFilteredComplianceObjective<RealT,DeviceT>>(
+                    pde,pdeFilter,meshMgr,comm,*parlist,*outStream);
+    auto assembler = robj_com->getAssembler();
+    auto assemblerFilter = robj_com->getFilterAssembler();
+
+    // Create vectors.
+    auto u_ptr = assembler->createStateVector();         u_ptr->putScalar(0.0);
+    auto p_ptr = assembler->createStateVector();         p_ptr->putScalar(0.0);
+    auto r_ptr = assembler->createResidualVector();      r_ptr->putScalar(0.0);
+    auto z_ptr = assemblerFilter->createControlVector(); z_ptr->putScalar(1.0);
+    auto up = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(u_ptr,pde,assembler,*parlist);
+    auto pp = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(p_ptr,pde,assembler,*parlist);
+    auto rp = ROL::makePtr<PDE_DualSimVector<RealT,DeviceT>>(r_ptr,pde,assembler,*parlist);
+    auto zp = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(z_ptr,pdeFilter,assemblerFilter,*parlist);
+    auto imul = ROL::makePtr<ROL::SingletonVector<RealT>>(0);
+
+    // Build volume objective function.
+    auto qoi_vol = ROL::makePtr<QoI_Volume_TopoOpt<RealT,DeviceT>>(pdeFilter->getDensityFE(),volFraction);
+    auto con_vol = ROL::makePtr<TopOptVolumeConstraint<RealT,DeviceT>>(qoi_vol,assemblerFilter,zp);
+
+    // Normalize compliance objective function
+    zp->setScalar(initDens);
+    RealT cs = ROL::dynamicPtrCast<TopOptFilteredComplianceObjective<RealT,DeviceT>>(robj_com)->normalize(*zp,tol);
+
+    // Output problem details
+    *outStream << std::endl;
+    *outStream << "Problem Data"          << std::endl;
+    *outStream << "  Dimension:         " << probDim << std::endl;
+    *outStream << "  SIMP Power:        "
+               << parlist->sublist("Problem").get("SIMP Power",3.0) << std::endl;
+    *outStream << "  Volume Fraction:   " << volFraction << std::endl;
+    *outStream << "  Initial Density:   " << initDens    << std::endl;
+    *outStream << "  Compliance Scale:  " << cs  << std::endl;
+    *outStream << std::endl;
+
+    // Create objective, constraint, multiplier and bounds
+
+    // Initialize bound constraints.
+    RealT lval = 0.0, uval = 1.0;
+    auto lop = zp->clone(); lop->setScalar(lval);
+    auto hip = zp->clone(); hip->setScalar(uval);
+    auto bnd = ROL::makePtr<ROL::Bounds<RealT>>(lop,hip);
+
+    // Set up optimization problem.
+    auto prob = ROL::makePtr<ROL::Problem<RealT>>(robj_com,zp);
+    prob->addBoundConstraint(bnd);
+    prob->addLinearConstraint("Volume",con_vol,imul);
+    prob->setProjectionAlgorithm(*parlist);
+    prob->finalize(false,true,*outStream);
+
+    // Check derivatives.
+    bool derivCheck = parlist->sublist("Problem").get("Check derivatives",false);
+    if (derivCheck) prob->check(true,*outStream);
+
+    // Solve optimization problem.
+    ROL::Ptr<ROL::Solver<RealT>> solver;
+
+    // TRSPG
+    zp->setScalar(initDens);
+    imul->zero();
+    robj_com->reset();
+    con_vol->reset();
+    parlist->sublist("Step").set("Type","Trust Region");
+    parlist->sublist("Step").sublist("Trust Region").set("Subproblem Model","SPG");
+    Teuchos::Time trspgTimer("TRSPG Time", true);
+    solver = ROL::makePtr<ROL::Solver<RealT>>(prob,*parlist);
+    solver->solve(*outStream);
+    trspgTimer.stop();
+    *outStream << "Total optimization time = " << trspgTimer.totalElapsedTime() << " seconds.\n";
+    // Output.
+    robj_com->summarize(*outStream);
+    con_vol->summarize(*outStream);
+    robj_com->printToFile(*zp,*outStream);
+
+    // Get a summary from the time monitor.
+    Teuchos::TimeMonitor::summarize();
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/src/compliance_robjK.hpp b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/src/compliance_robjK.hpp
new file mode 100644
index 000000000000..4ee134b8134f
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/src/compliance_robjK.hpp
@@ -0,0 +1,631 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+#ifndef TOPOT_COMPLIANCE_ROBJK_H
+#define TOPOT_COMPLIANCE_ROBJK_H
+
+#include "ROL_Objective.hpp"
+#include "ROL_VectorController.hpp"
+#include "ROL_SecantFactory.hpp"
+#include "../../../../TOOLS/pdeK.hpp"
+#include "../../../../TOOLS/assemblerK.hpp"
+#include "../../../../TOOLS/solver.hpp"
+#include "../../../../TOOLS/pdevectorK.hpp"
+
+template <class Real, class DeviceType>
+class TopOptComplianceObjective : public ROL::Objective<Real> {
+private:
+  // Elasticity PDE
+  const ROL::Ptr<PDE<Real,DeviceType>> pde_;
+  ROL::Ptr<Assembler<Real,DeviceType>> assembler_;
+  ROL::Ptr<Solver<Real>> solver_, solver_cache_, solver_tmp_;
+  ROL::Ptr<Tpetra::CrsMatrix<>> matJ1_, matJ1_cache_, matJ1_tmp_;
+  ROL::Ptr<Tpetra::CrsMatrix<>> matJ2_, matJ2_cache_, matJ2_tmp_;
+  ROL::Ptr<Tpetra::CrsMatrix<>> matH22_;
+  bool assembleJ1_, assembleJ2_, assembleH22_, assembleF_;
+  ROL::ParameterList solverList_;
+  Real tol_, maxtol_;
+  bool useInexact_;
+
+  // Output information
+  const std::string uname_, dname_;
+
+  // Vector Storage
+  ROL::Ptr<Tpetra::MultiVector<>> F_data_, state_sens_data_;
+  ROL::Ptr<Tpetra::MultiVector<>> dstat_data_, dctrl_data_;
+  ROL::Ptr<ROL::Vector<Real>> state_, dctrl_;
+  ROL::Ptr<ROL::VectorController<Real>> stateStore_;
+
+  bool storage_;
+  Real cmpScale_;
+  bool nuke_;
+  int printFreq_;
+
+  unsigned nstat_, nsens_;
+  unsigned nupda_, nfval_, ngrad_, nhess_, nprec_;
+  unsigned napJ1_, nasJ1_, napJ2_, nasJ2_, napH2_, nasH2_, nasLo_;
+
+  // Hessian Approximation Data
+  std::string hessAppr_;
+  Real H2scale_;
+  ROL::Ptr<ROL::Secant<Real>> secant_;
+  ROL::Ptr<Tpetra::MultiVector<>> s_data_;
+  ROL::Ptr<ROL::Vector<Real>> g_new_, g_old_, s_, g_;
+  bool computeGrad_;
+
+public:
+  TopOptComplianceObjective(
+      const ROL::Ptr<PDE<Real,DeviceType>>         &pde,
+      const ROL::Ptr<MeshManager<Real,DeviceType>> &mesh,
+      const ROL::Ptr<const Teuchos::Comm<int>>     &comm,
+      ROL::ParameterList                           &list,
+      std::ostream                                 &stream = std::cout,
+      std::string                                   uname  = "state",
+      std::string                                   dname  = "density")
+    : pde_(pde), assembleJ1_(true), assembleJ2_(true),
+      assembleH22_(true), assembleF_(true), tol_(1e-2), maxtol_(1e-2),
+      uname_(uname), dname_(dname),
+      nstat_(0), nsens_(0),
+      nupda_(0), nfval_(0), ngrad_(0), nhess_(0), nprec_(0),
+      napJ1_(0), nasJ1_(0), napJ2_(0), nasJ2_(0), napH2_(0), nasH2_(0), nasLo_(0), H2scale_(1) {
+    // Elasticity PDE
+    assembler_ = ROL::makePtr<Assembler<Real,DeviceType>>(pde_->getFields(),
+                                                          pde_->getFields2(),
+                                                          mesh,comm,list,stream);
+    assembler_->setCellNodes(*pde_);
+    solver_       = ROL::makePtr<Solver<Real>>(list.sublist("Solver"));
+    solver_cache_ = ROL::makePtr<Solver<Real>>(list.sublist("Solver"));
+    solverList_   = list.sublist("Solver");
+    useInexact_   = list.sublist("Problem").get("Use Inexact Linear Solves",false);
+
+    // Vector storage
+    F_data_           = assembler_->createResidualVector();
+    state_sens_data_  = assembler_->createStateVector();
+    dstat_data_       = assembler_->createResidualVector();
+    dctrl_data_       = assembler_->createControlVector();
+    state_            = ROL::makePtr<PDE_PrimalSimVector<Real,DeviceType>>(assembler_->createStateVector(),pde_,assembler_,list);
+    dctrl_            = ROL::makePtr<PDE_DualOptVector<Real,DeviceType>>(dctrl_data_,pde_,assembler_,list);
+    stateStore_       = ROL::makePtr<ROL::VectorController<Real>>();
+
+    storage_   = list.sublist("Problem").get("Use state storage", true);
+    cmpScale_  = list.sublist("Problem").get("Compliance Scaling", 1.0);
+    hessAppr_  = list.sublist("Problem").get("Hessian Approximation", "None");
+    nuke_      = list.sublist("Problem").get("Use Basic Update",false);
+    printFreq_ = list.sublist("Problem").get("Output Frequency",0);
+
+    computeGrad_ = true;
+    if (hessAppr_ == "Secant") {
+      s_data_ = assembler_->createControlVector();
+      s_      = ROL::makePtr<PDE_PrimalOptVector<Real,DeviceType>>(s_data_,pde_,assembler_,list);
+      g_      = ROL::makePtr<PDE_DualOptVector<Real,DeviceType>>(assembler_->createControlVector(),pde_,assembler_,list);
+      g_old_  = ROL::makePtr<PDE_DualOptVector<Real,DeviceType>>(assembler_->createControlVector(),pde_,assembler_,list);
+      g_new_  = ROL::makePtr<PDE_DualOptVector<Real,DeviceType>>(assembler_->createControlVector(),pde_,assembler_,list);
+      secant_ = ROL::SecantFactory<Real>(list);
+    }
+
+    initialize();
+  }
+
+  const ROL::Ptr<Assembler<Real,DeviceType>> getAssembler(void) const {
+    return assembler_;
+  }
+
+  void update(const ROL::Vector<Real> &z, ROL::UpdateType type, int iter = -1) {
+    nupda_++;
+    stateStore_->objectiveUpdate(type);
+    if (nuke_) {
+      update_temp(z,iter);
+    }
+    else {
+      switch (type) {
+        case ROL::UpdateType::Initial: update_initial(z,iter); break;
+        case ROL::UpdateType::Accept:  update_accept(z,iter);  break;
+        case ROL::UpdateType::Revert:  update_revert(z,iter);  break;
+        case ROL::UpdateType::Trial:   update_trial(z,iter);   break;
+        case ROL::UpdateType::Temp:    update_temp(z,iter);    break;
+      }
+    }
+    // Print
+    if (printFreq_ > 0 && iter%printFreq_ == 0) {
+      std::stringstream ufile, zfile;
+      ufile  << uname_ << "_" << iter << ".txt";
+      zfile  << dname_ << "_" << iter << ".txt";
+      ROL::Ptr<const Tpetra::MultiVector<>> z_data = getConstField(z);
+      ROL::Ptr<const Tpetra::MultiVector<>> u_data = getConstField(*state_);
+      assembler_->outputTpetraVector(u_data, ufile.str());
+      assembler_->outputTpetraVector(z_data, zfile.str());
+    }
+  }
+
+  void update( const ROL::Vector<Real> &z, bool flag = true, int iter = -1 ) {
+    nupda_++;
+    stateStore_->objectiveUpdate(true);
+    if (nuke_) {
+      update_temp(z,iter);
+    }
+    else {
+      // Trial:    flag = false, iter = -1
+      // Check:    flag = true,  iter = -1
+      // Reject:   flag = false, iter > -1
+      // Accept:   flag = true,  iter > -1
+      if (flag) {
+        if (iter > -1) {
+          update_accept(z,iter);
+        }
+        else {
+          update_temp(z,iter);
+        }
+      }
+      else {
+        if (iter > -1) {
+          update_revert(z,iter);
+        }
+        else {
+          update_trial(z,iter);
+        }
+      }
+    }
+  }
+
+  Real normalize(const ROL::Vector<Real> &z, Real &tol) {
+    update(z,ROL::UpdateType::Temp);
+    Real val = value(z,tol);
+    cmpScale_ /= val;
+    return cmpScale_;
+  }
+
+  void printToFile(const ROL::Vector<Real> &z, std::ostream &stream = std::cout,
+                   const std::string ufile = "state.txt", const std::string dfile = "density.txt") {
+    Real tol(1e-2*std::sqrt(ROL::ROL_EPSILON<Real>()));
+    update(z,ROL::UpdateType::Temp);
+    ROL::Ptr<const Tpetra::MultiVector<>> z_data = getConstField(z);
+    ROL::Ptr<Tpetra::MultiVector<>>       u_data = getField(*state_);
+    solve_state_equation(u_data, z_data, tol);
+    assembler_->outputTpetraVector(u_data, ufile);
+    assembler_->outputTpetraVector(z_data, dfile);
+    assembler_->printMeshData(stream);
+  }
+
+  void solveState(ROL::Vector<Real> &u, const ROL::Vector<Real> &z) {
+    Real tol(1e-2*std::sqrt(ROL::ROL_EPSILON<Real>()));
+    update(z,ROL::UpdateType::Temp);
+    ROL::Ptr<const Tpetra::MultiVector<>> z_data = getConstField(z);
+    ROL::Ptr<Tpetra::MultiVector<>>       u_data = getField(u);
+    solve_state_equation(u_data, z_data, tol);
+  }
+
+  Real value( const ROL::Vector<Real> &z, Real &tol ) {
+    nfval_++;
+    ROL::Ptr<const Tpetra::MultiVector<>> z_data = getConstField(z);
+    ROL::Ptr<Tpetra::MultiVector<>>       u_data = getField(*state_);
+    // Solve state equation
+    solve_state_equation(*state_,z,tol);
+    // Compute compliance
+    Teuchos::Array<Real> cmp(1,0);
+    F_data_->dot(*u_data, cmp.view(0,1));
+    return cmpScale_*cmp[0];
+  }
+
+  void gradient( ROL::Vector<Real> &g, const ROL::Vector<Real> &z, Real &tol ) {
+    ngrad_++;
+    if (hessAppr_ == "Secant") {
+      // Will not work for stochastic problems
+      if (computeGrad_) {
+        ROL::Ptr<Tpetra::MultiVector<>>       g_data = getField(g);
+        ROL::Ptr<const Tpetra::MultiVector<>> z_data = getConstField(z);
+        ROL::Ptr<Tpetra::MultiVector<>>       u_data = getField(*state_);
+        // Solve state equation
+        solve_state_equation(*state_,z,tol);
+        // Apply adjoint density jacobian
+        assembleJ2(u_data, z_data);
+        applyJacobian2(g_data, u_data, true);
+        g_data->scale(-cmpScale_);
+
+        g_->set(g);
+        computeGrad_ = false;
+      }
+      else {
+        g.set(*g_);
+      }
+    }
+    else {
+      ROL::Ptr<Tpetra::MultiVector<>>       g_data = getField(g);
+      ROL::Ptr<const Tpetra::MultiVector<>> z_data = getConstField(z);
+      ROL::Ptr<Tpetra::MultiVector<>>       u_data = getField(*state_);
+      // Solve state equation
+      solve_state_equation(*state_,z,tol);
+      // Apply adjoint density jacobian
+      assembleJ2(u_data, z_data);
+      applyJacobian2(g_data, u_data, true);
+      g_data->scale(-cmpScale_);
+    }
+  }
+
+  void hessVec( ROL::Vector<Real> &hv, const ROL::Vector<Real> &v, const ROL::Vector<Real> &z, Real &tol ) {
+    nhess_++;
+    if (hessAppr_ == "Zero") {
+      hv.zero();
+    }
+    else if (hessAppr_ == "Positive Riesz") {
+      hv.set(v.dual());
+    }
+    else if (hessAppr_ == "Negative Riesz") {
+      hv.set(v.dual());
+      hv.scale(static_cast<Real>(-1));
+    }
+    else if (hessAppr_ == "Negative") {
+      ROL::Ptr<Tpetra::MultiVector<>>       h_data = getField(hv);
+      ROL::Ptr<const Tpetra::MultiVector<>> v_data = getConstField(v);
+      ROL::Ptr<const Tpetra::MultiVector<>> z_data = getConstField(z);
+      ROL::Ptr<Tpetra::MultiVector<>>       u_data = getField(*state_);
+      // Solve state equation
+      solve_state_equation(*state_,z,tol_);
+      // Apply density hessian to direction
+      assembleH22(u_data, z_data);
+      applyHessian22(h_data, v_data);
+      h_data->scale(-cmpScale_);
+    }
+    else if (hessAppr_ == "Positive") {
+      ROL::Ptr<Tpetra::MultiVector<>>       h_data = getField(hv);
+      ROL::Ptr<const Tpetra::MultiVector<>> v_data = getConstField(v);
+      ROL::Ptr<const Tpetra::MultiVector<>> z_data = getConstField(z);
+      ROL::Ptr<Tpetra::MultiVector<>>       u_data = getField(*state_);
+      // Solve state equation
+      solve_state_equation(*state_,z,tol_);
+      // Solve state sensitivity equation
+      assembleJ2(u_data, z_data);
+      solve_state_sensitivity(state_sens_data_, v_data, tol_);
+      // Apply adjoint density jacobian to state sensitivity
+      applyJacobian2(h_data, state_sens_data_, true);
+      h_data->scale(static_cast<Real>(2)*cmpScale_);
+    }
+    else if (hessAppr_ == "No Solve") {
+      ROL::Ptr<Tpetra::MultiVector<>>       h_data = getField(hv);
+      ROL::Ptr<const Tpetra::MultiVector<>> v_data = getConstField(v);
+      ROL::Ptr<const Tpetra::MultiVector<>> z_data = getConstField(z);
+      ROL::Ptr<Tpetra::MultiVector<>>       u_data = getField(*state_);
+      // Solve state equation
+      solve_state_equation(*state_,z,tol_);
+      // Apply density hessian to direction
+      assembleH22(u_data, z_data);
+      applyHessian22(h_data, v_data);
+      // Approximate positive term
+      assembleJ2(u_data, z_data);
+      applyJacobian2(dstat_data_, v_data, false);
+      applyJacobian2(dctrl_data_, dstat_data_, true);
+      // Approximation of inv(K) by aI where a solves min||aF-U||
+      // or b=1/a solves min||F-bU||
+      Teuchos::Array<Real> UdotF(1,0), UdotU(1,0);
+      u_data->dot(*F_data_,UdotF.view(0,1));
+      F_data_->dot(*F_data_,UdotU.view(0,1));
+      H2scale_ = UdotF[0]/UdotU[0];
+      //u_data->dot(*u_data,UdotU.view(0,1));
+      //H2scale_ = UdotU[0]/UdotF[0];
+      h_data->update(static_cast<Real>(2)*cmpScale_*H2scale_,*dctrl_data_,-cmpScale_);
+    }
+    else if (hessAppr_ == "Secant") {
+      // Will not work for stochastic problems
+      ROL::Ptr<Tpetra::MultiVector<>>       h_data = getField(hv);
+      ROL::Ptr<const Tpetra::MultiVector<>> v_data = getConstField(v);
+      ROL::Ptr<const Tpetra::MultiVector<>> z_data = getConstField(z);
+      ROL::Ptr<Tpetra::MultiVector<>>       u_data = getField(*state_);
+      // Solve state equation
+      solve_state_equation(*state_,z,tol_);
+      // Apply density hessian to direction
+      assembleH22(u_data, z_data);
+      applyHessian22(h_data, v_data);
+      // Apply secant approximation of positive term
+      secant_->applyB(*dctrl_, v);
+      h_data->update(static_cast<Real>(2)*cmpScale_,*dctrl_data_,-cmpScale_);
+    }
+    else {
+      ROL::Ptr<Tpetra::MultiVector<>>       h_data = getField(hv);
+      ROL::Ptr<const Tpetra::MultiVector<>> v_data = getConstField(v);
+      ROL::Ptr<const Tpetra::MultiVector<>> z_data = getConstField(z);
+      ROL::Ptr<Tpetra::MultiVector<>>       u_data = getField(*state_);
+      // Solve state equation
+      solve_state_equation(*state_,z,tol_);
+      // Solve state sensitivity equation
+      assembleJ2(u_data, z_data);
+      solve_state_sensitivity(state_sens_data_, v_data, tol_);
+      // Apply density hessian to direction
+      assembleH22(u_data, z_data);
+      applyHessian22(h_data, v_data);
+      // Apply adjoint density jacobian to state sensitivity
+      applyJacobian2(dctrl_data_, state_sens_data_, true);
+      h_data->update(static_cast<Real>(2)*cmpScale_,*dctrl_data_,-cmpScale_);
+    }
+  }
+
+  /** \brief Apply a reduced Hessian preconditioner.
+  */
+  virtual void precond( ROL::Vector<Real> &Pv, const ROL::Vector<Real> &v, const ROL::Vector<Real> &z, Real &tol ) {
+    nprec_++;
+    Pv.set(v.dual());
+  }
+
+  void summarize(std::ostream &stream) const {
+    stream << std::endl;
+    stream << std::string(114,'=') << std::endl;
+    stream << "  Objective_Compliance::summarize" << std::endl;
+    stream << "    Number of calls to update:          " << nupda_ << std::endl;
+    stream << "    Number of calls to value:           " << nfval_ << std::endl;
+    stream << "    Number of calls to gradient:        " << ngrad_ << std::endl;
+    stream << "    Number of calls to hessVec:         " << nhess_ << std::endl;
+    stream << "    Number of calls to precond:         " << nprec_ << std::endl;
+    stream << "    Number of state solves:             " << nstat_ << std::endl;
+    stream << "    Number of state sensitivity solves: " << nsens_ << std::endl;
+    stream << "    Number of load assemblies:          " << nasLo_ << std::endl;
+    stream << "    Number of Jacobian_1 assemblies:    " << nasJ1_ << std::endl;
+    stream << "    Number of Jacobian_1 sovles:        " << napJ1_ << std::endl;
+    stream << "    Number of Jacobian_2 assemblies:    " << nasJ2_ << std::endl;
+    stream << "    Number of Jacobian_2 applies:       " << napJ2_ << std::endl;
+    stream << "    Number of Hessian_22 assemblies:    " << nasH2_ << std::endl;
+    stream << "    Number of Hessian_22 applies:       " << napH2_ << std::endl;
+    stream << std::string(114,'=') << std::endl;
+    stream << std::endl;
+  }
+
+  void reset(void) {
+    nupda_ = 0; nfval_ = 0; ngrad_ = 0; nhess_ = 0; nprec_ = 0; nstat_ = 0; nsens_ = 0;
+    nasLo_ = 0; nasJ1_ = 0; napJ1_ = 0; nasJ2_ = 0; napJ2_ = 0; nasH2_ = 0; napH2_ = 0;
+  }
+
+// For parametrized (stochastic) objective functions and constraints
+public:
+  void setParameter(const std::vector<Real> &param) {
+    ROL::Objective<Real>::setParameter(param);
+    pde_->setParameter(param);
+    assembleF_ = true;
+  }
+
+private:
+
+  ROL::Ptr<const Tpetra::MultiVector<>> getConstField(const ROL::Vector<Real> &x) const {
+    ROL::Ptr<const Tpetra::MultiVector<>> xp;
+    try {
+      xp = dynamic_cast<const ROL::TpetraMultiVector<Real>&>(x).getVector();
+    }
+    catch (std::exception &e) {
+      try {
+        ROL::Ptr<const ROL::TpetraMultiVector<Real>> xvec
+          = dynamic_cast<const PDE_OptVector<Real>&>(x).getField();
+        if (xvec == ROL::nullPtr) {
+          xp = ROL::nullPtr;
+        }
+        else {
+          xp = xvec->getVector();
+        }
+      }
+      catch (std::exception &ee) {
+        xp = ROL::nullPtr;
+      }
+    }
+    return xp;
+  }
+
+  ROL::Ptr<Tpetra::MultiVector<>> getField(ROL::Vector<Real> &x) const {
+    ROL::Ptr<Tpetra::MultiVector<>> xp;
+    try {
+      xp = dynamic_cast<ROL::TpetraMultiVector<Real>&>(x).getVector();
+    }
+    catch (std::exception &e) {
+      try {
+        ROL::Ptr<ROL::TpetraMultiVector<Real>> xvec
+          = dynamic_cast<PDE_OptVector<Real>&>(x).getField();
+        if ( xvec == ROL::nullPtr ) {
+          xp = ROL::nullPtr;
+        }
+        else {
+          xp = xvec->getVector();
+        }
+      }
+      catch (std::exception &ee) {
+        xp = ROL::nullPtr;
+      }
+    }
+    return xp;
+  }
+
+  void initialize(void) {
+    ROL::Ptr<Tpetra::MultiVector<>> u_data = getField(*state_);
+    u_data->putScalar(0.0);
+    dctrl_data_->putScalar(1.0);
+    assembleJ1_     = true;
+    assembleJ2_     = true;
+    assembleJ1(u_data, dctrl_data_,false);
+    assembleJ2(u_data, dctrl_data_);
+    matJ1_tmp_      = matJ1_;
+    matJ2_tmp_      = matJ2_;
+    matJ1_          = matJ1_cache_;
+    matJ2_          = matJ2_cache_;
+    assembleJ1_     = true;
+    assembleJ2_     = true;
+    assembleJ1(u_data, dctrl_data_,false);
+    assembleJ2(u_data, dctrl_data_);
+    matJ1_cache_    = matJ1_;
+    matJ2_cache_    = matJ2_;
+    solver_tmp_     = solver_;
+    solver_         = solver_cache_;
+    assembleJ1_     = true;
+    assembleJ2_     = true;
+    // Upon exit, matJ1, matJ2 and solver are set to their cached versions
+  }
+
+  // Assemble load vector
+  void assembleF(void) {
+    if (assembleF_) {
+      nasLo_++;
+      ROL::Ptr<Tpetra::MultiVector<>> u0 = assembler_->createStateVector();
+      ROL::Ptr<Tpetra::MultiVector<>> z1 = assembler_->createControlVector();
+      u0->putScalar(0.0);
+      z1->putScalar(1.0);
+      assembler_->assemblePDEResidual(F_data_,pde_,u0,z1);
+      F_data_->scale(static_cast<Real>(-1));
+      assembleF_ = false;
+    }
+  }
+
+  void assembleJ2(const ROL::Ptr<const Tpetra::MultiVector<>> &u, const ROL::Ptr<const Tpetra::MultiVector<>> &z) {
+    if (assembleJ2_) {
+      nasJ2_++;
+      assembler_->assemblePDEJacobian2(matJ2_,pde_,u,z);
+      assembleJ2_ = false;
+    }
+  }
+
+  void applyJacobian2(ROL::Ptr<Tpetra::MultiVector<>> &Jv, const ROL::Ptr<const Tpetra::MultiVector<>> &v,
+                      const bool transpose) {
+    napJ2_++;
+    if (transpose) {
+      matJ2_->apply(*v,*Jv,Teuchos::TRANS);
+    }
+    else {
+      matJ2_->apply(*v,*Jv);
+    }
+  }
+
+  void assembleH22(const ROL::Ptr<const Tpetra::MultiVector<>> &u, const ROL::Ptr<const Tpetra::MultiVector<>> &z) {
+    if (assembleH22_) {
+      nasH2_++;
+      assembler_->assemblePDEHessian22(matH22_,pde_,u,u,z);
+      assembleH22_ = false;
+    }
+  }
+
+  void applyHessian22(ROL::Ptr<Tpetra::MultiVector<>> &Hv, const ROL::Ptr<const Tpetra::MultiVector<>> &v) {
+    matH22_->apply(*v,*Hv);
+    napH2_++;
+  }
+
+  void assembleJ1(const ROL::Ptr<const Tpetra::MultiVector<>> &u, const ROL::Ptr<const Tpetra::MultiVector<>> &z, bool setSolve = true) {
+    if (assembleJ1_) {
+      nasJ1_++;
+      assembler_->assemblePDEJacobian1(matJ1_,pde_,u,z);
+      if (setSolve) {
+        solver_->setA(matJ1_);
+      }
+      assembleJ1_ = false;
+    }
+  }
+
+  void solve(ROL::Ptr<Tpetra::MultiVector<>> &x, const ROL::Ptr<const Tpetra::MultiVector<>> &b, Real &tol) {
+    napJ1_++;
+    if (useInexact_) {
+      solverList_.sublist("Belos").set("Convergence Tolerance",tol);
+      solver_->setParameters(solverList_);
+    }
+    solver_->solve(x,b,false);
+  }
+
+  void solve_state_equation(ROL::Vector<Real> &u, const ROL::Vector<Real> &z, Real &tol) {
+    bool isComputed = false;
+    Real mytol = std::min(maxtol_,tol);
+    if (storage_ && tol_ <= mytol) {
+      isComputed = stateStore_->get(u, ROL::Objective<Real>::getParameter());
+    }
+    if (!isComputed || !storage_ || tol_ > mytol) {
+      ROL::Ptr<const Tpetra::MultiVector<>> z_data = getConstField(z);
+      ROL::Ptr<Tpetra::MultiVector<>>       u_data = getField(u);
+      solve_state_equation(u_data,z_data,mytol);
+      tol_ = mytol;
+      if (storage_) {
+        stateStore_->set(u, ROL::Objective<Real>::getParameter());
+      }
+    }
+  }
+
+  void solve_state_equation(ROL::Ptr<Tpetra::MultiVector<>> &u, const ROL::Ptr<const Tpetra::MultiVector<>> &z, Real &tol) {
+    nstat_++;
+    assembleF(); assembleJ1(u,z,true);
+    solve(u,F_data_,tol);
+  }
+
+  void solve_state_sensitivity(ROL::Ptr<Tpetra::MultiVector<>> &s, const ROL::Ptr<const Tpetra::MultiVector<>> &v, Real &tol) {
+    // This assumes that assembleJ1 and assemble J2 were called
+    nsens_++;
+    applyJacobian2(dstat_data_, v, false);
+    solve(s, dstat_data_, tol);
+  }
+
+  void update_initial( const ROL::Vector<Real> &z, int iter ) {
+    assembleJ1_   = true;
+    assembleJ2_   = true;
+    assembleH22_  = true;
+
+    // For secant approximation
+    if (hessAppr_ == "Secant") {
+      computeGrad_ = true;
+      const Real half(0.5), one(1);
+      Real tol(std::sqrt(ROL::ROL_EPSILON<Real>()));
+      gradient(*g_old_,z,tol);       // g_old = J'(z)
+      g_old_->scale(half/cmpScale_); // g_old = J'(z)/2c
+      s_->set(z); s_->scale(-one);   // s     = -z
+    }
+  }
+
+  void update_accept( const ROL::Vector<Real> &z, int iter ) {
+    // Push trial state storage into base state storage.
+    solver_tmp_       = solver_cache_;
+    solver_cache_     = solver_;
+    matJ1_tmp_        = matJ1_cache_;
+    matJ1_cache_      = matJ1_;
+    matJ2_tmp_        = matJ2_cache_;
+    matJ2_cache_      = matJ2_;
+    assembleJ1_       = false;
+    // If assembleJ2 was not called, then assembleJ2_ = true
+    assembleH22_      = true;
+
+    // For secant update
+    // Secant Equation: B s = 1/2c(J'(z) - (J'(z_old) + H s))
+    if (hessAppr_ == "Secant") {
+      computeGrad_ = true;
+      const Real half(0.5), one(1);
+      Real tol(std::sqrt(ROL::ROL_EPSILON<Real>()));
+      s_->plus(z);                         // s     = z - z_old
+      applyHessian22(dctrl_data_,s_data_); // dctrl = -H s
+      g_old_->axpy(-half,*dctrl_);         // g_old = (J'(z_old) + cH s)/2c
+      gradient(*g_new_,z,tol);             // gnew  = J'(z)
+      g_new_->scale(half/cmpScale_);       // gnew  = J'(z)/2c
+      secant_->updateStorage(z,*g_new_,*g_old_,*s_,s_->norm(),iter);
+      s_->set(z); s_->scale(-one);         // s     = -z
+      g_old_->set(*g_new_);                // g_old = J'(z)/2c
+    }
+  }
+
+  void update_temp( const ROL::Vector<Real> &z, int iter ) {
+    assembleJ1_     = true;
+    assembleJ2_     = true;
+    assembleH22_    = true;
+    if (hessAppr_ == "Secant") {
+      computeGrad_    = true;
+    }
+  }
+
+  void update_trial( const ROL::Vector<Real> &z, int iter ) {
+    solver_       = solver_tmp_;
+    matJ1_        = matJ1_tmp_;
+    matJ2_        = matJ2_tmp_;
+    assembleJ1_   = true;
+    assembleJ2_   = true;
+    assembleH22_  = false;
+  }
+
+  void update_revert( const ROL::Vector<Real> &z, int iter ) {
+    solver_       = solver_cache_;
+    matJ1_        = matJ1_cache_;
+    matJ2_        = matJ2_cache_;
+    assembleJ1_   = false;
+    // If assembleJ2 was not called, then assembleJ2_ = true
+    assembleH22_  = false;
+    // If gradient was not called, then computeGrad_ = true
+  }
+}; // class TopOptComplianceObjective
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/src/dirichletK.hpp b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/src/dirichletK.hpp
new file mode 100644
index 000000000000..d975753d2930
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/src/dirichletK.hpp
@@ -0,0 +1,191 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  dirichlet.hpp
+    \brief Implements Dirichlet boundary conditions for the structural
+           topology optimization problem.
+*/
+
+#ifndef ROL_PDEOPT_ELASTICITY_DIRICHLETK_HPP
+#define ROL_PDEOPT_ELASTICITY_DIRICHLETK_HPP
+
+#include <vector>
+#include "ROL_Ptr.hpp"
+#include "ROL_ParameterList.hpp"
+#include "../../../../TOOLS/feK.hpp"
+
+template<class Real, class DeviceType>
+class Dirichlet {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  std::vector<int> sidesets_, types_;
+  std::vector<std::vector<scalar_view>> bdryCellNodes_;
+  std::vector<std::vector<std::vector<int>>> bdryCellLocIds_;
+  std::vector<std::vector<int>> fidx_;
+
+  std::vector<int> getConstrainedDimensions(int type, int dim) const {
+    std::vector<int> conDim;
+    if ( type == 1 ) {
+      conDim.push_back(0);
+    }
+    else if ( type == 2 ) {
+      conDim.push_back(1);
+    }
+    else if ( type == 3 && dim > 2 ) {
+      conDim.push_back(2);
+    }
+    else if ( type == 4 && dim > 2 ) {
+      conDim.push_back(0);
+      conDim.push_back(1);
+    }
+    else if ( type == 5 && dim > 2 ) {
+      conDim.push_back(0);
+      conDim.push_back(2);
+    }
+    else if ( type == 6 && dim > 2 ) {
+      conDim.push_back(1);
+      conDim.push_back(2);
+    }
+    else {
+      for (int i = 0; i < dim; ++i) {
+        conDim.push_back(i);
+      }
+    }
+    return conDim;
+  }
+
+public:
+  Dirichlet(void) {
+    sidesets_.push_back(0);
+    types_.push_back(0);
+  }
+
+  void setCellNodes(const std::vector<std::vector<scalar_view>> &bdryCellNodes,
+                    const std::vector<std::vector<std::vector<int>>> &bdryCellLocIds,
+                    const std::vector<std::vector<int>> fidx) {
+    bdryCellNodes_  = bdryCellNodes;
+    bdryCellLocIds_ = bdryCellLocIds;
+    fidx_           = fidx;
+  }
+
+  void applyResidual(std::vector<scalar_view> &R,
+                     const std::vector<scalar_view> &U) const {
+    const int d = R.size();
+    const int numSideSets = sidesets_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        const std::vector<int> conDim = getConstrainedDimensions(types_[i],d);
+        const int numConDim = conDim.size();
+        const int numLocalSideIds = bdryCellLocIds_[sidesets_[i]].size();
+        for (int j = 0; j < numLocalSideIds; ++j) {
+          const int numCellsSide = bdryCellLocIds_[sidesets_[i]][j].size();
+          const int numBdryDofs = fidx_[j].size();
+          for (int k = 0; k < numCellsSide; ++k) {
+            const int cidx = bdryCellLocIds_[sidesets_[i]][j][k];
+            for (int l = 0; l < numBdryDofs; ++l) {
+              for (int m=0; m < numConDim; ++m) {
+                (R[conDim[m]])(cidx,fidx_[j][l]) = (U[conDim[m]])(cidx,fidx_[j][l]);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  void applyJacobian1(std::vector<std::vector<scalar_view>> &J) const {
+    const int d = J.size();
+    const int f = J[0][0].extent_int(1);
+    const int numSideSets = sidesets_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        const std::vector<int> conDim = getConstrainedDimensions(types_[i],d);
+        const int numConDim = conDim.size();
+        const int numLocalSideIds = bdryCellLocIds_[sidesets_[i]].size();
+        for (int j = 0; j < numLocalSideIds; ++j) {
+          const int numCellsSide = bdryCellLocIds_[sidesets_[i]][j].size();
+          const int numBdryDofs = fidx_[j].size();
+          for (int k = 0; k < numCellsSide; ++k) {
+            const int cidx = bdryCellLocIds_[sidesets_[i]][j][k];
+            for (int l = 0; l < numBdryDofs; ++l) {
+              for (int m=0; m < f; ++m) {
+                for (int n=0; n < numConDim; ++n) {
+                  for (int p=0; p < d; ++p) {
+                    (J[conDim[n]][p])(cidx,fidx_[j][l],m) = static_cast<Real>(0);
+                  }
+                  (J[conDim[n]][conDim[n]])(cidx,fidx_[j][l],fidx_[j][l]) = static_cast<Real>(1);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  void applyJacobian2(std::vector<std::vector<scalar_view>> &J) const {
+    const int d = J.size();
+    const int f = J[0][0].extent_int(2);
+    const int numSideSets = sidesets_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        const std::vector<int> conDim = getConstrainedDimensions(types_[i],d);
+        const int numConDim = conDim.size();
+        const int numLocalSideIds = bdryCellLocIds_[sidesets_[i]].size();
+        for (int j = 0; j < numLocalSideIds; ++j) {
+          const int numCellsSide = bdryCellLocIds_[sidesets_[i]][j].size();
+          const int numBdryDofs = fidx_[j].size();
+          for (int k = 0; k < numCellsSide; ++k) {
+            const int cidx = bdryCellLocIds_[sidesets_[i]][j][k];
+            for (int l = 0; l < numBdryDofs; ++l) {
+              for (int m=0; m < f; ++m) {
+                for (int n=0; n < numConDim; ++n) {
+                  (J[conDim[n]][0])(cidx,fidx_[j][l],m) = static_cast<Real>(0);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  void applyMultiplier(std::vector<scalar_view> &L) const {
+    const int d = L.size();
+    const int numSideSets = sidesets_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        const std::vector<int> conDim = getConstrainedDimensions(types_[i],d);
+        const int numConDim = conDim.size();
+        const int numLocalSideIds = bdryCellLocIds_[sidesets_[i]].size();
+        for (int j = 0; j < numLocalSideIds; ++j) {
+          const int numCellsSide = bdryCellLocIds_[sidesets_[i]][j].size();
+          const int numBdryDofs = fidx_[j].size();
+          for (int k = 0; k < numCellsSide; ++k) {
+            const int cidx = bdryCellLocIds_[sidesets_[i]][j][k];
+            for (int l = 0; l < numBdryDofs; ++l) {
+              for (int m=0; m < numConDim; ++m) {
+                (L[conDim[m]])(cidx,fidx_[j][l]) = static_cast<Real>(0);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/src/filtered_compliance_robjK.hpp b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/src/filtered_compliance_robjK.hpp
new file mode 100644
index 000000000000..0513fc28ffc2
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/src/filtered_compliance_robjK.hpp
@@ -0,0 +1,309 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+#ifndef TOPOPT_FILTERED_COMPLIANCE_ROBJK_H
+#define TOPOPT_FILTERED_COMPLIANCE_ROBJK_H
+
+#include "ROL_Objective.hpp"
+#include "../../../../TOOLS/pdeK.hpp"
+#include "../../../../TOOLS/assemblerK.hpp"
+#include "../../../../TOOLS/solver.hpp"
+#include "../../../../TOOLS/pdevectorK.hpp"
+#include "compliance_robjK.hpp"
+
+template <class Real, class DeviceType>
+class TopOptFilteredComplianceObjective : public ROL::Objective<Real> {
+private:
+  // Compliance objective function
+  ROL::Ptr<TopOptComplianceObjective<Real,DeviceType>> comp_;
+
+  // FIlter PDE
+  const ROL::Ptr<PDE<Real,DeviceType>> pde_filter_;
+  ROL::Ptr<Assembler<Real,DeviceType>> assembler_filter_;
+  ROL::Ptr<Solver<Real>> solver_filter_;
+  ROL::Ptr<Tpetra::CrsMatrix<>> matF1_, matF2_;
+
+  // Vector Storage
+  ROL::Ptr<Tpetra::MultiVector<>> f_state_, f_res_;
+  ROL::Ptr<ROL::Vector<Real>> Fz_, Fzcache_, Fv_, dctrl_;
+
+  bool nuke_;
+  int printFreq_;
+
+  unsigned nupda_, nfval_, ngrad_, nhess_, nprec_;
+  unsigned nasFi_, napFi_;
+
+public:
+  TopOptFilteredComplianceObjective(
+      const ROL::Ptr<PDE<Real,DeviceType>>         &pde,
+      const ROL::Ptr<PDE<Real,DeviceType>>         &filter,
+      const ROL::Ptr<MeshManager<Real,DeviceType>> &mesh,
+      const ROL::Ptr<const Teuchos::Comm<int>>     &comm,
+      ROL::ParameterList                           &list,
+      std::ostream                                 &stream = std::cout)
+    : pde_filter_(filter),
+      nupda_(0), nfval_(0), ngrad_(0), nhess_(0), nprec_(0),
+      nasFi_(0), napFi_(0) {
+    // Compliance objective function
+    comp_ = ROL::makePtr<TopOptComplianceObjective<Real,DeviceType>>(pde,mesh,comm,list,stream,"state","filtered_density");
+
+    // Filter PDE
+    assembler_filter_ = ROL::makePtr<Assembler<Real,DeviceType>>(pde_filter_->getFields(),
+                                                      pde_filter_->getFields2(),
+                                                      mesh,comm,list,stream);
+    assembler_filter_->setCellNodes(*pde_filter_);
+    ROL::ParameterList solverList = list.sublist("Solver");
+    solverList.set("Preconditioner","MueLu");
+    solverList.sublist("MueLu").set("number of equations",1);
+    solverList.sublist("MueLu").set("problem: symmetric",true);
+    solverList.sublist("Belos").set("Convergence Tolerance",1e-10);
+    solver_filter_    = ROL::makePtr<Solver<Real>>(solverList);
+
+    // Vector storage
+    f_state_ = assembler_filter_->createStateVector();
+    f_res_   = assembler_filter_->createResidualVector();
+    Fz_      = ROL::makePtr<PDE_PrimalSimVector<Real,DeviceType>>(assembler_filter_->createStateVector(),pde_filter_,assembler_filter_,list);
+    Fzcache_ = ROL::makePtr<PDE_PrimalSimVector<Real,DeviceType>>(assembler_filter_->createStateVector(),pde_filter_,assembler_filter_,list);
+    Fv_      = ROL::makePtr<PDE_PrimalSimVector<Real,DeviceType>>(assembler_filter_->createStateVector(),pde_filter_,assembler_filter_,list);
+    dctrl_   = ROL::makePtr<PDE_DualSimVector<Real,DeviceType>>(assembler_filter_->createStateVector(),pde_filter_,assembler_filter_,list);
+
+    nuke_      = list.sublist("Problem").get("Use Basic Update",false);
+    printFreq_ = list.sublist("Problem").get("Output Frequency",0);
+
+    assembleFilter();
+  }
+
+  const ROL::Ptr<Assembler<Real,DeviceType>> getAssembler(void) const {
+    return comp_->getAssembler();
+  }
+
+  const ROL::Ptr<Assembler<Real,DeviceType>> getFilterAssembler(void) const {
+    return assembler_filter_;
+  }
+
+  void update(const ROL::Vector<Real> &z, ROL::UpdateType type, int iter = -1) {
+    nupda_++;
+    if (nuke_) {
+      update_temp(z,iter);
+    }
+    else {
+      switch (type) {
+        case ROL::UpdateType::Initial: update_initial(z,iter); break;
+        case ROL::UpdateType::Accept:  update_accept(z,iter);  break;
+        case ROL::UpdateType::Revert:  update_revert(z,iter);  break;
+        case ROL::UpdateType::Trial:   update_trial(z,iter);   break;
+        case ROL::UpdateType::Temp:    update_temp(z,iter);   break;
+      }
+    }
+    comp_->update(*Fz_,type,iter);
+    // Print
+    if (printFreq_ > 0 && iter%printFreq_ == 0) {
+      std::stringstream dfile;
+      dfile  << "density_"          << iter << ".txt";
+      ROL::Ptr<const Tpetra::MultiVector<>> d_data = getConstField(z);
+      assembler_filter_->outputTpetraVector(d_data, dfile.str());
+    }
+  }
+
+  void update( const ROL::Vector<Real> &z, bool flag = true, int iter = -1 ) {
+    nupda_++;
+    if (nuke_) {
+      update_temp(z,iter);
+    }
+    else {
+      // Trial:    flag = false, iter = -1
+      // Check:    flag = true,  iter = -1
+      // Reject:   flag = false, iter > -1
+      // Accept:   flag = true,  iter > -1
+      if (flag) {
+        if (iter > -1) {
+          update_accept(z,iter);
+        }
+        else {
+          update_temp(z,iter);
+        }
+      }
+      else {
+        if (iter > -1) {
+          update_revert(z,iter);
+        }
+        else {
+          update_trial(z,iter);
+        }
+      }
+    }
+    comp_->update(*Fz_,flag,iter);
+  }
+
+  Real normalize(const ROL::Vector<Real> &z, Real &tol) {
+    applyFilter(*Fz_,z,false);
+    return comp_->normalize(*Fz_, tol);
+  }
+
+  void printToFile(const ROL::Vector<Real> &z, std::ostream &stream = std::cout,
+                   std::string ufile = "state.txt", std::string dfile = "density.txt",
+                   std::string ffile = "filtered_density.txt") {
+    update(z,ROL::UpdateType::Temp);
+    ROL::Ptr<const Tpetra::MultiVector<>> dens_data = getConstField(z);
+    assembler_filter_->outputTpetraVector(dens_data,  dfile);
+    comp_->printToFile(*Fz_, stream, ufile, ffile);
+  }
+
+  void solveState(ROL::Vector<Real> &u, const ROL::Vector<Real> &z) {
+    update(z,ROL::UpdateType::Temp);
+    comp_->solveState(u,*Fz_);
+  }
+
+  Real value( const ROL::Vector<Real> &z, Real &tol ) {
+    nfval_++;
+    return comp_->value(*Fz_,tol);
+  }
+
+  void gradient( ROL::Vector<Real> &g, const ROL::Vector<Real> &z, Real &tol ) {
+    ngrad_++;
+    comp_->gradient(*dctrl_, *Fz_, tol);
+    applyFilter(g, *dctrl_, true);
+  }
+
+  void hessVec( ROL::Vector<Real> &hv, const ROL::Vector<Real> &v, const ROL::Vector<Real> &z, Real &tol ) {
+    nhess_++;
+    applyFilter(*Fv_, v, false);
+    comp_->hessVec(*dctrl_, *Fv_, *Fz_, tol);
+    applyFilter(hv, *dctrl_, true);
+  }
+
+  /** \brief Apply a reduced Hessian preconditioner.
+  */
+  virtual void precond( ROL::Vector<Real> &Pv, const ROL::Vector<Real> &v, const ROL::Vector<Real> &z, Real &tol ) {
+    nprec_++;
+    Pv.set(v.dual());
+  }
+
+  void summarize(std::ostream &stream) const {
+    stream << std::endl;
+    stream << std::string(114,'=') << std::endl;
+    stream << "  Filtered_Objective_Compliance::summarize" << std::endl;
+    stream << "    Number of calls to update:          " << nupda_ << std::endl;
+    stream << "    Number of calls to value:           " << nfval_ << std::endl;
+    stream << "    Number of calls to gradient:        " << ngrad_ << std::endl;
+    stream << "    Number of calls to hessVec:         " << nhess_ << std::endl;
+    stream << "    Number of calls to precond:         " << nprec_ << std::endl;
+    stream << "    Number of filter assemblies:        " << nasFi_ << std::endl;
+    stream << "    Number of filter applies:           " << napFi_ << std::endl;
+    stream << std::string(114,'=') << std::endl;
+    stream << std::endl;
+    comp_->summarize(stream);
+  }
+
+  void reset(void) {
+    nupda_ = 0; nfval_ = 0; ngrad_ = 0; nhess_ = 0; nprec_ = 0; nasFi_ = 0; napFi_ = 0;
+    comp_->reset();
+  }
+
+// For parametrized (stochastic) objective functions and constraints
+public:
+  void setParameter(const std::vector<Real> &param) {
+    ROL::Objective<Real>::setParameter(param);
+    comp_->setParameter(param);
+  }
+
+private:
+
+  ROL::Ptr<const Tpetra::MultiVector<>> getConstField(const ROL::Vector<Real> &x) const {
+    ROL::Ptr<const Tpetra::MultiVector<>> xp;
+    try {
+      xp = dynamic_cast<const ROL::TpetraMultiVector<Real>&>(x).getVector();
+    }
+    catch (std::exception &e) {
+      try {
+        ROL::Ptr<const ROL::TpetraMultiVector<Real>> xvec
+          = dynamic_cast<const PDE_OptVector<Real>&>(x).getField();
+        if (xvec == ROL::nullPtr) {
+          xp = ROL::nullPtr;
+        }
+        else {
+          xp = xvec->getVector();
+        }
+      }
+      catch (std::exception &ee) {
+        xp = ROL::nullPtr;
+      }
+    }
+    return xp;
+  }
+
+  ROL::Ptr<Tpetra::MultiVector<>> getField(ROL::Vector<Real> &x) const {
+    ROL::Ptr<Tpetra::MultiVector<>> xp;
+    try {
+      xp = dynamic_cast<ROL::TpetraMultiVector<Real>&>(x).getVector();
+    }
+    catch (std::exception &e) {
+      try {
+        ROL::Ptr<ROL::TpetraMultiVector<Real>> xvec
+          = dynamic_cast<PDE_OptVector<Real>&>(x).getField();
+        if ( xvec == ROL::nullPtr ) {
+          xp = ROL::nullPtr;
+        }
+        else {
+          xp = xvec->getVector();
+        }
+      }
+      catch (std::exception &ee) {
+        xp = ROL::nullPtr;
+      }
+    }
+    return xp;
+  }
+
+  void assembleFilter(void) {
+    nasFi_++;
+    ROL::Ptr<Tpetra::MultiVector<>> f_ctrl = assembler_filter_->createControlVector();
+    assembler_filter_->assemblePDEJacobian1(matF1_,pde_filter_,f_state_,f_ctrl);
+    assembler_filter_->assemblePDEJacobian2(matF2_,pde_filter_,f_state_,f_ctrl);
+    solver_filter_->setA(matF1_);
+  }
+
+  void applyFilter(ROL::Vector<Real> &Fx, const ROL::Vector<Real> &x, const bool transpose) {
+    napFi_++;
+    ROL::Ptr<Tpetra::MultiVector<>>      Fx_data = getField(Fx);
+    ROL::Ptr<const Tpetra::MultiVector<>> x_data = getConstField(x);
+    if (transpose) {
+      solver_filter_->solve(f_state_,x_data,false);
+      matF2_->apply(*f_state_,*Fx_data,Teuchos::TRANS);
+    }
+    else {
+      matF2_->apply(*x_data,*f_res_);
+      solver_filter_->solve(Fx_data,f_res_,false);
+    }
+    Fx.scale(static_cast<Real>(-1));
+  }
+
+  void update_initial( const ROL::Vector<Real> &z, int iter ) {
+    applyFilter(*Fz_,z,false);
+    Fzcache_->set(*Fz_);
+  }
+
+  void update_accept( const ROL::Vector<Real> &z, int iter ) {
+    Fzcache_->set(*Fz_);
+  }
+
+  void update_temp( const ROL::Vector<Real> &z, int iter ) {
+    applyFilter(*Fz_,z,false);
+  }
+
+  void update_trial( const ROL::Vector<Real> &z, int iter ) {
+    applyFilter(*Fz_,z,false);
+  }
+
+  void update_revert( const ROL::Vector<Real> &z, int iter ) {
+    Fz_->set(*Fzcache_);
+  }
+}; // class TopOptComplianceObjective
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/src/materialtensorK.hpp b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/src/materialtensorK.hpp
new file mode 100644
index 000000000000..3e8e3ba2ddff
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/src/materialtensorK.hpp
@@ -0,0 +1,332 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  obj.hpp
+    \brief Provides the interface for local (cell-based) objective function computations.
+*/
+
+#ifndef PDEOPT_MATERIALTENSORK_HPP
+#define PDEOPT_MATERIALTENSORK_HPP
+
+#include "../../../../TOOLS/feK.hpp"
+
+template <class Real, class DeviceType>
+class MaterialTensor {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  ROL::Ptr<fe_type> fe_;
+
+  // Problem parameters.
+  Real youngsModulus_;
+  Real poissonRatio_;
+  bool isPlainStress_;
+  Real minDensity_;
+  Real maxDensity_;
+  Real powerSIMP_;
+  bool usePhaseField_;
+  bool useProjection_;
+  Real beta_, eta_;
+  std::vector<std::vector<Real>> materialMat_;
+
+  // Precomputed quantities.
+  std::vector<scalar_view> BMat_, BdetJMat_, NMat_, NdetJMat_, CBdetJMat_;
+
+  void computeMaterialTensor(const int d) {
+    int matd = (d*(d+1))/2;
+    std::vector<Real> tmpvec(matd);
+    materialMat_.resize(matd, tmpvec);
+    if ((d==2) && (isPlainStress_)) {
+      Real one(1), half(0.5);
+      Real factor1 = youngsModulus_ /(one-poissonRatio_*poissonRatio_);
+      materialMat_[0][0] = factor1;
+      materialMat_[0][1] = factor1 * poissonRatio_;
+      materialMat_[1][0] = factor1 * poissonRatio_;
+      materialMat_[1][1] = factor1;
+      materialMat_[2][2] = factor1 * half * (one-poissonRatio_);
+    }
+    else if ((d==2) && (!isPlainStress_)) {
+      Real one(1), two(2), half(0.5);
+      Real factor2 = youngsModulus_ /(one+poissonRatio_)/(one-two*poissonRatio_);
+      materialMat_[0][0] = factor2 * (one-poissonRatio_);
+      materialMat_[0][1] = factor2 * poissonRatio_;
+      materialMat_[1][0] = factor2 * poissonRatio_;
+      materialMat_[1][1] = factor2 * (one-poissonRatio_);
+      materialMat_[2][2] = factor2 * half * (one-two*poissonRatio_);
+    }
+    else {
+      Real one(1), two(2), half(0.5);
+      Real lam = youngsModulus_*poissonRatio_/(one+poissonRatio_)/(one-two*poissonRatio_);
+      Real mu = half*youngsModulus_/(one+poissonRatio_);
+      materialMat_[0][0] = lam + two*mu;
+      materialMat_[0][1] = lam;
+      materialMat_[0][2] = lam;
+      materialMat_[1][0] = lam;
+      materialMat_[1][1] = lam + two*mu;
+      materialMat_[1][2] = lam;
+      materialMat_[2][0] = lam;
+      materialMat_[2][1] = lam;
+      materialMat_[2][2] = lam + two*mu;
+      materialMat_[3][3] = mu;
+      materialMat_[4][4] = mu;
+      materialMat_[5][5] = mu;
+    }
+  }
+
+  void computeNBmats(void) {
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    int p = fe_->gradN().extent_int(2);
+    int d = fe_->gradN().extent_int(3);
+    int matd = materialMat_.size();
+
+    for (int i=0; i<d; ++i) {
+      BMat_.push_back(scalar_view("Bmat",c,f,p,matd));
+      BdetJMat_.push_back(scalar_view("BdetJMat",c,f,p,matd));
+      CBdetJMat_.push_back(scalar_view("CBdetJMat",c,f,p,matd));
+      NMat_.push_back(scalar_view("NMat",c,f,p,d));
+      NdetJMat_.push_back(scalar_view("NdetJMat",c,f,p,d));
+    }
+
+    if (d==2) {
+      for (int i=0; i<c; ++i) {
+        for (int j=0; j<f; ++j) {
+      	  for (int k=0; k<p; ++k) {
+            (NMat_[0])(i, j, k, 0) = (fe_->N())(i, j, k);
+            (NMat_[1])(i, j, k, 1) = (fe_->N())(i, j, k);
+            (NdetJMat_[0])(i, j, k, 0) = (fe_->NdetJ())(i, j, k);
+            (NdetJMat_[1])(i, j, k, 1) = (fe_->NdetJ())(i, j, k);
+
+            (BMat_[0])(i, j, k, 0) = (fe_->gradN())(i, j, k, 0);
+            (BMat_[1])(i, j, k, 1) = (fe_->gradN())(i, j, k, 1);
+            (BMat_[0])(i, j, k, 2) = (fe_->gradN())(i, j, k, 1);
+            (BMat_[1])(i, j, k, 2) = (fe_->gradN())(i, j, k, 0);
+
+            (BdetJMat_[0])(i, j, k, 0) = (fe_->gradNdetJ())(i, j, k, 0);
+            (BdetJMat_[1])(i, j, k, 1) = (fe_->gradNdetJ())(i, j, k, 1);
+            (BdetJMat_[0])(i, j, k, 2) = (fe_->gradNdetJ())(i, j, k, 1);
+            (BdetJMat_[1])(i, j, k, 2) = (fe_->gradNdetJ())(i, j, k, 0);
+          }
+        }
+      }
+    }
+
+    if(d==3) {
+      for (int i=0; i<c; ++i) {
+        for (int j=0; j<f; ++j) {
+          for (int k=0; k<p; ++k) {
+            (NMat_[0])(i, j, k, 0) = (fe_->N())(i, j, k);
+            (NMat_[1])(i, j, k, 1) = (fe_->N())(i, j, k);
+            (NMat_[2])(i, j, k, 2) = (fe_->N())(i, j, k);
+            (NdetJMat_[0])(i, j, k, 0) = (fe_->NdetJ())(i, j, k);
+            (NdetJMat_[1])(i, j, k, 1) = (fe_->NdetJ())(i, j, k);
+            (NdetJMat_[2])(i, j, k, 2) = (fe_->NdetJ())(i, j, k);
+
+            (BMat_[0])(i, j, k, 0) = (fe_->gradN())(i, j, k, 0);
+            (BMat_[1])(i, j, k, 1) = (fe_->gradN())(i, j, k, 1);
+            (BMat_[2])(i, j, k, 2) = (fe_->gradN())(i, j, k, 2);
+            (BMat_[1])(i, j, k, 3) = (fe_->gradN())(i, j, k, 2);
+            (BMat_[2])(i, j, k, 3) = (fe_->gradN())(i, j, k, 1);
+            (BMat_[0])(i, j, k, 4) = (fe_->gradN())(i, j, k, 2);
+            (BMat_[2])(i, j, k, 4) = (fe_->gradN())(i, j, k, 0);
+            (BMat_[0])(i, j, k, 5) = (fe_->gradN())(i, j, k, 1);
+            (BMat_[1])(i, j, k, 5) = (fe_->gradN())(i, j, k, 0);
+
+            (BdetJMat_[0])(i, j, k, 0) = (fe_->gradNdetJ())(i, j, k, 0);
+            (BdetJMat_[1])(i, j, k, 1) = (fe_->gradNdetJ())(i, j, k, 1);
+            (BdetJMat_[2])(i, j, k, 2) = (fe_->gradNdetJ())(i, j, k, 2);
+            (BdetJMat_[1])(i, j, k, 3) = (fe_->gradNdetJ())(i, j, k, 2);
+            (BdetJMat_[2])(i, j, k, 3) = (fe_->gradNdetJ())(i, j, k, 1);
+            (BdetJMat_[0])(i, j, k, 4) = (fe_->gradNdetJ())(i, j, k, 2);
+            (BdetJMat_[2])(i, j, k, 4) = (fe_->gradNdetJ())(i, j, k, 0);
+            (BdetJMat_[0])(i, j, k, 5) = (fe_->gradNdetJ())(i, j, k, 1);
+            (BdetJMat_[1])(i, j, k, 5) = (fe_->gradNdetJ())(i, j, k, 0);
+          }
+        }
+      }
+    } 
+
+    for (int i=0; i<c; ++i) {
+      for (int j=0; j<f; ++j) {
+        for (int k=0; k<p; ++k) {
+          for (int l=0; l<d; ++l) {
+            for (int m=0; m<matd; ++m) {
+              for (int n=0; n<matd; ++n)
+                (CBdetJMat_[l])(i,j,k,m) += materialMat_[m][n] * (BdetJMat_[l])(i,j,k,n);
+            }
+          }
+        }
+      }
+    }
+
+  }
+
+public:
+  MaterialTensor(ROL::ParameterList &parlist) {
+    // Problem parameters.
+    youngsModulus_ = parlist.get("Young's Modulus",      1.0);
+    poissonRatio_  = parlist.get("Poisson Ratio",        0.3);
+    isPlainStress_ = parlist.get("Use Plain Stress",     true);
+    minDensity_    = parlist.get("Minimum Density",      1e-4);
+    maxDensity_    = parlist.get("Maximum Density",      1.0);
+    powerSIMP_     = parlist.get("SIMP Power",           3.0);
+    usePhaseField_ = parlist.get("Use Phase Field",      false);
+    useProjection_ = parlist.get("Use Projection",       false);
+    beta_          = parlist.get("Projection Parameter", 1.0);
+    eta_           = parlist.get("Projection Threshold", 0.5);
+  }
+
+  void setFE(ROL::Ptr<fe_type> &fe) {
+    fe_ = fe;
+    int d = fe_->gradN().extent_int(3);
+    computeMaterialTensor(d);
+    computeNBmats();
+  }
+
+  void applyTensor(scalar_view & out, const scalar_view inData) const {
+    int c = fe_->gradN().extent_int(0);
+    int p = fe_->gradN().extent_int(2);
+    int matd = materialMat_.size();
+
+    for (int i=0; i<c; ++i) {
+      for (int j=0; j<p; ++j) {
+        for (int k=0; k<matd; ++k) {
+          for (int l=0; l<matd; ++l)
+            out(i,j,k) += materialMat_[k][l] * inData(i,j,l);
+        }
+      }
+    }
+  }
+
+  void computeUmat(scalar_view & UMat, std::vector<scalar_view> & gradU) const {
+    int c = fe_->gradN().extent_int(0);
+    int p = fe_->gradN().extent_int(2);
+    int d = fe_->gradN().extent_int(3);
+    int matd = materialMat_.size();
+
+    UMat = scalar_view("UMat",c,p,matd);
+
+    if (d==2) {
+      for (int i=0; i<c; ++i) {
+        for (int j=0; j<p; ++j) {
+          UMat(i, j, 0) = (gradU[0])(i, j, 0);
+          UMat(i, j, 1) = (gradU[1])(i, j, 1);
+          UMat(i, j, 2) = (gradU[0])(i, j, 1) + (gradU[1])(i, j, 0);
+        }
+      }
+    }
+
+    if(d==3) {
+      for (int i=0; i<c; ++i) {
+        for (int j=0; j<p; ++j) {
+          UMat(i, j, 0) = (gradU[0])(i, j, 0);
+          UMat(i, j, 1) = (gradU[1])(i, j, 1);
+          UMat(i, j, 2) = (gradU[2])(i, j, 2);
+          UMat(i, j, 3) = (gradU[1])(i, j, 2) + (gradU[2])(i, j, 1);
+          UMat(i, j, 4) = (gradU[0])(i, j, 2) + (gradU[2])(i, j, 0);
+          UMat(i, j, 5) = (gradU[0])(i, j, 1) + (gradU[1])(i, j, 0);
+        }
+      }
+    } 
+  }
+
+  void computeDensity(scalar_view & rho, const scalar_view Z, int deriv = 0) const {
+    // Retrieve dimensions.
+    int c = rho.extent_int(0);
+    int p = rho.extent_int(1);
+
+    Real z(0);
+    const Real half(0.5), one(1), two(2), diff = maxDensity_-minDensity_;
+    for (int i=0; i<c; ++i) {
+      for (int j=0; j<p; ++j) {
+        z = Z(i,j);
+        if (!usePhaseField_) {
+          if (!useProjection_) {
+            if (deriv==0)
+              rho(i,j) = minDensity_ + diff*std::pow(z, powerSIMP_);
+            else if (deriv==1)
+              rho(i,j) = powerSIMP_*diff*std::pow(z, powerSIMP_-one);
+            else if (deriv==2)
+              rho(i,j) = powerSIMP_*(powerSIMP_-one)*diff*std::pow(z, powerSIMP_-two);
+          }
+          else {
+            if (deriv==0) {
+              Real pz = project(z,0);
+              rho(i,j) = minDensity_ + diff*std::pow(pz, powerSIMP_);
+            }
+            else if (deriv==1) {
+              Real pz = project(z,0), pz1 = project(z,1);
+              rho(i,j) = powerSIMP_*diff*std::pow(pz, powerSIMP_-one)*pz1;
+            }
+            else if (deriv==2) {
+              Real pz = project(z,0), pz1 = project(z,1), pz2 = project(z,2);
+              rho(i,j) = powerSIMP_*(powerSIMP_-one)*diff*std::pow(pz, powerSIMP_-two)*pz1*pz1
+                           +powerSIMP_*diff*std::pow(pz, powerSIMP_-one)*pz2;
+            }
+          }
+        }
+        else {
+          z += one;
+          z *= half;
+          if (deriv==0)      rho(i,j) = minDensity_ + diff*std::pow(z, 2);
+          else if (deriv==1) rho(i,j) = diff*z;
+          else if (deriv==2) rho(i,j) = half*diff;
+        }
+      }
+    }
+  }
+
+  const scalar_view B(int i) const {
+    return BMat_[i];
+  }
+
+  const scalar_view BdetJ(int i) const {
+    return BdetJMat_[i];
+  }
+
+  const scalar_view N(int i) const {
+    return NMat_[i];
+  }
+
+  const scalar_view NdetJ(int i) const{
+    return NdetJMat_[i];
+  }
+
+  const scalar_view CBdetJ(int i) const {
+    return CBdetJMat_[i];
+  }
+
+  int getMatrixDim(void) const {
+    return materialMat_.size();
+  }
+
+private:
+  Real project(Real &rho, int deriv) const {
+    if (deriv==1) {
+      const Real one(1), tbe(std::tanh(beta_*eta_)), tbe1(std::tanh(beta_*(one-eta_)));
+      const Real tber(std::tanh(beta_*(rho-eta_)));
+      return (one-tber*tber)/(tbe+tbe1)*beta_;
+    }
+    else if (deriv==2) {
+      const Real one(1), tbe(std::tanh(beta_*eta_)), tbe1(std::tanh(beta_*(one-eta_)));
+      const Real tber(std::tanh(beta_*(rho-eta_))), two(2);
+      return -two*tber*(one-tber*tber)/(tbe+tbe1)*beta_*beta_;
+    }
+    const Real one(1), tbe(std::tanh(beta_*eta_)), tbe1(std::tanh(beta_*(one-eta_)));
+    return (tbe+std::tanh(beta_*(rho-eta_)))/(tbe+tbe1);
+  }
+};
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/src/obj_volumeK.hpp b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/src/obj_volumeK.hpp
new file mode 100644
index 000000000000..bbf7c9609328
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/src/obj_volumeK.hpp
@@ -0,0 +1,123 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  obj.hpp
+    \brief Provides the interface for local (cell-based) objective function computations.
+*/
+
+#ifndef PDEOPT_QOI_VOLUMEK_HPP
+#define PDEOPT_QOI_VOLUMEK_HPP
+
+#include "../../../../TOOLS/qoiK.hpp"
+#include "../../../../TOOLS/feK.hpp"
+
+template <class Real, class DeviceType>
+class QoI_Volume_TopoOpt : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type> fe_;
+  scalar_view ones_, volFrac_;
+
+public:
+  QoI_Volume_TopoOpt(const ROL::Ptr<fe_type> &fe, Real v0 = Real(0))
+  : fe_(fe) {
+    // Get relevant dimensions
+    int c = fe_->cubPts().extent_int(0);
+    int p = fe_->cubPts().extent_int(1);
+    ones_ = scalar_view("ones",c,p);
+    volFrac_ = scalar_view("volFrac",c,p);
+    Kokkos::deep_copy(ones_,static_cast<Real>(1));
+    Kokkos::deep_copy(volFrac_,v0);
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = fe_->gradN().extent_int(0);
+    int p = fe_->gradN().extent_int(2);
+
+    // Initialize output val
+    val = scalar_view("val", c);
+
+    // Evaluate on FE basis
+    scalar_view valZ_eval("valZ_eval", c, p);
+    fe_->evaluateValue(valZ_eval, z_coeff);
+    rst::subtract(valZ_eval,volFrac_);
+
+    // Compute energy
+    fe_->computeIntegral(val,valZ_eval,ones_);
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Volume_TopoOpt::gradient_1 is zero.");
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+
+    // Initialize output grad
+    grad = scalar_view("grad", c, f);
+
+    // Compute gradient of energy
+    fst::integrate(grad,ones_,fe_->NdetJ(),false);
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Volume_TopoOpt::HessVec_11 is zero.");
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Volume_TopoOpt::HessVec_12 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Volume_TopoOpt::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Volume_TopoOpt::HessVec_22 is zero.");
+  }
+
+}; // QoI_VolumeObj
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/src/pde_elasticityK.hpp b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/src/pde_elasticityK.hpp
new file mode 100644
index 000000000000..a5ef74575426
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/src/pde_elasticityK.hpp
@@ -0,0 +1,625 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  pde_elasticity.hpp
+    \brief Implements the local PDE interface for the structural topology
+           optimization problem.
+*/
+
+#ifndef PDE_TOPO_OPT_ELASTICITYK_HPP
+#define PDE_TOPO_OPT_ELASTICITYK_HPP
+
+#include "../../../../TOOLS/pdeK.hpp"
+#include "../../../../TOOLS/feK.hpp"
+#include "../../../../TOOLS/fieldhelperK.hpp"
+
+#include "dirichletK.hpp"
+#include "tractionK.hpp"
+#include "materialtensorK.hpp"
+
+#include "Intrepid2_HVOL_C0_FEM.hpp"
+#include "Intrepid2_HGRAD_QUAD_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_QUAD_C2_FEM.hpp"
+#include "Intrepid2_HGRAD_HEX_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_HEX_C2_FEM.hpp"
+#include "Intrepid2_DefaultCubatureFactory.hpp"
+#include "Intrepid2_FunctionSpaceTools.hpp"
+#include "Intrepid2_CellTools.hpp"
+
+#include "ROL_Ptr.hpp"
+
+template <class Real, class DeviceType>
+class PDE_Elasticity : public PDE<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  // Finite element basis information
+  basis_ptr basisPtr_, basisPtrDens_;
+  std::vector<basis_ptr> basisPtrs_, basisPtrsDens_;
+  // Cell cubature information
+  ROL::Ptr<Intrepid2::Cubature<DeviceType,Real,Real>> cellCub_, bdryCub_;
+  // Cell node information
+  scalar_view volCellNodes_;
+  std::vector<std::vector<scalar_view>> bdryCellNodes_;
+  std::vector<std::vector<std::vector<int>>> bdryCellLocIds_;
+  // Finite element definition
+  ROL::Ptr<fe_type> fe_, feDens_;
+  std::vector<std::vector<ROL::Ptr<fe_type>>> feBdry_;
+  // Local degrees of freedom on boundary, for each side of the reference cell (first index).
+  std::vector<std::vector<int>> fidx_;
+  // Coordinates of degrees freedom on boundary cells.
+  // Indexing:  [sideset number][local side id](cell number, value at dof)
+  std::vector<std::vector<scalar_view>> bdryCellDofValues_;
+  // Field pattern, offsets, etc.
+  std::vector<std::vector<int>> fieldPattern_;   // local Field/DOF pattern; set from DOF manager 
+  int numFields_;                                // number of fields (equations in the PDE)
+  int numDofs_;                                  // total number of degrees of freedom for all (local) fields
+  std::vector<int> offset_;                      // for each field, a counting offset
+  std::vector<int> numFieldDofs_;                // for each field, number of degrees of freedom
+  std::vector<std::vector<int>> fieldPatternDens_;   // local Field/DOF pattern; set from DOF manager 
+  int numFieldsDens_;                                // number of fields (equations in the PDE)
+  int numDofsDens_;                                  // total number of degrees of freedom for all (local) fields
+  std::vector<int> offsetDens_;                      // for each field, a counting offset
+  std::vector<int> numFieldDofsDens_;                // for each field, number of degrees of freedom
+
+  ROL::Ptr<MaterialTensor<Real,DeviceType>>  matTensor_;
+  ROL::Ptr<Dirichlet<Real,DeviceType>>       dirichlet_;
+  ROL::Ptr<Traction<Real,DeviceType>>        traction_;
+  ROL::Ptr<FieldUtils::FieldInfo> fieldInfo_, fieldInfoDens_;
+
+  bool getFields2called_;
+
+public:
+  PDE_Elasticity(ROL::ParameterList &parlist) : getFields2called_(false) {
+    // Finite element fields.
+    int basisOrder     = parlist.sublist("Problem").get("Basis Order",1);
+    int basisOrderDens = parlist.sublist("Problem").get("Density Basis Order",0);
+    int cubDegree      = parlist.sublist("Problem").get("Cubature Degree",4);
+    int bdryCubDegree  = parlist.sublist("Problem").get("Boundary Cubature Degree",2);
+    int probDim        = parlist.sublist("Problem").get("Problem Dimension",2);
+    if (probDim > 3 || probDim < 2) {
+      TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument,
+        ">>> PDE-OPT/poisson/pde_poisson.hpp: Problem dimension is not 2 or 3!");
+    }
+    if (basisOrder > 2 || basisOrder < 1) {
+      TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument,
+        ">>> PDE-OPT/poisson/pde_poisson.hpp: Basis order is not 1 or 2!");
+    }
+    if (probDim == 2) {
+      if (basisOrder == 1)
+        basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C1_FEM<DeviceType,Real,Real>>();
+      else if (basisOrder == 2)
+        basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C2_FEM<DeviceType,Real,Real>>();
+    }
+    else if (probDim == 3) {
+      if (basisOrder == 1)
+        basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_HEX_C1_FEM<DeviceType,Real,Real>>();
+      else if (basisOrder == 2)
+        basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_HEX_C2_FEM<DeviceType,Real,Real>>();
+    }
+    basisPtrs_.clear(); basisPtrs_.resize(probDim,basisPtr_); // Displacement component
+    shards::CellTopology cellType = basisPtr_->getBaseCellTopology(); // get cell type from basis
+    if (basisOrderDens == 1) {
+      if (probDim == 2)
+        basisPtrDens_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C1_FEM<DeviceType,Real,Real>>();
+      else if (probDim == 3)
+        basisPtrDens_ = ROL::makePtr<Intrepid2::Basis_HGRAD_HEX_C1_FEM<DeviceType,Real,Real>>();
+    }
+    else
+      basisPtrDens_ = ROL::makePtr<Intrepid2::Basis_HVOL_C0_FEM<DeviceType,Real,Real>>(cellType);
+    basisPtrsDens_.clear(); basisPtrsDens_.push_back(basisPtrDens_); // Density component
+
+    // Quadrature rules.
+    Intrepid2::DefaultCubatureFactory cubFactory;                            // create cubature factory
+    cellCub_ = cubFactory.create<DeviceType,Real,Real>(cellType, cubDegree); // create default cubature
+
+    shards::CellTopology bdryCellType = cellType.getCellTopologyData(probDim-1, 0);
+    bdryCub_ = cubFactory.create<DeviceType,Real,Real>(bdryCellType, bdryCubDegree);
+
+    matTensor_ = ROL::makePtr<MaterialTensor<Real,DeviceType>>(parlist.sublist("Problem"));
+    traction_  = ROL::makePtr<Traction<Real,DeviceType>>(probDim);
+    dirichlet_ = ROL::makePtr<Dirichlet<Real,DeviceType>>();
+
+    numDofs_ = 0;
+    numFields_ = basisPtrs_.size();
+    offset_.resize(numFields_);
+    numFieldDofs_.resize(numFields_);
+    for (int i=0; i<numFields_; ++i) {
+      if (i==0)
+        offset_[i]  = 0;
+      else
+        offset_[i]  = offset_[i-1] + basisPtrs_[i-1]->getCardinality();
+      numFieldDofs_[i] = basisPtrs_[i]->getCardinality();
+      numDofs_ += numFieldDofs_[i];
+    }
+
+    numDofsDens_ = 0;
+    numFieldsDens_ = basisPtrsDens_.size();
+    offsetDens_.resize(numFieldsDens_);
+    numFieldDofsDens_.resize(numFieldsDens_);
+    for (int i=0; i<numFieldsDens_; ++i) {
+      if (i==0)
+        offsetDens_[i]  = 0;
+      else
+        offsetDens_[i]  = offsetDens_[i-1] + basisPtrsDens_[i-1]->getCardinality();
+      numFieldDofsDens_[i] = basisPtrsDens_[i]->getCardinality();
+      numDofsDens_ += numFieldDofsDens_[i];
+    }
+  }
+
+  void residual(scalar_view &res,
+                const scalar_view u_coeff,
+                const scalar_view z_coeff = scalar_view(),
+                const ROL::Ptr<const std::vector<Real>> &z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    int p = fe_->gradN().extent_int(2);
+    int d = fe_->gradN().extent_int(3);
+    int matd = matTensor_->getMatrixDim();
+ 
+    // Initialize residuals.
+    std::vector<scalar_view> R(d);
+    for (int i=0; i<d; ++i)
+      R[i] = scalar_view("res",c,f);
+
+    // Split u_coeff into components.
+    std::vector<scalar_view> U, Z;
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoDens_);
+
+    // Evaluate/interpolate finite element fields on cells.
+    std::vector<scalar_view> gradDisp_eval(d);
+    scalar_view UMat;
+    scalar_view rho("rho", c, p);
+    scalar_view valZ_eval("valZ_eval", c, p);
+    scalar_view rhoUMat("rhoUMat", c, p, matd);
+    for (int i=0; i<d; ++i) {
+      gradDisp_eval[i] =  scalar_view("gradDisp_eval", c, p, d);
+      fe_->evaluateGradient(gradDisp_eval[i], U[i]);
+    }
+    feDens_->evaluateValue(valZ_eval, Z[0]);
+
+    // EVALUATE MATERIAL TENSOR
+    matTensor_->computeUmat(UMat, gradDisp_eval);
+    matTensor_->computeDensity(rho, valZ_eval);
+    fst::scalarMultiplyDataData(rhoUMat, rho, UMat);
+    for (int i=0; i<d; ++i)
+      fst::integrate(R[i],rhoUMat,matTensor_->CBdetJ(i),false);
+
+    // APPLY TRACTION CONDITIONS
+    if (!traction_->isNull())
+      traction_->apply(R, feBdry_, PDE<Real,DeviceType>::getParameter(), static_cast<Real>(-1));
+
+    // APPLY DIRICHLET CONDITIONS
+    dirichlet_->applyResidual(R,U);
+
+    // Combine the residuals.
+    FieldUtils::combineFieldCoeff<Real>(res, R, fieldInfo_);
+  }
+
+  void Jacobian_1(scalar_view &jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> &z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    int p = fe_->gradN().extent_int(2);
+    int d = fe_->gradN().extent_int(3);
+    int matd = matTensor_->getMatrixDim();
+ 
+    // Initialize Jacobians.
+    std::vector<std::vector<scalar_view>> J(d);
+    for (int i=0; i<d; ++i) {
+      for (int j=0; j<d; ++j)
+        J[i].push_back(scalar_view("jac",c,f,f));
+    }
+
+    // Split z_coeff into components.
+    std::vector<scalar_view> Z;
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoDens_);
+
+    std::vector<scalar_view> rhoBMat(d);
+    scalar_view rho("rho", c, p);
+    scalar_view valZ_eval("valZ_eval", c, p);
+
+    feDens_->evaluateValue(valZ_eval, Z[0]);
+    matTensor_->computeDensity(rho, valZ_eval);
+    for (int i=0; i<d; ++i) {
+      rhoBMat[i] =  scalar_view("rhoBMat", c, f, p, matd);
+      fst::scalarMultiplyDataField(rhoBMat[i], rho, matTensor_->B(i));
+    }
+
+    /*** Evaluate weak form of the Jacobian. ***/
+    for (int i=0; i<d; ++i) {
+      for (int j=0; j<d; ++j)
+        fst::integrate(J[i][j],rhoBMat[i],matTensor_->CBdetJ(j),false);
+    }
+
+    // APPLY DIRICHLET CONDITIONS
+    dirichlet_->applyJacobian1(J);
+
+    // Combine the jacobians.
+    FieldUtils::combineFieldCoeff<Real>(jac, J, fieldInfo_, fieldInfo_);
+  }
+
+  void Jacobian_2(scalar_view &jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> &z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    int c  = fe_->gradN().extent_int(0);
+    int f  = fe_->gradN().extent_int(1);
+    int fd = feDens_->gradN().extent_int(1);
+    int p  = fe_->gradN().extent_int(2);
+    int d  = fe_->gradN().extent_int(3);
+    int matd = matTensor_->getMatrixDim();
+
+    // Initialize Jacobians.
+    std::vector<std::vector<scalar_view>> J(d);
+    for (int i=0; i<d; ++i)
+      J[i].push_back(scalar_view("jac",c,f,fd));
+
+    // Split u_coeff into components.
+    std::vector<scalar_view> U, Z;
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoDens_);
+
+    // Evaluate/interpolate finite element fields on cells.
+    std::vector<scalar_view> gradDisp_eval(d), CBrhoUMat(d);
+    scalar_view UMat;
+    scalar_view rho("rho", c, p);
+    scalar_view valZ_eval("valZ_eval", c, p);
+    scalar_view rhoUMat("rhoUMat", c, p, matd);
+    for (int i=0; i<d; ++i) {
+      CBrhoUMat[i]     = scalar_view("CBrhoUMat", c, f, p);
+      gradDisp_eval[i] = scalar_view("gradDisp_eval", c, p, d);
+      fe_->evaluateGradient(gradDisp_eval[i], U[i]);
+    }
+    feDens_->evaluateValue(valZ_eval, Z[0]);
+    matTensor_->computeUmat(UMat, gradDisp_eval);
+    matTensor_->computeDensity(rho, valZ_eval, 1);  // first derivative
+    fst::scalarMultiplyDataData(rhoUMat, rho, UMat);
+
+    for (int i=0; i<d; ++i)
+      fst::dotMultiplyDataField(CBrhoUMat[i], rhoUMat, matTensor_->CBdetJ(i));
+
+    /*** Evaluate weak form of the residual. ***/
+    for (int i=0; i<d; ++i)
+      fst::integrate(J[i][0],CBrhoUMat[i],feDens_->N(),false);
+
+    // APPLY DIRICHLET CONDITIONS
+    dirichlet_->applyJacobian2(J);
+
+    // Combine the jacobians.
+    FieldUtils::combineFieldCoeff<Real>(jac, J, fieldInfo_, fieldInfoDens_);
+  }
+
+  void Hessian_11(scalar_view &hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> &z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_TopoOpt::Hessian_11): Hessian is zero.");
+  }
+
+  void Hessian_12(scalar_view &hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> &z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    int c  = fe_->gradN().extent_int(0);
+    int f  = fe_->gradN().extent_int(1);
+    int fd = feDens_->gradN().extent_int(1);
+    int p  = fe_->gradN().extent_int(2);
+    int d  = fe_->gradN().extent_int(3);
+    int matd = matTensor_->getMatrixDim();
+
+    // Initialize Hessians.
+    std::vector<std::vector<scalar_view>> J(1);
+    for (int i=0; i<d; ++i)
+      J[0].push_back(scalar_view("jac",c,fd,f));
+
+
+    // Split u_coeff into components.
+    std::vector<scalar_view> L, Z;
+    FieldUtils::splitFieldCoeff<Real>(L, l_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoDens_);
+
+    // Apply Dirichlet conditions to the multipliers.
+    dirichlet_->applyMultiplier(L);
+
+    // Evaluate/interpolate finite element fields on cells.
+    std::vector<scalar_view> gradDisp_eval(d), CBrhoLMat(d);
+    scalar_view LMat;
+    scalar_view rho("rho", c, p);
+    scalar_view valZ_eval("valZ_eval", c, p);
+    scalar_view rhoLMat("rhoLMat", c, p, matd);
+    for (int i=0; i<d; ++i) {
+      CBrhoLMat[i]     = scalar_view("CBrhoLMat", c, f, p);
+      gradDisp_eval[i] = scalar_view("gradDisp_eval", c, p, d);
+      fe_->evaluateGradient(gradDisp_eval[i], L[i]);
+    }
+    feDens_->evaluateValue(valZ_eval, Z[0]);
+    matTensor_->computeUmat(LMat, gradDisp_eval);
+    matTensor_->computeDensity(rho, valZ_eval, 1);  // first derivative
+    fst::scalarMultiplyDataData(rhoLMat, rho, LMat);
+
+    for (int i=0; i<d; ++i)
+      fst::dotMultiplyDataField(CBrhoLMat[i], rhoLMat, matTensor_->CBdetJ(i));
+
+    /*** Evaluate weak form of the residual. ***/
+    for (int i=0; i<d; ++i)
+      fst::integrate(J[0][i],feDens_->N(),CBrhoLMat[i],false);
+
+    // Combine the Hessians.
+    FieldUtils::combineFieldCoeff<Real>(hess, J, fieldInfoDens_, fieldInfo_);
+  }
+
+  void Hessian_21(scalar_view &hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> &z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    int c  = fe_->gradN().extent_int(0);
+    int f  = fe_->gradN().extent_int(1);
+    int fd = feDens_->gradN().extent_int(1);
+    int p  = fe_->gradN().extent_int(2);
+    int d  = fe_->gradN().extent_int(3);
+    int matd = matTensor_->getMatrixDim();
+
+    // Initialize Hessians.
+    std::vector<std::vector<scalar_view>> J(d);
+    for (int i=0; i<d; ++i)
+      J[i].push_back(scalar_view("jac",c,f,fd));
+
+    // Split u_coeff into components.
+    std::vector<scalar_view> L, Z;
+    FieldUtils::splitFieldCoeff<Real>(L, l_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoDens_);
+
+    // Apply Dirichlet conditions to the multipliers.
+    dirichlet_->applyMultiplier(L);
+
+    // Evaluate/interpolate finite element fields on cells.
+    std::vector<scalar_view> gradDisp_eval(d), CBrhoLMat(d);
+    scalar_view LMat;
+    scalar_view rho("rho", c, p);
+    scalar_view valZ_eval("valZ_eval", c, p);
+    scalar_view rhoLMat("rhoLMat", c, p, matd);
+    for (int i=0; i<d; ++i) {
+      CBrhoLMat[i]     = scalar_view("CBrhoLMat", c, f, p);
+      gradDisp_eval[i] = scalar_view("gradDisp_eval", c, p, d);
+      fe_->evaluateGradient(gradDisp_eval[i], L[i]);
+    }
+    feDens_->evaluateValue(valZ_eval, Z[0]);
+    matTensor_->computeUmat(LMat, gradDisp_eval);
+    matTensor_->computeDensity(rho, valZ_eval, 1);  // first derivative
+    fst::scalarMultiplyDataData(rhoLMat, rho, LMat);
+
+    for (int i=0; i<d; ++i)
+      fst::dotMultiplyDataField(CBrhoLMat[i], rhoLMat, matTensor_->CBdetJ(i));
+
+    /*** Evaluate weak form of the residual. ***/
+    for (int i=0; i<d; ++i)
+      fst::integrate(J[i][0],CBrhoLMat[i],feDens_->N(),false);
+
+    // Combine the Hessians.
+    FieldUtils::combineFieldCoeff(hess, J, fieldInfo_, fieldInfoDens_);
+  }
+
+  void Hessian_22(scalar_view &hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> &z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    int c  = fe_->gradN().extent_int(0);
+    int fd = feDens_->gradN().extent_int(1);
+    int p  = fe_->gradN().extent_int(2);
+    int d  = fe_->gradN().extent_int(3);
+    int matd = matTensor_->getMatrixDim();
+
+    // Initialize Hessians.
+    std::vector<std::vector<scalar_view>> J(1);
+    J[0].push_back(scalar_view("hess",c,fd,fd));
+
+    // Split u_coeff into components.
+    std::vector<scalar_view> U, L, Z;
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(L, l_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoDens_);
+
+    // Apply Dirichlet conditions to the multipliers.
+    dirichlet_->applyMultiplier(L);
+
+    // Evaluate/interpolate finite element fields on cells.
+    std::vector<scalar_view> gradDispU_eval(d), gradDispL_eval(d);
+    scalar_view UMat, LMat;
+    scalar_view rho("rho", c, p);
+    scalar_view valZ_eval("valZ_eval", c, p);
+    scalar_view rhoLMat("rhoLMat", c, p, matd);
+    scalar_view CUMat("CUMat", c, p, matd);
+    scalar_view CUrhoLMat("CUrhoLMat", c, p);
+    scalar_view NCUrhoLMat("NCUrhoLMat", c, fd, p);
+    for (int i=0; i<d; ++i) {
+      gradDispU_eval[i] = scalar_view("gradDispU_eval", c, p, d);
+      fe_->evaluateGradient(gradDispU_eval[i], U[i]);
+      gradDispL_eval[i] = scalar_view("gradDispL_eval", c, p, d);
+      fe_->evaluateGradient(gradDispL_eval[i], L[i]);
+    }
+    feDens_->evaluateValue(valZ_eval, Z[0]);
+    matTensor_->computeUmat(UMat, gradDispU_eval);
+    matTensor_->computeUmat(LMat, gradDispL_eval);
+    matTensor_->computeDensity(rho, valZ_eval, 2);  // second derivative
+    fst::scalarMultiplyDataData(rhoLMat, rho, LMat);
+    matTensor_->applyTensor(CUMat, UMat);
+
+    fst::dotMultiplyDataData(CUrhoLMat, rhoLMat, CUMat);
+    fst::scalarMultiplyDataField(NCUrhoLMat, CUrhoLMat, feDens_->N());
+
+    /*** Evaluate weak form of the residual. ***/
+    fst::integrate(J[0][0],NCUrhoLMat,feDens_->NdetJ(),false);
+
+    // Combine the Hessians.
+    FieldUtils::combineFieldCoeff(hess, J, fieldInfoDens_, fieldInfoDens_);
+  }
+
+  void RieszMap_1(scalar_view & riesz) override {
+    //throw Exception::NotImplemented(">>> (PDE_TopoOpt::RieszMap_1): Not implemented.");
+
+    // Retrieve dimensions.
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    int d = fe_->gradN().extent_int(3);
+ 
+    // Initialize Jacobians.
+    std::vector<std::vector<scalar_view>> J(d);
+    for (int i=0; i<d; ++i) {
+      for (int j=0; j<d; ++j)
+        J[i].push_back(scalar_view("riesz1",c,f,f));
+    }
+
+    for (int i=0; i<d; ++i) {
+      Kokkos::deep_copy(J[i][i],fe_->stiffMat());
+      rst::add(J[i][i],fe_->massMat());
+    }
+
+    // Combine the jacobians.
+    FieldUtils::combineFieldCoeff<Real>(riesz, J, fieldInfo_, fieldInfo_);
+  }
+
+  void RieszMap_2(scalar_view & riesz) override {
+    //throw Exception::NotImplemented(">>> (PDE_TopoOpt::RieszMap_2): Not implemented.");
+
+    // Retrieve dimensions.
+    int c = feDens_->gradN().extent_int(0);
+    int f = feDens_->gradN().extent_int(1);
+
+    riesz = scalar_view("riesz2",c,f,f);
+    Kokkos::deep_copy(riesz,feDens_->massMat());
+  }
+
+  // This must be called before getFields2
+  void setDensityFields(const std::vector<basis_ptr> &basisPtrs) {
+    if (getFields2called_) {
+      TEUCHOS_TEST_FOR_EXCEPTION(getFields2called_, std::invalid_argument,
+        ">>> PDE-OPT/topo-opt/elasticity/src/pde_elasticity.hpp: Must call before getFields2!");
+    }
+    else {
+      basisPtrDens_  = basisPtrs[0];
+      basisPtrsDens_ = basisPtrs;
+
+      numDofsDens_ = 0;
+      numFieldsDens_ = basisPtrsDens_.size();
+      offsetDens_.resize(numFieldsDens_);
+      numFieldDofsDens_.resize(numFieldsDens_);
+      for (int i=0; i<numFieldsDens_; ++i) {
+        if (i==0) {
+          offsetDens_[i]  = 0;
+        }
+        else {
+          offsetDens_[i]  = offsetDens_[i-1] + basisPtrsDens_[i-1]->getCardinality();
+        }
+        numFieldDofsDens_[i] = basisPtrsDens_[i]->getCardinality();
+        numDofsDens_ += numFieldDofsDens_[i];
+      }
+    }
+  }
+
+  std::vector<basis_ptr> getFields() override {
+    return basisPtrs_;
+  }
+
+  std::vector<basis_ptr> getFields2() override {
+    getFields2called_ = true;
+    return basisPtrsDens_;
+  }
+
+  void setCellNodes(const scalar_view &volCellNodes,
+                    const std::vector<std::vector<scalar_view>> &bdryCellNodes,
+                    const std::vector<std::vector<std::vector<int>>> &bdryCellLocIds) override {
+    volCellNodes_ = volCellNodes;
+    bdryCellNodes_ = bdryCellNodes;
+    bdryCellLocIds_ = bdryCellLocIds;
+    // Finite element definition.
+    fe_ = ROL::makePtr<fe_type>(volCellNodes_,basisPtr_,cellCub_);
+    feDens_ = ROL::makePtr<fe_type>(volCellNodes_,basisPtrDens_,cellCub_,false);
+    fidx_ = fe_->getBoundaryDofs();
+    if (!traction_->isNull())
+      traction_->setCellNodes(bdryCellNodes_,bdryCellLocIds_);
+    dirichlet_->setCellNodes(bdryCellNodes_,bdryCellLocIds_,fidx_);
+    matTensor_->setFE(fe_);
+    // Construct boundary FE
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      feBdry_.resize(numSideSets);
+      for (int i = 0; i < numSideSets; ++i) {
+        int numLocSides = bdryCellNodes[i].size();
+        feBdry_[i].resize(numLocSides);
+        for (int j = 0; j < numLocSides; ++j) {
+          if (bdryCellNodes[i][j] != scalar_view())
+            feBdry_[i][j] = ROL::makePtr<fe_type>(bdryCellNodes[i][j],basisPtr_,bdryCub_,j);
+        }
+      }
+    }
+  }
+
+  void setFieldPattern(const std::vector<std::vector<int>> &fieldPattern,
+                       const std::vector<std::vector<int>> &fieldPattern2) override {
+    fieldPattern_     = fieldPattern;
+    fieldPatternDens_ = fieldPattern2;
+    fieldInfo_     = ROL::makePtr<FieldUtils::FieldInfo>(numFields_, numDofs_, numFieldDofs_, fieldPattern_);
+    fieldInfoDens_ = ROL::makePtr<FieldUtils::FieldInfo>(numFieldsDens_, numDofsDens_, numFieldDofsDens_, fieldPatternDens_);
+  }
+
+  const ROL::Ptr<fe_type> getStateFE(void) const {
+    return fe_;
+  }
+
+  const ROL::Ptr<fe_type> getDensityFE(void) const {
+    return feDens_;
+  }
+
+  const std::vector<std::vector<ROL::Ptr<fe_type>>> getBdryFE(void) const {
+    return feBdry_;
+  }
+
+  const std::vector<std::vector<std::vector<int>>> getBdryCellLocIds(void) const {
+    return bdryCellLocIds_;
+  }
+
+  const ROL::Ptr<FieldUtils::FieldInfo> getStateFieldInfo(void) const {
+    return fieldInfo_;
+  }
+
+  const ROL::Ptr<FieldUtils::FieldInfo> getDensityFieldInfo(void) const {
+    return fieldInfoDens_;
+  }
+
+  const ROL::Ptr<Traction<Real,DeviceType>> getTraction(void) const {
+    return traction_;
+  }
+
+  const ROL::Ptr<MaterialTensor<Real,DeviceType>> getMaterialTensor(void) const {
+    return matTensor_;
+  }
+
+}; // PDE_TopoOpt
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/src/pde_filterK.hpp b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/src/pde_filterK.hpp
new file mode 100644
index 000000000000..fb46e7051621
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/src/pde_filterK.hpp
@@ -0,0 +1,249 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  pde_filter.hpp
+    \brief Implements the local PDE interface for the structural topology
+           optimization problem.
+*/
+
+#ifndef PDE_TOPO_OPT_FILTERK_HPP
+#define PDE_TOPO_OPT_FILTERK_HPP
+
+#include "../../../../TOOLS/pdeK.hpp"
+#include "../../../../TOOLS/feK.hpp"
+
+#include "Intrepid2_HVOL_C0_FEM.hpp"
+#include "Intrepid2_HGRAD_QUAD_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_QUAD_C2_FEM.hpp"
+#include "Intrepid2_HGRAD_HEX_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_HEX_C2_FEM.hpp"
+#include "Intrepid2_DefaultCubatureFactory.hpp"
+#include "Intrepid2_FunctionSpaceTools.hpp"
+#include "Intrepid2_CellTools.hpp"
+
+#include "ROL_Ptr.hpp"
+
+
+template <class Real, class DeviceType>
+class PDE_Filter : public PDE<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  // Finite element basis information
+  basis_ptr basisPtr_, basisPtrDens_;
+  std::vector<basis_ptr> basisPtrs_, basisPtrsDens_;
+  // Cell cubature information
+  ROL::Ptr<Intrepid2::Cubature<DeviceType,Real,Real>> cellCub_;
+  // Cell node information
+  scalar_view volCellNodes_;
+  // Finite element definition
+  ROL::Ptr<fe_type> fe_, feDens_;
+  // Problem parameters.
+  Real lengthScale_;
+
+public:
+  PDE_Filter(ROL::ParameterList &parlist) {
+    // Finite element fields.
+    int basisOrder     = parlist.sublist("Problem").get("Filter Basis Order",1);
+    int basisOrderDens = parlist.sublist("Problem").get("Density Basis Order",0);
+    int cubDegree      = parlist.sublist("Problem").get("Cubature Degree",4);
+    int probDim        = parlist.sublist("Problem").get("Problem Dimension",2);
+    TEUCHOS_TEST_FOR_EXCEPTION(probDim>3||probDim<2, std::invalid_argument,
+      ">>> PDE-OPT/poisson/pde_poisson.hpp: Problem dimension is not 2 or 3!");
+    TEUCHOS_TEST_FOR_EXCEPTION(basisOrder>2||basisOrder<1, std::invalid_argument,
+      ">>> PDE-OPT/poisson/pde_poisson.hpp: Basis order is not 1 or 2!");
+    if (probDim == 2) {
+      if (basisOrder == 1)
+        basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C1_FEM<DeviceType,Real,Real>>();
+      else if (basisOrder == 2)
+        basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C2_FEM<DeviceType,Real,Real>>();
+    }
+    else if (probDim == 3) {
+      if (basisOrder == 1)
+        basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_HEX_C1_FEM<DeviceType,Real,Real>>();
+      else if (basisOrder == 2)
+        basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_HEX_C2_FEM<DeviceType,Real,Real>>();
+    }
+    basisPtrs_.clear(); basisPtrsDens_.clear();
+    basisPtrs_.push_back(basisPtr_);  // Filtered Density component
+    shards::CellTopology cellType = basisPtr_->getBaseCellTopology(); // get cell type from basis
+    if (basisOrderDens == 1)  {
+      if (probDim == 2)
+        basisPtrDens_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C1_FEM<DeviceType,Real,Real>>();
+      else if (probDim == 3)
+        basisPtrDens_ = ROL::makePtr<Intrepid2::Basis_HGRAD_HEX_C1_FEM<DeviceType,Real,Real>>();
+    }
+    else
+      basisPtrDens_ = ROL::makePtr<Intrepid2::Basis_HVOL_C0_FEM<DeviceType,Real,Real>>(cellType);
+    basisPtrsDens_.push_back(basisPtrDens_); // Density component
+
+    // Quadrature rules.
+    Intrepid2::DefaultCubatureFactory cubFactory;                            // create cubature factory
+    cellCub_ = cubFactory.create<DeviceType,Real,Real>(cellType, cubDegree); // create default cubature
+
+    // Other problem parameters.
+    Real filterRadius = parlist.sublist("Problem").get("Filter Radius",  0.1);
+    lengthScale_ = std::pow(filterRadius, 2)/static_cast<Real>(12);
+  }
+
+  void residual(scalar_view & res,
+                const scalar_view u_coeff,
+                const scalar_view z_coeff = scalar_view(),
+                const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c = fe_->gradN().extent_int(0);
+    const int f = fe_->gradN().extent_int(1);
+    const int p = fe_->gradN().extent_int(2);
+    const int d = fe_->gradN().extent_int(3);
+ 
+    // Initialize residuals.
+    res = scalar_view("res", c,f);
+
+    // Evaluate/interpolate finite element fields on cells.
+    scalar_view valU_eval ("valU_eval", c, p);
+    scalar_view valZ_eval ("valZ_eval", c, p);
+    scalar_view gradU_eval("gradU_eval", c, p, d);
+    fe_->evaluateValue(valU_eval, u_coeff);
+    feDens_->evaluateValue(valZ_eval, z_coeff);
+    fe_->evaluateGradient(gradU_eval, u_coeff);
+
+    rst::scale(gradU_eval, lengthScale_);
+    rst::scale(valZ_eval, static_cast<Real>(-1));
+
+    /*** Evaluate weak form of the residual. ***/
+    fst::integrate(res,gradU_eval,fe_->gradNdetJ(),false);
+    fst::integrate(res,valU_eval,fe_->NdetJ(),true);
+    fst::integrate(res,valZ_eval,fe_->NdetJ(),true);
+  }
+
+  void Jacobian_1(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c = fe_->gradN().extent_int(0);
+    const int f = fe_->gradN().extent_int(1);
+ 
+    // Initialize Jacobians.
+    jac = scalar_view("jac",c,f,f);
+
+    /*** Evaluate weak form of the Jacobian. ***/
+    Kokkos::deep_copy(jac,fe_->stiffMat());
+    rst::scale(jac, lengthScale_); // ls*gradN1 . gradN2
+    rst::add(jac, fe_->massMat()); // + N1 * N2
+  }
+
+
+  void Jacobian_2(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c  = fe_->gradN().extent_int(0);
+    const int f  = fe_->gradN().extent_int(1);
+    const int fd = feDens_->gradN().extent_int(1);
+ 
+    // Initialize Jacobians.
+    jac = scalar_view("jac",c,f,fd);
+
+    /*** Evaluate weak form of the Jacobian. ***/
+    fst::integrate(jac,fe_->NdetJ(),feDens_->N(),false);
+    rst::scale(jac, static_cast<Real>(-1));
+  }
+
+  void Hessian_11(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Filter::Hessian_11): Hessian is zero.");
+  }
+
+  void Hessian_12(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Filter::Hessian_12): Hessian is zero.");
+  }
+
+  void Hessian_21(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Filter::Hessian_21): Hessian is zero.");
+  }
+
+  void Hessian_22(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Filter::Hessian_22): Hessian is zero.");
+  }
+
+  void RieszMap_1(scalar_view & riesz) override {
+    // Retrieve dimensions.
+    const int c = fe_->gradN().extent_int(0);
+    const int f = fe_->gradN().extent_int(1);
+ 
+    // Initialize Jacobians.
+    riesz = scalar_view("riesz1",c,f,f);
+    Kokkos::deep_copy(riesz,fe_->stiffMat());
+    rst::add(riesz,fe_->massMat());
+  }
+
+  void RieszMap_2(scalar_view & riesz) override {
+    // Retrieve dimensions.
+    const int c = feDens_->gradN().extent_int(0);
+    const int f = feDens_->gradN().extent_int(1);
+ 
+    // Initialize Jacobians.
+    riesz = scalar_view("riesz2",c,f,f);
+    Kokkos::deep_copy(riesz,feDens_->massMat());
+  }
+
+  std::vector<basis_ptr> getFields() override {
+    return basisPtrs_;
+  }
+
+  std::vector<basis_ptr> getFields2() override {
+    return basisPtrsDens_;
+  }
+
+  void setCellNodes(const scalar_view &volCellNodes,
+                    const std::vector<std::vector<scalar_view>> &bdryCellNodes,
+                    const std::vector<std::vector<std::vector<int>>> &bdryCellLocIds) override {
+    volCellNodes_ = volCellNodes;
+    // Finite element definition.
+    fe_     = ROL::makePtr<fe_type>(volCellNodes_,basisPtr_,cellCub_);
+    feDens_ = ROL::makePtr<fe_type>(volCellNodes_,basisPtrDens_,cellCub_,false);
+  }
+
+  void setFieldPattern(const std::vector<std::vector<int>> &fieldPattern,
+                       const std::vector<std::vector<int>> &fieldPattern2) override {}
+
+  const ROL::Ptr<fe_type> getStateFE(void) const {
+    return fe_;
+  }
+
+  const ROL::Ptr<fe_type> getDensityFE(void) const {
+    return feDens_;
+  }
+
+}; // PDE_Filter
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/src/traction.hpp b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/src/traction.hpp
index 56233ff6e399..c1d797019bb8 100644
--- a/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/src/traction.hpp
+++ b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/src/traction.hpp
@@ -44,7 +44,7 @@ class Traction {
   Traction(void) {};
 
   Traction(int dim) : dim_(dim) {
-    assert(dim > 3 || dim < 2);
+    assert(dim <= 3 && dim >= 2);
     if (dim==2) {
       offset_ = 0;
       sidesets_.push_back(2);
diff --git a/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/src/tractionK.hpp b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/src/tractionK.hpp
new file mode 100644
index 000000000000..85cef205cd55
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/src/tractionK.hpp
@@ -0,0 +1,185 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  traction.hpp
+    \brief Implements traction loads for the structural topology
+           optimization problem.
+*/
+
+#ifndef ROL_PDEOPT_ELASTICITY_TRACTIONK_HPP
+#define ROL_PDEOPT_ELASTICITY_TRACTIONK_HPP
+
+#include <vector>
+#include "ROL_Ptr.hpp"
+#include "ROL_ParameterList.hpp"
+#include "../../../../TOOLS/feK.hpp"
+
+template<class Real, class DeviceType>
+class Traction {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  std::vector<int> sidesets_, sideids_;
+  std::vector<Real> loadMagnitude_, loadPolar_, loadAzimuth_;
+  std::vector<std::vector<scalar_view>> bdryCellNodes_;
+  std::vector<std::vector<std::vector<int>>> bdryCellLocIds_;
+  int dim_, offset_;
+
+protected:
+  Real DegreesToRadians(Real deg) const {
+    return deg * static_cast<Real>(M_PI) / static_cast<Real>(180);
+  }
+
+public:
+  virtual ~Traction() {}
+
+  Traction(void) {};
+
+  Traction(int dim) : dim_(dim) {
+    assert(dim <= 3 && dim >= 2);
+    if (dim==2) {
+      offset_ = 0;
+      sidesets_.push_back(2);
+      sideids_.push_back(0);
+      loadMagnitude_.push_back(static_cast<Real>(1));
+      loadPolar_.push_back(static_cast<Real>(270));
+      loadAzimuth_.push_back(static_cast<Real>(0));
+    }
+    else {
+      offset_ = 0;
+      sidesets_.push_back(2);
+      sideids_.push_back(1);
+      loadMagnitude_.push_back(static_cast<Real>(1));
+      loadPolar_.push_back(static_cast<Real>(270));
+      loadAzimuth_.push_back(static_cast<Real>(90));
+    }
+
+    int polarSize = loadPolar_.size();
+    for (int i=0; i<polarSize; ++i)
+      loadPolar_[i] = DegreesToRadians(loadPolar_[i]);
+
+    int azimuthSize = loadAzimuth_.size();
+    for (int i=0; i<azimuthSize; ++i)
+      loadAzimuth_[i] = DegreesToRadians(loadAzimuth_[i]);
+  }
+
+  bool isNull(void) const {
+    return (sidesets_.size()==0);
+  }
+
+  void setCellNodes(const std::vector<std::vector<scalar_view>> &bdryCellNodes,
+                    const std::vector<std::vector<std::vector<int>>> &bdryCellLocIds) {
+    bdryCellNodes_  = bdryCellNodes;
+    bdryCellLocIds_ = bdryCellLocIds;
+  }
+
+  virtual Real evaluate(int dir, int sideset, const std::vector<Real> &param) const {
+    const int paramSize = param.size();
+    Real val(0), loadMagNoise(0), loadAngNoise0(0);
+    if (paramSize > 0 + dim_*sideset)
+      loadMagNoise  = param[offset_ + 0 + dim_*sideset];
+    if (paramSize > 1 + dim_*sideset)
+      loadAngNoise0 = DegreesToRadians(param[offset_ + 1 + dim_*sideset]);
+    const Real loadMagnitude = loadMagnitude_[sideset] + loadMagNoise;
+    const Real loadAngle0    = loadPolar_[sideset]     + loadAngNoise0;
+
+    if (dim_==2) {
+      if (dir==0)
+        val = loadMagnitude*std::cos(loadAngle0);
+      if (dir==1)
+        val = loadMagnitude*std::sin(loadAngle0);
+    }
+    if (dim_==3) {
+      Real loadAngNoise1(0);
+      if (paramSize > 2 + dim_*sideset)
+        loadAngNoise1 = DegreesToRadians(param[offset_ + 2 + dim_*sideset]);
+      const Real loadAngle1 = loadAzimuth_[sideset] + loadAngNoise1;
+
+      if (dir==0)
+        val = loadMagnitude*std::sin(loadAngle0)*std::cos(loadAngle1);
+      if (dir==1)
+        val = loadMagnitude*std::sin(loadAngle0)*std::sin(loadAngle1);
+      if (dir==2)
+        val = loadMagnitude*std::cos(loadAngle0);
+    }
+    return val;
+  }
+
+  void compute(std::vector<std::vector<std::vector<scalar_view>>> &traction,
+               const std::vector<std::vector<ROL::Ptr<fe_type>>> &fe,
+               const std::vector<Real> &param,
+               Real scale = Real(1)) const {
+    traction.clear();
+    traction.resize(bdryCellLocIds_.size());
+    const int numSideSets = sidesets_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        const int numLocalSideIds = bdryCellLocIds_[sidesets_[i]].size();
+        traction[sidesets_[i]].resize(numLocalSideIds);
+        for (int j = 0; j < numLocalSideIds; ++j) {
+          const int numCellsSide = bdryCellLocIds_[sidesets_[i]][j].size();
+          if (numCellsSide > 0) {
+            const int numCubPerSide = fe[sidesets_[i]][j]->cubPts().extent_int(1);
+            traction[sidesets_[i]][j].resize(dim_);
+            for (int k = 0; k < dim_; ++k) {
+              traction[sidesets_[i]][j][k] = scalar_view("traction", numCellsSide, numCubPerSide);
+              for (int c = 0; c < numCellsSide; ++c) {
+                for (int p = 0; p < numCubPerSide; ++p)
+                  (traction[sidesets_[i]][j][k])(c,p) = scale*evaluate(k,i,param);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  void apply(std::vector<scalar_view> &R,
+             const std::vector<std::vector<ROL::Ptr<fe_type>>> &fe,
+             const std::vector<Real> &param,
+             Real scale = Real(1)) const {
+    const int numSideSets = sidesets_.size();
+    const int nf          = R[0].extent_int(1);
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        const int numLocalSideIds = bdryCellLocIds_[sidesets_[i]].size();
+        for (int j = 0; j < numLocalSideIds; ++j) {
+          const int numCellsSide = bdryCellLocIds_[sidesets_[i]][j].size();
+          if (numCellsSide > 0) {
+            const int numCubPerSide = fe[sidesets_[i]][j]->cubPts().extent_int(1);
+            for (int k = 0; k < dim_; ++k) {
+              scalar_view traction("traction", numCellsSide, numCubPerSide);
+              for (int c = 0; c < numCellsSide; ++c) {
+                for (int p = 0; p < numCubPerSide; ++p)
+                  traction(c,p) = scale*evaluate(k,i,param);
+              }
+              scalar_view tractionResidual("tractionResidual", numCellsSide, nf);
+              fst::integrate(tractionResidual, traction, fe[sidesets_[i]][j]->NdetJ(), false);
+              // Add traction residual to volume residual
+              for (int l = 0; l < numCellsSide; ++l) {
+                int cidx = bdryCellLocIds_[sidesets_[i]][j][l];
+                for (int m = 0; m < nf; ++m) {
+                  (R[k])(cidx,m) += tractionResidual(l,m);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/src/volume_conK.hpp b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/src/volume_conK.hpp
new file mode 100644
index 000000000000..4b6901e2a739
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/elasticity/src/volume_conK.hpp
@@ -0,0 +1,138 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+#ifndef TOPOPT_VOLUME_CONK_H
+#define TOPOPT_VOLUME_CONK_H
+
+#include "ROL_Constraint.hpp"
+#include "ROL_SingletonVector.hpp"
+#include "../../../../TOOLS/qoiK.hpp"
+#include "../../../../TOOLS/assemblerK.hpp"
+#include "../../../../TOOLS/pdevectorK.hpp"
+
+template <class Real, class DeviceType>
+class TopOptVolumeConstraint : public ROL::Constraint<Real> {
+private:
+  // Elasticity PDE
+  const ROL::Ptr<QoI<Real,DeviceType>> qoi_;
+  const ROL::Ptr<Assembler<Real,DeviceType>> assembler_;
+  bool assemble_;
+  ROL::Ptr<Tpetra::MultiVector<>> controlZero_;
+  ROL::Ptr<ROL::Vector<Real>> V_, Vdual_;
+  Real V0_;
+
+  unsigned nupda_;
+  unsigned nfval_;
+  unsigned njaco_;
+  unsigned najac_;
+  unsigned nhess_;
+  unsigned nasse_;
+
+public:
+  TopOptVolumeConstraint(const ROL::Ptr<QoI<Real,DeviceType>> &qoi,
+                         const ROL::Ptr<Assembler<Real,DeviceType>> &assembler,
+                         const ROL::Ptr<const ROL::Vector<Real>> &z)
+    : qoi_(qoi), assembler_(assembler), assemble_(true),
+      nupda_(0), nfval_(0), njaco_(0), najac_(0), nhess_(0), nasse_(0) {
+    controlZero_ = assembler_->createControlVector(); controlZero_->putScalar(0.0);
+    V_     = z->dual().clone();
+    Vdual_ = z->clone();
+    assemble();
+  }
+
+  void update( const ROL::Vector<Real> &z, bool flag = true, int iter = -1 ) {
+    nupda_++;
+  }
+
+  void value( ROL::Vector<Real> &c, const ROL::Vector<Real> &z, Real &tol ) {
+    nfval_++;
+    dynamic_cast<ROL::SingletonVector<Real>&>(c).setValue(V0_ + Vdual_->dot(z));
+  }
+
+  void applyJacobian(ROL::Vector<Real> &jv, const ROL::Vector<Real> &v,
+                     const ROL::Vector<Real> &z, Real &tol) {
+    njaco_++;
+    dynamic_cast<ROL::SingletonVector<Real>&>(jv).setValue(Vdual_->dot(v));
+  }
+
+  void applyAdjointJacobian( ROL::Vector<Real> &jv, const ROL::Vector<Real> &v, const ROL::Vector<Real> &z, Real &tol ) {
+    najac_++;
+    Real vval = dynamic_cast<const ROL::SingletonVector<Real>&>(v).getValue();
+    jv.set(*V_);
+    jv.scale(vval);
+  }
+
+  void applyAdjointHessian( ROL::Vector<Real> &ahwv, const ROL::Vector<Real> &w, const ROL::Vector<Real> &v, const ROL::Vector<Real> &z, Real &tol ) {
+    nhess_++;
+    ahwv.zero();
+  }
+
+  void summarize(std::ostream &stream) const {
+    stream << std::endl;
+    stream << std::string(114,'=') << std::endl;
+    stream << "  Volume_Constraint::summarize" << std::endl;
+    stream << "    Number of calls to update:               " << nupda_ << std::endl;
+    stream << "    Number of calls to value:                " << nfval_ << std::endl;
+    stream << "    Number of calls to applyJacobian:        " << njaco_ << std::endl;
+    stream << "    Number of calls to applyAdjointJacobian: " << najac_ << std::endl;
+    stream << "    Number of calls to applyAdjointHessian:  " << nhess_ << std::endl;
+    stream << "    Number of assemblies:                    " << nasse_ << std::endl;
+    stream << std::string(114,'=') << std::endl;
+    stream << std::endl;
+  }
+
+  void reset(void) {
+    nupda_ = 0; nfval_ = 0; njaco_ = 0; najac_ = 0; nhess_ = 0; nasse_ = 0;
+  }
+
+// For parametrized (stochastic) objective functions and constraints
+public:
+  void setParameter(const std::vector<Real> &param) {
+    ROL::Constraint<Real>::setParameter(param);
+    qoi_->setParameter(param);
+  }
+
+private:
+
+  ROL::Ptr<Tpetra::MultiVector<>> getField(ROL::Vector<Real> &x) const {
+    ROL::Ptr<Tpetra::MultiVector<>> xp;
+    try {
+      xp = dynamic_cast<ROL::TpetraMultiVector<Real>&>(x).getVector();
+    }
+    catch (std::exception &e) {
+      try {
+        ROL::Ptr<ROL::TpetraMultiVector<Real>> xvec
+          = dynamic_cast<PDE_OptVector<Real>&>(x).getField();
+        if ( xvec == ROL::nullPtr ) {
+          xp = ROL::nullPtr;
+        }
+        else {
+          xp = xvec->getVector();
+        }
+      }
+      catch (std::exception &ee) {
+        xp = ROL::nullPtr;
+      }
+    }
+    return xp;
+  }
+
+  void assemble(void) {
+    nasse_++;
+    if (assemble_) {
+      ROL::Ptr<Tpetra::MultiVector<>> V = getField(*V_);
+      V0_ = assembler_->assembleQoIValue(qoi_,ROL::nullPtr,controlZero_);
+      assembler_->assembleQoIGradient2(V,qoi_,ROL::nullPtr,controlZero_);
+      Vdual_->set(V_->dual());
+      assemble_ = false;
+    }
+  }
+}; // class Volume_Constraint
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/lasso/CMakeLists.txt b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/lasso/CMakeLists.txt
index 697d7680f966..42dc3d56c9b6 100644
--- a/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/lasso/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/lasso/CMakeLists.txt
@@ -11,13 +11,13 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_01
     SOURCES example_01.cpp
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     TRSPG_LassoDataCopy
     SOURCE_FILES
       lasso-data.txt input.xml
diff --git a/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/lasso/example_01.cpp b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/lasso/example_01.cpp
index 685d4c9d4246..91e3102a6320 100644
--- a/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/lasso/example_01.cpp
+++ b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/lasso/example_01.cpp
@@ -20,8 +20,7 @@
 #include "ROL_BLAS.hpp"
 
 #include "Teuchos_Time.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 #include <fstream>
@@ -157,7 +156,7 @@ class LassoConstraint : public ROL::StdConstraint<Real> {
 };
 
 int main(int argc, char *argv[]) {
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/navier-stokes/CMakeLists.txt b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/navier-stokes/CMakeLists.txt
index 0a45a636195a..64c14c53c478 100644
--- a/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/navier-stokes/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/navier-stokes/CMakeLists.txt
@@ -11,13 +11,13 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_01
     SOURCES example_01.cpp
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     TRSPG_TopOptNavierStokesDataCopy
     SOURCE_FILES
       input.xml plotresults.m mesh.jou
@@ -25,5 +25,32 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
   )
 
+ENDIF()
+
+IF(${PROJECT_NAME}_ENABLE_Intrepid2 AND
+   ${PROJECT_NAME}_ENABLE_Ifpack2   AND
+   ${PROJECT_NAME}_ENABLE_MueLu     AND
+   ${PROJECT_NAME}_ENABLE_Amesos2   AND
+   ${PROJECT_NAME}_ENABLE_Tpetra )
+
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/teuchos/src/sol)
+
+  # Need ROL_TpetraMultiVector.hpp
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
+
+  ROL_ADD_EXECUTABLE(
+    example_01_Kokkos
+    SOURCES example_01K.cpp
+    ADD_DIR_TO_NAME
+  )
+
+  ROL_COPY_FILES_TO_BINARY_DIR(
+    TRSPG_TopOptNavierStokesDataCopyK
+    SOURCE_FILES
+      input.xml plotresults.m mesh.jou
+    SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}"
+    DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
+  )
 
 ENDIF()
diff --git a/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/navier-stokes/example_01.cpp b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/navier-stokes/example_01.cpp
index 183d5f4e5b43..423d784e6246 100644
--- a/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/navier-stokes/example_01.cpp
+++ b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/navier-stokes/example_01.cpp
@@ -14,8 +14,7 @@
 #include "Teuchos_Comm.hpp"
 #include "Teuchos_Time.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -48,7 +47,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int>> comm
     = Tpetra::getDefaultComm();
   const int myRank = comm->getRank();
diff --git a/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/navier-stokes/example_01K.cpp b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/navier-stokes/example_01K.cpp
new file mode 100644
index 000000000000..b73ecb75f7ce
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/navier-stokes/example_01K.cpp
@@ -0,0 +1,264 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  example_01.cpp
+    \brief Shows how to solve the Navier-Stokes control problem.
+*/
+
+#include "Teuchos_Comm.hpp"
+#include "Teuchos_Time.hpp"
+#include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
+
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Version.hpp"
+
+#include <iostream>
+#include <algorithm>
+
+#include "ROL_TpetraMultiVector.hpp"
+#include "ROL_Reduced_Objective_SimOpt.hpp"
+#include "ROL_Bounds.hpp"
+#include "ROL_Solver.hpp"
+#include "ROL_SingletonVector.hpp"
+#include "ROL_ConstraintFromObjective.hpp"
+
+#include "../../../TOOLS/meshreaderK.hpp"
+#include "../../../TOOLS/pdeconstraintK.hpp"
+#include "../../../TOOLS/pdeobjectiveK.hpp"
+#include "../../../TOOLS/pdevectorK.hpp"
+#include "../../../TOOLS/integralconstraintK.hpp"
+
+#include "pde_navier-stokesK.hpp"
+#include "obj_navier-stokesK.hpp"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+
+int main(int argc, char *argv[]) {
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+
+  /*** Initialize communicator. ***/
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  Kokkos::ScopeGuard kokkosScope (argc, argv);
+  auto comm = Tpetra::getDefaultComm();
+  const int myRank = comm->getRank();
+  if ((iprint > 0) && (myRank == 0))
+    outStream = ROL::makePtrFromRef(std::cout);
+  else
+    outStream = ROL::makePtrFromRef(bhs);
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+
+    /*** Read in XML input ***/
+    std::string filename = "input.xml";
+    auto parlist = ROL::getParametersFromXmlFile(filename);
+
+    bool output = parlist->sublist("SimOpt").sublist("Solve").get("Output Iteration History",false);
+    output = (iprint > 0) && (myRank==0) && output;
+    parlist->sublist("SimOpt").sublist("Solve").set("Output Iteration History",output);
+
+    /*** Initialize main data structure. ***/
+    auto meshMgr = ROL::makePtr<MeshReader<RealT,DeviceT>>(*parlist);
+    // Initialize PDE describing Navier-Stokes equations.
+    auto pde = ROL::makePtr<PDE_NavierStokes<RealT,DeviceT>>(*parlist);
+    auto con = ROL::makePtr<PDE_Constraint<RealT,DeviceT>>(pde,meshMgr,comm,*parlist,*outStream);
+    // Cast the constraint and get the assembler.
+    auto assembler = con->getAssembler();
+    con->setSolveParameters(*parlist);
+
+    // Create state vector and set to zeroes
+    auto u_ptr = assembler->createStateVector();    u_ptr->randomize();
+    auto p_ptr = assembler->createStateVector();    p_ptr->randomize();
+    auto z_ptr = assembler->createControlVector();  z_ptr->randomize();
+    auto r_ptr = assembler->createResidualVector(); r_ptr->putScalar(0.0);
+    auto up = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(u_ptr,pde,assembler,*parlist);
+    auto pp = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(p_ptr,pde,assembler,*parlist);
+    auto zp = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(z_ptr,pde,assembler,*parlist);
+    auto rp = ROL::makePtr<PDE_DualSimVector<RealT,DeviceT>>(r_ptr,pde,assembler,*parlist);
+    auto xp = ROL::makePtr<ROL::Vector_SimOpt<RealT>>(up,zp);
+
+    // Initialize quadratic objective function.
+    auto qoi = ROL::makePtr<QoI_Power_NavierStokes<RealT,DeviceT>>(*parlist,
+                                                                   pde->getVelocityFE(),
+                                                                   pde->getPressureFE(),
+                                                                   pde->getControlFE(),
+                                                                   pde->getStateFieldInfo(),
+                                                                   pde->getControlFieldInfo());
+    auto obj = ROL::makePtr<PDE_Objective<RealT,DeviceT>>(qoi,assembler);
+    auto robj = ROL::makePtr<ROL::Reduced_Objective_SimOpt<RealT>>(obj,con,up,zp,pp,true,false);
+
+    // Initialize volume constraint.
+    RealT volFraction = parlist->sublist("Problem").get("Volume Fraction",0.5);
+    auto qoi_vol = ROL::makePtr<QoI_Volume_NavierStokes<RealT,DeviceT>>(*parlist,
+                                                                        pde->getControlFE(),
+                                                                        pde->getControlFieldInfo());
+    auto obj_vol = ROL::makePtr<IntegralOptObjective<RealT,DeviceT>>(qoi_vol,assembler);
+    auto icon = ROL::makePtr<ROL::ConstraintFromObjective<RealT>>(obj_vol);
+    auto imul = ROL::makePtr<ROL::SingletonVector<RealT>>(0.0);
+
+    // Build bound constraint
+    auto lp = zp->clone(); lp->setScalar(0.0);
+    auto hp = zp->clone(); hp->setScalar(1.0);
+    auto bnd = ROL::makePtr<ROL::Bounds<RealT>>(lp, hp);
+    // Build optimization problem
+    auto optProb = ROL::makePtr<ROL::Problem<RealT>>(robj, zp);
+    optProb->addBoundConstraint(bnd);
+    optProb->addLinearConstraint("Volume",icon,imul);
+    optProb->setProjectionAlgorithm(*parlist);
+
+    // Run derivative checks
+    bool checkDeriv = parlist->sublist("Problem").get("Check derivatives",false);
+    if ( checkDeriv ) {
+      auto dup = up->clone(); dup->randomize();
+      auto dzp = zp->clone(); dzp->randomize();
+      con->checkApplyJacobian_1(*up,*zp,*dup,*up,true,*outStream);
+      con->checkApplyJacobian_2(*up,*zp,*dzp,*up,true,*outStream);
+      con->checkInverseJacobian_1(*up,*up,*up,*zp,true,*outStream);
+      con->checkInverseAdjointJacobian_1(*up,*up,*up,*zp,true,*outStream);
+      con->checkApplyAdjointHessian_11(*up,*zp,*pp,*dup,*up,true,*outStream);
+      con->checkApplyAdjointHessian_21(*up,*zp,*pp,*dzp,*up,true,*outStream);
+      con->checkApplyAdjointHessian_12(*up,*zp,*pp,*dup,*zp,true,*outStream);
+      con->checkApplyAdjointHessian_22(*up,*zp,*pp,*dzp,*zp,true,*outStream);
+      obj->checkGradient_1(*up,*zp,*dup,true,*outStream);
+      obj->checkGradient_2(*up,*zp,*dzp,true,*outStream);
+      obj->checkHessVec_11(*up,*zp,*dup,true,*outStream);
+      obj->checkHessVec_12(*up,*zp,*dzp,true,*outStream);
+      obj->checkHessVec_21(*up,*zp,*dup,true,*outStream);
+      obj->checkHessVec_22(*up,*zp,*dzp,true,*outStream);
+      icon->checkApplyJacobian(*zp,*dzp,*imul,true,*outStream);
+      icon->checkApplyAdjointHessian(*zp,*imul,*dzp,*zp,true,*outStream);
+      robj->checkGradient(*zp,*dzp,true,*outStream);
+      robj->checkHessVec(*zp,*dzp,true,*outStream);
+      //optProb->check(*outStream);
+    }
+
+    // Solve optimization problem
+    ROL::Ptr<ROL::Solver<RealT>> solver;
+    std::vector<RealT> vals(6);
+
+    // SPG
+    up->zero(); pp->zero(); imul->zero();
+    zp->setScalar(volFraction);
+    optProb->finalize(false,true,*outStream);
+    parlist->sublist("Step").set("Type","Spectral Gradient");
+    Teuchos::Time spgTimer("SPG Time", true);
+    solver = ROL::makePtr<ROL::Solver<RealT>>(optProb,*parlist);
+    solver->solve(*outStream);
+    spgTimer.stop();
+    *outStream << "Total optimization time = " << spgTimer.totalElapsedTime() << " seconds.\n";
+    vals[0] = solver->getAlgorithmState()->value;
+
+    // PQN
+    up->zero(); pp->zero(); imul->zero();
+    zp->setScalar(volFraction);
+    parlist->sublist("Step").set("Type","Line Search");
+    parlist->sublist("Step").sublist("Line Search").sublist("Descent Method").set("Type","Quasi-Newton Method");
+    Teuchos::Time pqnTimer("PQN Time", true);
+    solver = ROL::makePtr<ROL::Solver<RealT>>(optProb,*parlist);
+    solver->solve(*outStream);
+    pqnTimer.stop();
+    *outStream << "Total optimization time = " << pqnTimer.totalElapsedTime() << " seconds.\n";
+    vals[1] = solver->getAlgorithmState()->value;
+
+    // LMTR
+    up->zero(); pp->zero(); imul->zero();
+    zp->setScalar(volFraction);
+    parlist->sublist("Step").set("Type","Trust Region");
+    parlist->sublist("Step").sublist("Trust Region").set("Subproblem Model","Lin-More");
+    Teuchos::Time lmtrTimer("LMTR Time", true);
+    solver = ROL::makePtr<ROL::Solver<RealT>>(optProb,*parlist);
+    solver->solve(*outStream);
+    lmtrTimer.stop();
+    *outStream << "Total optimization time = " << lmtrTimer.totalElapsedTime() << " seconds.\n";
+    vals[2] = solver->getAlgorithmState()->value;
+
+    // TRSPG
+    up->zero(); pp->zero(); imul->zero();
+    zp->setScalar(volFraction);
+    parlist->sublist("Step").set("Type","Trust Region");
+    parlist->sublist("Step").sublist("Trust Region").set("Subproblem Model","SPG");
+    Teuchos::Time trspgTimer("TRSPG Time", true);
+    solver = ROL::makePtr<ROL::Solver<RealT>>(optProb,*parlist);
+    solver->solve(*outStream);
+    trspgTimer.stop();
+    *outStream << "Total optimization time = " << trspgTimer.totalElapsedTime() << " seconds.\n";
+    vals[3] = solver->getAlgorithmState()->value;
+
+    // AL-LMTR
+    up->zero(); pp->zero(); imul->zero();
+    zp->setScalar(volFraction);
+    optProb->edit();
+    optProb->finalize(true,true,*outStream);
+    parlist->sublist("Step").set("Type","Augmented Lagrangian");
+    parlist->sublist("Step").sublist("Augmented Lagrangian").set("Subproblem Step Type","Trust Region");
+    parlist->sublist("Step").sublist("Trust Region").set("Subproblem Model","Lin-More");
+    Teuchos::Time allmtrTimer("AL-LMTR Time", true);
+    solver = ROL::makePtr<ROL::Solver<RealT>>(optProb,*parlist);
+    solver->solve(*outStream);
+    allmtrTimer.stop();
+    *outStream << "Total optimization time = " << allmtrTimer.totalElapsedTime() << " seconds.\n";
+    vals[4] = solver->getAlgorithmState()->value;
+
+    // AL-LMTR
+    up->zero(); pp->zero(); imul->zero();
+    zp->setScalar(volFraction);
+    parlist->sublist("Step").set("Type","Augmented Lagrangian");
+    parlist->sublist("Step").sublist("Augmented Lagrangian").set("Subproblem Step Type","Trust Region");
+    parlist->sublist("Step").sublist("Trust Region").set("Subproblem Model","SPG");
+    Teuchos::Time altrspgTimer("AL-TRSPG Time", true);
+    solver = ROL::makePtr<ROL::Solver<RealT>>(optProb,*parlist);
+    solver->solve(*outStream);
+    altrspgTimer.stop();
+    *outStream << "Total optimization time = " << altrspgTimer.totalElapsedTime() << " seconds.\n";
+    vals[5] = solver->getAlgorithmState()->value;
+
+    RealT minval = std::min(vals[0],std::min(vals[1],std::min(vals[2],vals[3])));
+    *outStream << std::scientific << std::setprecision(16);
+    *outStream << "SPG      " << (vals[0]-minval)/minval << std::endl;
+    *outStream << "PQN      " << (vals[1]-minval)/minval << std::endl;
+    *outStream << "LMTR     " << (vals[2]-minval)/minval << std::endl;
+    *outStream << "TRSPG    " << (vals[3]-minval)/minval << std::endl;
+    *outStream << "AL-LMTR  " << (vals[4]-minval)/minval << std::endl;
+    *outStream << "AL-TRSPG " << (vals[5]-minval)/minval << std::endl;
+    *outStream << std::endl;
+
+    // Output
+    con->outputTpetraVector(z_ptr,"control.txt");
+    assembler->printMeshData(*outStream);
+    RealT tol(1.e-8);
+    Teuchos::Array<RealT> res(1,0);
+    con->solve(*rp,*up,*zp,tol);
+    con->outputTpetraVector(u_ptr,"state.txt");
+    con->value(*rp,*up,*zp,tol);
+    r_ptr->norm2(res.view(0,1));
+    *outStream << "Residual Norm: " << res[0] << std::endl;
+    errorFlag += (res[0] > 1.e-6 ? 1 : 0);
+    //con->outputTpetraData();
+
+    // Get a summary from the time monitor.
+    Teuchos::TimeMonitor::summarize();
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/navier-stokes/impermiabilityK.hpp b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/navier-stokes/impermiabilityK.hpp
new file mode 100644
index 000000000000..46ddc9db852b
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/navier-stokes/impermiabilityK.hpp
@@ -0,0 +1,62 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+#ifndef IMPERMIABILITYK_HPP
+#define IMPERMIABILITYK_HPP
+
+#include <vector>
+#include "ROL_ParameterList.hpp"
+#include "ROL_Ptr.hpp"
+#include "../../../TOOLS/feK.hpp"
+
+template<class Real, class DeviceType>
+class Impermiability {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+
+private:
+  Real alphaLo_, alphaHi_, q_;
+
+  Real value(Real z) const {
+    const Real one(1);
+    return alphaHi_ + (alphaLo_-alphaHi_)*z*(one+q_)/(q_+z);
+  }
+
+  Real deriv1(Real z) const {
+    const Real one(1), two(2);
+    return (alphaLo_-alphaHi_)*q_*(q_+one)/std::pow(q_+z,two);
+  }
+
+  Real deriv2(Real z) const {
+    const Real one(1), two(2), three(3);
+    return -(alphaLo_-alphaHi_)*two*q_*(q_+one)/std::pow(q_+z,three);
+  }
+
+public:
+  Impermiability(ROL::ParameterList &list) {
+    alphaLo_ = list.sublist("Problem").get("Minimum Impermiability", 0.0);
+    alphaHi_ = list.sublist("Problem").get("Maximum Impermiability", 1e3);
+    q_       = list.sublist("Problem").get("RAMP Parameter", 1e5);
+  }
+
+  void compute(scalar_view &alpha, const scalar_view z, const scalar_view pts, int deriv=0) const {
+    const int c = pts.extent_int(0);
+    const int p = pts.extent_int(1);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        // Compute spatially dependent viscosity
+        if (deriv==1)      alpha(i,j) = deriv1(z(i,j));
+        else if (deriv==2) alpha(i,j) = deriv2(z(i,j));
+        else               alpha(i,j) = value(z(i,j));
+      }
+    }
+  }
+};
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/navier-stokes/obj_navier-stokesK.hpp b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/navier-stokes/obj_navier-stokesK.hpp
new file mode 100644
index 000000000000..a134059f49f0
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/navier-stokes/obj_navier-stokesK.hpp
@@ -0,0 +1,473 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  obj.hpp
+    \brief Provides the interface for local (cell-based) objective function computations.
+*/
+
+#ifndef PDEOPT_QOI_NAVIERSTOKESK_HPP
+#define PDEOPT_QOI_NAVIERSTOKESK_HPP
+
+#include "../../../TOOLS/qoiK.hpp"
+#include "pde_navier-stokesK.hpp"
+#include "impermiabilityK.hpp"
+
+template <class Real, class DeviceType>
+class QoI_Power_NavierStokes : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type> feVel_, fePrs_, feCtrl_;
+  const ROL::Ptr<const FieldUtils::FieldInfo> fieldInfo_, fieldInfoCtrl_;
+
+  ROL::Ptr<Impermiability<Real,DeviceType>> imp_;
+  Real viscosity_;
+
+public:
+  QoI_Power_NavierStokes(ROL::ParameterList                          &list,
+                         const ROL::Ptr<fe_type>                     &feVel,
+                         const ROL::Ptr<fe_type>                     &fePrs,
+                         const ROL::Ptr<fe_type>                     &feCtrl,
+                         const ROL::Ptr<const FieldUtils::FieldInfo> &fieldInfo,
+                         const ROL::Ptr<const FieldUtils::FieldInfo> &fieldInfoCtrl)
+    : feVel_(feVel), fePrs_(fePrs), feCtrl_(feCtrl), fieldInfo_(fieldInfo), fieldInfoCtrl_(fieldInfoCtrl) {
+    viscosity_ = list.sublist("Problem").get("Viscosity",5e-3);
+    imp_ = ROL::makePtr<Impermiability<Real,DeviceType>>(list);
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c = feVel_->gradN().extent_int(0);
+    const int p = feVel_->gradN().extent_int(2);
+    const int d = feVel_->gradN().extent_int(3);
+    // Initialize output val
+    val = scalar_view("val", c);
+    // Get components of the control
+    std::vector<scalar_view> U, Z;
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoCtrl_);
+    // Evaluate on FE basis
+    scalar_view Zval("Zval", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view Uval("Uval", c, p);
+    scalar_view aUval("aUval", c, p);
+    scalar_view Ugrad("Ugrad", c, p, d);
+    scalar_view nUgrad("nUgrad", c, p, d);
+    feCtrl_->evaluateValue(Zval, Z[0]);
+    imp_->compute(alpha, Zval, feVel_->cubPts(), 0); 
+    for (int i = 0; i < d; ++i) {
+      Kokkos::deep_copy(Uval,static_cast<Real>(0));
+      Kokkos::deep_copy(Ugrad,static_cast<Real>(0));
+      Kokkos::deep_copy(aUval,static_cast<Real>(0));
+      Kokkos::deep_copy(nUgrad,static_cast<Real>(0));
+      // Evaluate on FE basis
+      feVel_->evaluateValue(Uval, U[i]);
+      feVel_->evaluateGradient(Ugrad, U[i]);
+      // Scale
+      fst::scalarMultiplyDataData(aUval, alpha, Uval);
+      rst::scale(nUgrad, Ugrad, viscosity_);
+      // Integrate
+      feVel_->computeIntegral(val, nUgrad, Ugrad, true);
+      feVel_->computeIntegral(val, aUval, Uval, true);
+    }
+    rst::scale(val, static_cast<Real>(0.5));
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fv = feVel_->gradN().extent_int(1);
+    const int fp = fePrs_->gradN().extent_int(1);
+    const int p  = feVel_->gradN().extent_int(2);
+    const int d  = feVel_->gradN().extent_int(3);
+    // Initialize output grad
+    std::vector<scalar_view> G;
+    G.resize(fieldInfo_->numFields);
+    for (int i = 0; i < d; ++i)
+      G[i] = scalar_view("grad", c, fv);
+    G[d] = scalar_view("grad", c, fp);
+    // Get components of the control
+    std::vector<scalar_view> U, Z;
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoCtrl_);
+    // Create storage
+    scalar_view Zval("Zval", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view Uval("Uval", c, p);
+    scalar_view aUval("aUval", c, p);
+    scalar_view Ugrad("Ugrad", c, p, d);
+    feCtrl_->evaluateValue(Zval, Z[0]);
+    imp_->compute(alpha, Zval, feVel_->cubPts(), 0); 
+    for (int i = 0; i < d; ++i) {
+      Kokkos::deep_copy(Uval,static_cast<Real>(0));
+      Kokkos::deep_copy(Ugrad,static_cast<Real>(0));
+      Kokkos::deep_copy(aUval,static_cast<Real>(0));
+      // Evaluate on FE basis
+      feVel_->evaluateValue(Uval, U[i]);
+      feVel_->evaluateGradient(Ugrad, U[i]);
+      // Scale
+      fst::scalarMultiplyDataData(aUval, alpha, Uval);
+      rst::scale(Ugrad, viscosity_);
+      // Integrate
+      fst::integrate(G[i],Ugrad,feVel_->gradNdetJ(),false);
+      fst::integrate(G[i],aUval,feVel_->NdetJ(),true);
+    }
+    FieldUtils::combineFieldCoeff<Real>(grad, G, fieldInfo_);
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int p  = feVel_->gradN().extent_int(2);
+    const int d  = feVel_->gradN().extent_int(3);
+    // Initialize output grad
+    std::vector<scalar_view> G;
+    G.resize(fieldInfoCtrl_->numFields);
+    G[0] = scalar_view("grad", c, fc);
+    // Get components of the control
+    std::vector<scalar_view> U, Z;
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoCtrl_);
+    // Create storage
+    scalar_view Zval("Zval", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view Uval("Uval", c, p);
+    scalar_view aUval("aUval", c, p);
+    scalar_view aU2val("aU2val", c, p);
+    feCtrl_->evaluateValue(Zval, Z[0]);
+    imp_->compute(alpha, Zval, feVel_->cubPts(), 1); 
+    for (int i = 0; i < d; ++i) {
+      Kokkos::deep_copy(Uval,static_cast<Real>(0));
+      Kokkos::deep_copy(aUval,static_cast<Real>(0));
+      Kokkos::deep_copy(aU2val,static_cast<Real>(0));
+      // Evaluate on FE basis
+      feVel_->evaluateValue(Uval, U[i]);
+      // Scale
+      fst::scalarMultiplyDataData(aUval, alpha, Uval);
+      fst::scalarMultiplyDataData(aU2val, aUval, Uval);
+      // Integrate
+      fst::integrate(G[0],aU2val,feCtrl_->NdetJ(),true);
+    }
+    rst::scale(G[0], static_cast<Real>(0.5));
+    FieldUtils::combineFieldCoeff<Real>(grad, G, fieldInfoCtrl_);
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fv = feVel_->gradN().extent_int(1);
+    const int fp = fePrs_->gradN().extent_int(1);
+    const int p  = feVel_->gradN().extent_int(2);
+    const int d  = feVel_->gradN().extent_int(3);
+    // Initialize output grad
+    std::vector<scalar_view> H;
+    H.resize(fieldInfo_->numFields);
+    for (int i = 0; i < d; ++i)
+      H[i] = scalar_view("hess", c, fv);
+    H[d] = scalar_view("hess", c, fp);
+    // Get components of the control
+    std::vector<scalar_view> V, Z;
+    FieldUtils::splitFieldCoeff<Real>(V, v_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoCtrl_);
+    // Create storage
+    scalar_view Zval("Zval", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view Vval("Vval", c, p);
+    scalar_view aVval("aVval", c, p);
+    scalar_view Vgrad("Vgrad", c, p, d);
+    feCtrl_->evaluateValue(Zval, Z[0]);
+    imp_->compute(alpha, Zval, feVel_->cubPts(), 0); 
+    for (int i = 0; i < d; ++i) {
+      Kokkos::deep_copy(Vval,static_cast<Real>(0));
+      Kokkos::deep_copy(Vgrad,static_cast<Real>(0));
+      Kokkos::deep_copy(aVval,static_cast<Real>(0));
+      // Evaluate on FE basis
+      feVel_->evaluateValue(Vval, V[i]);
+      feVel_->evaluateGradient(Vgrad, V[i]);
+      // Scale
+      fst::scalarMultiplyDataData(aVval, alpha, Vval);
+      rst::scale(Vgrad, viscosity_);
+      // Integrate
+      fst::integrate(H[i],Vgrad,feVel_->gradNdetJ(),false);
+      fst::integrate(H[i],aVval,feVel_->NdetJ(),true);
+    }
+    FieldUtils::combineFieldCoeff<Real>(hess, H, fieldInfo_);
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fv = feVel_->gradN().extent_int(1);
+    const int fp = fePrs_->gradN().extent_int(1);
+    const int p  = feVel_->gradN().extent_int(2);
+    const int d  = feVel_->gradN().extent_int(3);
+    // Initialize output grad
+    std::vector<scalar_view> H;
+    H.resize(fieldInfo_->numFields);
+    for (int i = 0; i < d; ++i)
+      H[i] = scalar_view("hess", c, fv);
+    H[d] = scalar_view("hess", c, fp);
+    // Get components of the control
+    std::vector<scalar_view> V, U, Z;
+    FieldUtils::splitFieldCoeff<Real>(V, v_coeff, fieldInfoCtrl_);
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoCtrl_);
+    // Create storage
+    scalar_view Zval("Zval", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view Uval("Uval", c, p);
+    scalar_view aUval("aUval", c, p);
+    scalar_view Vval("Vval", c, p);
+    scalar_view aVval("aVval", c, p);
+    feCtrl_->evaluateValue(Vval, V[0]);
+    feCtrl_->evaluateValue(Zval, Z[0]);
+    imp_->compute(alpha, Zval, feVel_->cubPts(), 1); 
+    fst::scalarMultiplyDataData(aVval, alpha, Vval);
+    for (int i = 0; i < d; ++i) {
+      Kokkos::deep_copy(Uval,static_cast<Real>(0));
+      Kokkos::deep_copy(aUval,static_cast<Real>(0));
+      // Evaluate on FE basis
+      feVel_->evaluateValue(Uval, U[i]);
+      // Scale
+      fst::scalarMultiplyDataData(aUval, aVval, Uval);
+      // Integrate
+      fst::integrate(H[i],aUval,feVel_->NdetJ(),false);
+    }
+    FieldUtils::combineFieldCoeff<Real>(hess, H, fieldInfo_);
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int p  = feVel_->gradN().extent_int(2);
+    const int d  = feVel_->gradN().extent_int(3);
+    // Initialize output grad
+    std::vector<scalar_view> H;
+    H.resize(fieldInfoCtrl_->numFields);
+    H[0] = scalar_view("hess", c, fc);
+    // Get components of the control
+    std::vector<scalar_view> V, U, Z;
+    FieldUtils::splitFieldCoeff<Real>(V, v_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoCtrl_);
+    // Create storage
+    scalar_view Zval("Zval", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view Uval("Uval", c, p);
+    scalar_view aUval("aUval", c, p);
+    scalar_view aUVval("aUVval", c, p);
+    scalar_view Vval("Vval", c, p);
+    feCtrl_->evaluateValue(Zval, Z[0]);
+    imp_->compute(alpha, Zval, feVel_->cubPts(), 1); 
+    for (int i = 0; i < d; ++i) {
+      Kokkos::deep_copy(Uval,static_cast<Real>(0));
+      Kokkos::deep_copy(aUval,static_cast<Real>(0));
+      Kokkos::deep_copy(Vval,static_cast<Real>(0));
+      Kokkos::deep_copy(aUVval,static_cast<Real>(0));
+      // Evaluate on FE basis
+      feVel_->evaluateValue(Vval, V[i]);
+      feVel_->evaluateValue(Uval, U[i]);
+      // Scale
+      fst::scalarMultiplyDataData(aUval, alpha, Uval);
+      fst::scalarMultiplyDataData(aUVval, aUval, Vval);
+      // Integrate
+      fst::integrate(H[0],aUVval,feCtrl_->NdetJ(),true);
+    }
+    FieldUtils::combineFieldCoeff<Real>(hess, H, fieldInfoCtrl_);
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int p  = feVel_->gradN().extent_int(2);
+    const int d  = feVel_->gradN().extent_int(3);
+    // Initialize output grad
+    std::vector<scalar_view> H;
+    H.resize(fieldInfoCtrl_->numFields);
+    H[0] = scalar_view("hess", c, fc);
+    // Get components of the control
+    std::vector<scalar_view> V, U, Z;
+    FieldUtils::splitFieldCoeff<Real>(V, v_coeff, fieldInfoCtrl_);
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoCtrl_);
+    // Create storage
+    scalar_view Zval("Zval", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view Uval("Uval", c, p);
+    scalar_view aUval("aUval", c, p);
+    scalar_view aU2val("aU2val", c, p);
+    scalar_view Vval("Vval", c, p);
+    scalar_view aVval("aVval", c, p);
+    feCtrl_->evaluateValue(Vval, V[0]);
+    feCtrl_->evaluateValue(Zval, Z[0]);
+    imp_->compute(alpha, Zval, feVel_->cubPts(), 2); 
+    fst::scalarMultiplyDataData(aVval, alpha, Vval);
+    for (int i = 0; i < d; ++i) {
+      Kokkos::deep_copy(Uval,static_cast<Real>(0));
+      Kokkos::deep_copy(aUval,static_cast<Real>(0));
+      Kokkos::deep_copy(aU2val,static_cast<Real>(0));
+      // Evaluate on FE basis
+      feVel_->evaluateValue(Uval, U[i]);
+      // Scale
+      fst::scalarMultiplyDataData(aUval, aVval, Uval);
+      fst::scalarMultiplyDataData(aU2val, aUval, Uval);
+      // Integrate
+      fst::integrate(H[0],aU2val,feCtrl_->NdetJ(),true);
+    }
+    rst::scale(H[0], static_cast<Real>(0.5));
+    FieldUtils::combineFieldCoeff<Real>(hess, H, fieldInfoCtrl_);
+  }
+
+}; // QoI_Power_NavierStokes
+
+template <class Real, class DeviceType>
+class QoI_Volume_NavierStokes : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type> fe_;
+  const ROL::Ptr<const FieldUtils::FieldInfo> fieldInfo_;
+
+  scalar_view weight_;
+  Real volFraction_;
+
+public:
+  QoI_Volume_NavierStokes(ROL::ParameterList &list,
+                          const ROL::Ptr<fe_type> &fe,
+                          const ROL::Ptr<const FieldUtils::FieldInfo> &fieldInfo)
+    : fe_(fe), fieldInfo_(fieldInfo) {
+    volFraction_ = list.sublist("Problem").get("Volume Fraction",0.5);
+    const int c = fe_->cubPts().extent_int(0);
+    const int p = fe_->cubPts().extent_int(1);
+    weight_ = scalar_view("weight", c, p);
+    Kokkos::deep_copy(weight_,static_cast<Real>(1));
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c = fe_->cubPts().extent_int(0);
+    const int p = fe_->cubPts().extent_int(1);
+    // Initialize output val
+    val = scalar_view("val", c);
+    // Get components of the control
+    std::vector<scalar_view> Z;
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfo_);
+    // Evaluate on FE basis
+    scalar_view Z0("Z0", c, p);
+    fe_->evaluateValue(Z0, Z[0]);
+    // Integrate the density minus volume fraction
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) Z0(i,j) -= volFraction_;
+    }
+    fe_->computeIntegral(val,weight_,Z0,true);
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Volume_NavierStokes::gradient_1 is zero.");
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c = fe_->cubPts().extent_int(0);
+    const int f = fe_->N().extent_int(1);
+    // Initialize output grad
+    std::vector<scalar_view> G;
+    G.resize(fieldInfo_->numFields);
+    G[0] = scalar_view("grad", c, f);
+    // Integrate density
+    fst::integrate(G[0],weight_,fe_->NdetJ(),false);
+    FieldUtils::combineFieldCoeff(grad, G, fieldInfo_);
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Volume_NavierStokes::HessVec_11 is zero.");
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Volume_NavierStokes::HessVec_12 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Volume_NavierStokes::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Volume_NavierStokes::HessVec_22 is zero.");
+  }
+
+}; // QoI_Volume_NavierStokes
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/navier-stokes/pde_navier-stokesK.hpp b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/navier-stokes/pde_navier-stokesK.hpp
new file mode 100644
index 000000000000..8944227d73ad
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/navier-stokes/pde_navier-stokesK.hpp
@@ -0,0 +1,928 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  pde_navier-stokes.hpp
+    \brief Implements the local PDE interface for the Navier-Stokes control problem.
+*/
+
+#ifndef PDE_NAVIERSTOKES_HPP
+#define PDE_NAVIERSTOKES_HPP
+
+#include "../../../TOOLS/pdeK.hpp"
+#include "../../../TOOLS/feK.hpp"
+#include "../../../TOOLS/fieldhelperK.hpp"
+#include "impermiabilityK.hpp"
+
+#include "Intrepid2_HVOL_C0_FEM.hpp"
+#include "Intrepid2_HGRAD_QUAD_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_QUAD_C2_FEM.hpp"
+#include "Intrepid2_HGRAD_HEX_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_HEX_C2_FEM.hpp"
+#include "Intrepid2_DefaultCubatureFactory.hpp"
+#include "Intrepid2_FunctionSpaceTools.hpp"
+#include "Intrepid2_CellTools.hpp"
+
+#include "ROL_Ptr.hpp"
+
+template <class Real, class DeviceType>
+class PDE_NavierStokes : public PDE<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  // Finite element basis information
+  basis_ptr basisPtrVel_, basisPtrPrs_, basisPtrCtrl_;
+  std::vector<basis_ptr> basisPtrs_, basisPtrsCtrl_;
+  // Cell cubature information
+  ROL::Ptr<Intrepid2::Cubature<DeviceType,Real,Real>> cellCub_, bdryCub_;
+  // Cell node information
+  scalar_view volCellNodes_;
+  std::vector<std::vector<scalar_view>> bdryCellNodes_;
+  std::vector<std::vector<std::vector<int>>> bdryCellLocIds_;
+  // Finite element definition
+  ROL::Ptr<fe_type> feVel_, fePrs_, feCtrl_;
+  std::vector<std::vector<ROL::Ptr<fe_type>>> feVelBdry_;
+  // Local degrees of freedom on boundary, for each side of the reference cell (first index).
+  std::vector<std::vector<int>> fvidx_, fpidx_;
+  // Coordinates of degrees freedom on boundary cells.
+  // Indexing:  [sideset number][local side id](cell number, value at dof)
+  std::vector<std::vector<scalar_view>> bdryCellDofValues_;
+  // Field pattern, offsets, etc.
+  std::vector<std::vector<int>> fieldPattern_;  // local Field/DOF pattern; set from DOF manager 
+  int numFields_;                               // number of fields (equations in the PDE)
+  int numDofs_;                                 // total number of degrees of freedom for all (local) fields
+  std::vector<int> offset_;                     // for each field, a counting offset
+  std::vector<int> numFieldDofs_;               // for each field, number of degrees of freedom
+  std::vector<std::vector<int>> fieldPatternCtrl_;  // local Field/DOF pattern; set from DOF manager 
+  int numFieldsCtrl_;                               // number of fields (equations in the PDE)
+  int numDofsCtrl_;                                 // total number of degrees of freedom for all (local) fields
+  std::vector<int> offsetCtrl_;                     // for each field, a counting offset
+  std::vector<int> numFieldDofsCtrl_;               // for each field, number of degrees of freedom
+  
+  // Problem parameters.
+  Real viscosity_;
+
+  ROL::Ptr<Impermiability<Real,DeviceType>> imp_;
+  ROL::Ptr<FieldHelper<Real,DeviceType>> fieldHelper_;
+  ROL::Ptr<FieldUtils::FieldInfo> fieldInfo_, fieldInfoCtrl_;
+
+  // Extract velocity coefficients on boundary.
+  scalar_view getBoundaryCoeff(const scalar_view cell_coeff, int sideSet, int cell) const {
+    std::vector<int> bdryCellLocId = bdryCellLocIds_[sideSet][cell];
+    const int numCellsSide = bdryCellLocId.size();
+    const int f = basisPtrVel_->getCardinality();
+    
+    scalar_view bdry_coeff("bdry_coeff", numCellsSide, f);
+    for (int i = 0; i < numCellsSide; ++i) {
+      for (int j = 0; j < f; ++j)
+        bdry_coeff(i, j) = cell_coeff(bdryCellLocId[i], j);
+    }
+    return bdry_coeff;
+  }
+
+  Real DirichletFunc(const std::vector<Real> & coords, int sideset, int locSideId, int dir) const {
+    const std::vector<Real> param = PDE<Real,DeviceType>::getParameter();
+    Real val(0);
+    if ((sideset==1) && (dir==0)) { // inflow
+      const Real one(1), four(4);
+      val = four*coords[1]*(one-coords[1]);
+    }
+    else if ((sideset==0) && (dir==0)) { // outflow
+      const Real one(1), two(2), three(3), c(108);
+      val = c*(coords[1]-one/three)*(two/three-coords[1]);
+    }
+    return val;
+  }
+
+  void computeDirichlet(void) {
+    // Compute Dirichlet values at DOFs.
+    int d = basisPtrVel_->getBaseCellTopology().getDimension();
+    int numSidesets = bdryCellLocIds_.size();
+    bdryCellDofValues_.resize(numSidesets);
+    for (int i=0; i<numSidesets; ++i) {
+      int numLocSides = bdryCellLocIds_[i].size();
+      bdryCellDofValues_[i].resize(numLocSides);
+      for (int j=0; j<numLocSides; ++j) {
+        int c = bdryCellLocIds_[i][j].size();
+        int f = basisPtrVel_->getCardinality();
+        bdryCellDofValues_[i][j] = scalar_view("bdryCellDofValues", c, f, d);
+        scalar_view coords("coords", c, f, d);
+        if (c > 0) feVel_->computeDofCoords(coords, bdryCellNodes_[i][j]);
+        for (int k=0; k<c; ++k) {
+          for (int l=0; l<f; ++l) {
+            std::vector<Real> dofpoint(d);
+            //std::cout << "Sideset " << i << " LocalSide " << j << "  Cell " << k << "  Field " << l << "  Coord ";
+            for (int m=0; m<d; ++m) {
+              dofpoint[m] = coords(k, l, m);
+              //std::cout << dofpoint[m] << "  ";
+            }
+
+            for (int m=0; m<d; ++m) {
+              (bdryCellDofValues_[i][j])(k, l, m) = DirichletFunc(dofpoint, i, j, m);
+              //std::cout << "  " << m << "-Value " << DirichletFunc(dofpoint, i, j, m);
+            }
+            //std::cout << std::endl;
+          }
+        }
+      }
+    }
+  }
+
+  Real viscosityFunc(const std::vector<Real> & coords) const {
+    return viscosity_;
+  }
+
+  void computeViscosity(scalar_view &nu) const {
+    int c = feVel_->gradN().extent_int(0);
+    int p = feVel_->gradN().extent_int(2);
+    int d = feVel_->gradN().extent_int(3);
+    std::vector<Real> pt(d);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for ( int k = 0; k < d; ++k)
+          pt[k] = (feVel_->cubPts())(i,j,k);
+        // Compute spatially dependent viscosity
+        nu(i,j) = viscosityFunc(pt);
+      }
+    }
+  }
+
+public:
+  PDE_NavierStokes(ROL::ParameterList &parlist) {
+    // Finite element fields.
+    int cubDegree     = parlist.sublist("Problem").get("Cubature Degree",4);
+    int bdryCubDegree = parlist.sublist("Problem").get("Boundary Cubature Degree",2);
+    int probDim       = parlist.sublist("Problem").get("Problem Dimension",2);
+    int basisDegCtrl  = parlist.sublist("Problem").get("Density Basis Degree",1);
+    if (probDim > 3 || probDim < 2) {
+      TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument,
+        ">>> PDE-OPT/projects/navier-stokes-topopt/pde_navier-stokes.hpp: Problem dimension is not 2 or 3!");
+    }
+    if (probDim == 2) {
+      basisPtrVel_  = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C2_FEM<DeviceType,Real,Real>>();
+      basisPtrPrs_  = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C1_FEM<DeviceType,Real,Real>>();
+    }
+    else if (probDim == 3) {
+      basisPtrVel_  = ROL::makePtr<Intrepid2::Basis_HGRAD_HEX_C2_FEM<DeviceType,Real,Real>>();
+      basisPtrPrs_  = ROL::makePtr<Intrepid2::Basis_HGRAD_HEX_C1_FEM<DeviceType,Real,Real>>();
+    }
+    basisPtrs_.clear();
+    basisPtrs_.resize(probDim, basisPtrVel_); // Velocity component
+    basisPtrs_.push_back(basisPtrPrs_);       // Pressure component
+    shards::CellTopology cellType = basisPtrs_[0]->getBaseCellTopology();        // get the cell type from any basis
+    if (basisDegCtrl == 1) {
+      if (probDim == 2)
+        basisPtrCtrl_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C1_FEM<DeviceType,Real,Real>>();
+      else if (probDim == 3)
+        basisPtrCtrl_ = ROL::makePtr<Intrepid2::Basis_HGRAD_HEX_C1_FEM<DeviceType,Real,Real>>();
+    }
+    else
+      basisPtrCtrl_ = ROL::makePtr<Intrepid2::Basis_HVOL_C0_FEM<DeviceType,Real,Real>>(cellType);
+    basisPtrsCtrl_.clear();
+    basisPtrsCtrl_.push_back(basisPtrCtrl_);  // Control component
+    // Quadrature rules.
+    Intrepid2::DefaultCubatureFactory cubFactory;                            // create cubature factory
+    cellCub_ = cubFactory.create<DeviceType,Real,Real>(cellType, cubDegree); // create default cubature
+
+    shards::CellTopology bdryCellType = cellType.getCellTopologyData(probDim-1, 0);
+    bdryCub_ = cubFactory.create<DeviceType,Real,Real>(bdryCellType, bdryCubDegree);
+
+    // Other problem parameters.
+    viscosity_ = parlist.sublist("Problem").get("Viscosity", 5e-3);
+    imp_       = ROL::makePtr<Impermiability<Real,DeviceType>>(parlist);
+
+    numDofs_ = 0;
+    numFields_ = basisPtrs_.size();
+    offset_.resize(numFields_);
+    numFieldDofs_.resize(numFields_);
+    for (int i=0; i<numFields_; ++i) {
+      if (i==0) offset_[i]  = 0;
+      else      offset_[i]  = offset_[i-1] + basisPtrs_[i-1]->getCardinality();
+      numFieldDofs_[i] = basisPtrs_[i]->getCardinality();
+      numDofs_ += numFieldDofs_[i];
+    }
+    numDofsCtrl_ = 0;
+    numFieldsCtrl_ = basisPtrsCtrl_.size();
+    offsetCtrl_.resize(numFieldsCtrl_);
+    numFieldDofsCtrl_.resize(numFieldsCtrl_);
+    for (int i=0; i<numFieldsCtrl_; ++i) {
+      if (i==0) offsetCtrl_[i]  = 0;
+      else      offsetCtrl_[i]  = offsetCtrl_[i-1] + basisPtrsCtrl_[i-1]->getCardinality();
+      numFieldDofsCtrl_[i] = basisPtrsCtrl_[i]->getCardinality();
+      numDofsCtrl_ += numFieldDofsCtrl_[i];
+    }
+  }
+
+  void residual(scalar_view & res,
+                const scalar_view u_coeff,
+                const scalar_view z_coeff = scalar_view(),
+                const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fv = feVel_->gradN().extent_int(1);
+    const int fp = fePrs_->gradN().extent_int(1);
+    const int p  = feVel_->gradN().extent_int(2);
+    const int d  = feVel_->gradN().extent_int(3);
+    // Initialize residuals.
+    std::vector<scalar_view> R;
+    R.resize(numFields_);
+    for (int i = 0; i < d; ++i)
+      R[i] = scalar_view("res", c, fv);
+    R[d] = scalar_view("res", c, fp);
+    // Split coefficients into components.
+    std::vector<scalar_view> U, Z;
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoCtrl_);
+    // Evaluate/interpolate finite element fields on cells.
+    std::vector<scalar_view> nuGradVel(d), valVel(d), gradVel(d), valVelDotgradVel(d), alphaVel(d);
+    scalar_view nu("nu", c, p);
+    scalar_view valVel0("valVel0", c, p, d);
+    scalar_view valPres("valPres", c, p);
+    scalar_view divVel("divVel", c, p);
+    scalar_view valCtrl("valCtrl", c, p);
+    scalar_view alpha("alpha", c, p);
+    computeViscosity(nu);
+    feCtrl_->evaluateValue(valCtrl, Z[0]);
+    imp_->compute(alpha, valCtrl, feVel_->cubPts(), 0);
+    for (int i = 0; i < d; ++i) {
+      nuGradVel[i]        = scalar_view("nuGradVel", c, p, d);
+      valVel[i]           = scalar_view("valVel", c, p);
+      gradVel[i]          = scalar_view("gradVel", c, p, d);
+      valVelDotgradVel[i] = scalar_view("valVelDotgradVel", c, p);
+      alphaVel[i]         = scalar_view("alphaVel", c, p);
+      // Evaluate on FE basis
+      feVel_->evaluateValue(valVel[i], U[i]);
+      feVel_->evaluateGradient(gradVel[i], U[i]);
+      // Multiply velocity gradients with viscosity.
+      fst::tensorMultiplyDataData(nuGradVel[i], nu, gradVel[i]);
+      // Multiply velocity with alpha
+      fst::scalarMultiplyDataData(alphaVel[i], alpha, valVel[i]);
+      // Assemble the velocity vector and its divergence.
+      for (int j = 0; j < c; ++j) {
+        for (int k = 0; k < p; ++k) {
+          valVel0(j,k,i) = (valVel[i])(j,k);
+          divVel(j,k)   += (gradVel[i])(j,k,i);
+        }
+      }
+    }
+    // Compute nonlinear terms in the Navier-Stokes equations.
+    for (int i = 0; i < d; ++i)
+      fst::dotMultiplyDataData(valVelDotgradVel[i], valVel0, gradVel[i]);
+    // Negative pressure
+    fePrs_->evaluateValue(valPres, U[d]);
+    rst::scale(valPres, static_cast<Real>(-1));
+    /*** Evaluate weak form of the residual. ***/
+    // Velocity equation.
+    for (int i = 0; i < d; ++i) {
+      fst::integrate(R[i],nuGradVel[i],feVel_->gradNdetJ(),false);
+      fst::integrate(R[i],valVelDotgradVel[i],feVel_->NdetJ(),true);
+      fst::integrate(R[i],valPres,feVel_->DNDdetJ(i),true);
+      fst::integrate(R[i],alphaVel[i],feVel_->NdetJ(),true);
+    }
+    // Pressure equation.
+    fst::integrate(R[d],divVel,fePrs_->NdetJ(),false);
+    rst::scale(R[d], static_cast<Real>(-1));
+    // Boundary conditions.
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      // APPLY DIRICHLET CONDITIONS
+      computeDirichlet();
+      for (int i = 0; i < numSideSets; ++i) {
+        if (i==1 /* in flow */ || i==2 /* no slip */ ) { //|| i==0 /* out flow */) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numBdryDofs = fvidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                //std::cout << "\n  i=" << i << "   cidx=" << cidx << "   j=" << j << "  l=" << l << "  " << fvidx_[j][l] << " " << (*bdryCellDofValues_[i][j])(k,fvidx_[j][l],0);
+                //std::cout << "\n  i=" << i << "   cidx=" << cidx << "   j=" << j << "  l=" << l << "  " << fvidx_[j][l] << " " << (*bdryCellDofValues_[i][j])(k,fvidx_[j][l],1);
+                for (int m=0; m < d; ++m)
+                  (R[m])(cidx,fvidx_[j][l]) = (U[m])(cidx,fvidx_[j][l]) - (bdryCellDofValues_[i][j])(k,fvidx_[j][l],m);
+              }
+            }
+          }
+        }
+      }
+    }
+    // Combine the residuals.
+    FieldUtils::combineFieldCoeff<Real>(res, R, fieldInfo_);
+  }
+
+  void Jacobian_1(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fv = feVel_->gradN().extent_int(1);
+    const int fp = fePrs_->gradN().extent_int(1);
+    const int p  = feVel_->gradN().extent_int(2);
+    const int d  = feVel_->gradN().extent_int(3);
+    // Initialize Jacobians.
+    std::vector<std::vector<scalar_view>> J(numFields_);
+    J[d].resize(numFields_);
+    for (int i = 0; i < d; ++i) {
+      J[i].resize(numFields_);
+      for (int j = 0; j < d; ++j)
+        J[i][j] = scalar_view("jac", c, fv, fv);
+      J[i][d] = scalar_view("jac", c, fv, fp);
+      J[d][i] = scalar_view("jac", c, fp, fv);
+    }
+    J[d][d] = scalar_view("jac", c, fp, fp);
+    // Split coefficients into components.
+    std::vector<scalar_view> U, Z;
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoCtrl_);
+    // Evaluate/interpolate finite element fields on cells.
+    std::vector<scalar_view> valVel(d), gradVel(d);
+    std::vector<std::vector<scalar_view>> dVel(d), dVelPhi(d);
+    scalar_view nu("nu", c, p);
+    scalar_view nuGradPhi("nuGradPhi", c, fv, p, d);
+    scalar_view valCtrl("valCtrl", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view alphaPhi ("alphaPhi", c, fv, p);
+    scalar_view valVel0("valVel0", c, p, d);
+    scalar_view valVelDotgradPhi("valVelDotgralPhi", c, fv, p);
+    computeViscosity(nu);
+    feCtrl_->evaluateValue(valCtrl, Z[0]);
+    imp_->compute(alpha, valCtrl, feVel_->cubPts(), 0);
+    for (int i = 0; i < d; ++i) {
+      valVel[i]  = scalar_view("valVel", c, p);
+      gradVel[i] = scalar_view("gradVel", c, p, d);
+      feVel_->evaluateValue(valVel[i], U[i]);
+      feVel_->evaluateGradient(gradVel[i], U[i]);
+      for (int j = 0; j < c; ++j) {
+        for (int k = 0; k < p; ++k)
+          valVel0(j,k,i) = (valVel[i])(j,k);
+      }
+      dVel[i].resize(d);
+      dVelPhi[i].resize(d);
+      for (int j = 0; j < d; ++j) {
+        dVel[i][j]    = scalar_view("dVel", c, p);
+        dVelPhi[i][j] = scalar_view("dVelPhi", c, fv, p);
+        for (int k = 0; k < c; ++k) {
+          for (int l = 0; l < p; ++l)
+            (dVel[i][j])(k,l) = (gradVel[i])(k,l,j);
+        }
+      }
+    }
+    // Multiply velocity gradients with viscosity.
+    fst::tensorMultiplyDataField(nuGradPhi, nu, feVel_->gradN());
+    // Compute nonlinear terms in the Navier-Stokes equations.
+    fst::dotMultiplyDataField(valVelDotgradPhi, valVel0, feVel_->gradN());
+    for (int i = 0; i < d; ++i) {
+      for (int j = 0; j < d; ++j)
+        fst::scalarMultiplyDataField(dVelPhi[i][j], dVel[i][j], feVel_->N());
+    }
+    // Multiply velocity with alpha
+    fst::scalarMultiplyDataField(alphaPhi, alpha, feVel_->N());
+    // Negative pressure basis.
+    scalar_view negPrsN("negPrsN", c, fp, p);
+    Kokkos::deep_copy(negPrsN,fePrs_->N());
+    rst::scale(negPrsN, static_cast<Real>(-1));
+    /*** Evaluate weak form of the Jacobian. ***/
+    for (int i = 0; i < d; ++i) {
+      // Velocity equation.
+      fst::integrate(J[i][i],nuGradPhi,feVel_->gradNdetJ(),false);
+      fst::integrate(J[i][i],feVel_->NdetJ(),valVelDotgradPhi,true);
+      fst::integrate(J[i][i],alphaPhi,feVel_->NdetJ(),true);
+      for (int j = 0; j < d; ++j)
+        fst::integrate(J[i][j],feVel_->NdetJ(),dVelPhi[i][j],true);
+      fst::integrate(J[i][d],feVel_->DNDdetJ(i),negPrsN,false);
+      // Pressure equation.
+      fst::integrate(J[d][i],fePrs_->NdetJ(),feVel_->DND(i),false);
+      rst::scale(J[d][i], static_cast<Real>(-1));
+    }
+    // APPLY DIRICHLET CONDITIONS
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        if (i==1 /* in flow */ || i==2 /* no slip */ ) { // || i==0 /* out flow */) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numBdryDofs = fvidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                for (int m=0; m < fv; ++m) {
+                  for (int n=0; n < d; ++n) {
+                    for (int q=0; q < d; ++q) {
+                      (J[n][q])(cidx,fvidx_[j][l],m) = static_cast<Real>(0);
+                    }
+                    (J[n][n])(cidx,fvidx_[j][l],fvidx_[j][l]) = static_cast<Real>(1);
+                  }
+                }
+                for (int m=0; m < fp; ++m) {
+                  for (int n=0; n < d; ++n) {
+                    (J[n][d])(cidx,fvidx_[j][l],m) = static_cast<Real>(0);
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    // Combine the jacobians.
+    FieldUtils::combineFieldCoeff<Real>(jac, J, fieldInfo_, fieldInfo_);
+  }
+
+
+  void Jacobian_2(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fv = feVel_->gradN().extent_int(1);
+    const int fp = fePrs_->gradN().extent_int(1);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int p  = feVel_->gradN().extent_int(2);
+    const int d  = feVel_->gradN().extent_int(3);
+    // Initialize Jacobians.
+    std::vector<std::vector<scalar_view>> J(numFields_);
+    for (int i = 0; i < d; ++i) {
+      J[i].resize(1);
+      J[i][0] = scalar_view("jac", c, fv, fc);
+    }
+    J[d].resize(1);
+    J[d][0] = scalar_view("jac", c, fp, fc);
+    // Split coefficients into components.
+    std::vector<scalar_view> U, Z;
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoCtrl_);
+    // Evaluate on FE basis.
+    scalar_view valCtrl("valCtrl", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view alphaPhi("alphaPhi", c, fc, p);
+    scalar_view valVel("valVel", c, p);
+    scalar_view alphaU("alphaU", c, fc, p);
+    feCtrl_->evaluateValue(valCtrl, Z[0]);
+    imp_->compute(alpha, valCtrl, feVel_->cubPts(), 1);
+    fst::scalarMultiplyDataField(alphaPhi, alpha, feCtrl_->N());
+    for (int i = 0; i < d; ++i) {
+      Kokkos::deep_copy(alphaU,static_cast<Real>(0));
+      Kokkos::deep_copy(valVel,static_cast<Real>(0));
+      feVel_->evaluateValue(valVel, U[i]);
+      // Multiply velocity with alpha
+      fst::scalarMultiplyDataField(alphaU, valVel, alphaPhi);
+      // Integrate
+      fst::integrate(J[i][0],feVel_->NdetJ(),alphaU,false);
+    }
+    // APPLY DIRICHLET CONDITIONS
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        if (i==1 /* in flow */ || i==2 /* no slip */ ) { // || i==0 /* out flow */) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numBdryDofs = fvidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                //std::cout << "\n   j=" << j << "  l=" << l << "  " << fidx[j][l];
+                for (int n=0; n < d; ++n) {
+                  for (int m=0; m < fc; ++m) {
+                    (J[n][0])(cidx,fvidx_[j][l],m) = static_cast<Real>(0);
+                  }
+                }
+                //for (int m=0; m < fc; ++m) {
+                //  (J[d][0])(cidx,fpidx_[j][l],m) = static_cast<Real>(0);
+                //}
+              }
+            }
+          }
+        }
+      }
+    }
+    // Combine the jacobians.
+    FieldUtils::combineFieldCoeff<Real>(jac, J, fieldInfo_, fieldInfoCtrl_);
+  }
+
+  void Hessian_11(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fv = feVel_->gradN().extent_int(1);
+    const int fp = fePrs_->gradN().extent_int(1);
+    const int p  = feVel_->gradN().extent_int(2);
+    const int d  = feVel_->gradN().extent_int(3);
+    // Initialize hessians.
+    std::vector<std::vector<scalar_view>> H;
+    H.resize(numFields_);
+    H[d].resize(numFields_);
+    for (int i = 0; i < d; ++i) {
+      H[i].resize(numFields_);
+      for (int j = 0; j < d; ++j)
+        H[i][j] = scalar_view("hess", c, fv, fv);
+      H[i][d] = scalar_view("hess", c, fv, fp);
+      H[d][i] = scalar_view("hess", c, fp, fv);
+    }
+    H[d][d] = scalar_view("hess", c, fp, fp);
+    // Split coefficients into components.
+    std::vector<scalar_view> L;
+    FieldUtils::splitFieldCoeff<Real>(L, l_coeff, fieldInfo_);
+    // Apply Dirichlet conditions
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        if (i==1 /* in flow */ || i==2 /* no slip */ ) { // || i==0 /* out flow */) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numBdryDofs = fvidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                //std::cout << "\n   j=" << j << "  l=" << l << "  " << fidx[j][l];
+                for (int m=0; m < d; ++m) {
+                  (L[m])(cidx,fvidx_[j][l]) = static_cast<Real>(0);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    // Evaluate/interpolate finite element fields on cells.
+    std::vector<scalar_view> Lval(d), LPhi(d);
+    for (int i = 0; i < d; ++i) {
+      Lval[i] = scalar_view("Lval", c, p);
+      LPhi[i] = scalar_view("Lphi", c, fv, p);
+      feVel_->evaluateValue(Lval[i], L[i]);
+      fst::scalarMultiplyDataField(LPhi[i], Lval[i], feVel_->N());
+    }
+    /*** Evaluate weak form of the Hessian. ***/
+    for (int i = 0; i < d; ++i) {
+      fst::integrate(H[i][i],LPhi[i],feVel_->DNDdetJ(i),false);
+      fst::integrate(H[i][i],feVel_->DNDdetJ(i),LPhi[i],true);
+      for (int j = 0; j < d; ++j) {
+        fst::integrate(H[i][j],feVel_->DNDdetJ(j),LPhi[i],false);
+        fst::integrate(H[i][j],LPhi[j],feVel_->DNDdetJ(i),true);
+      }
+    }
+    // Combine the Hessians.
+    FieldUtils::combineFieldCoeff<Real>(hess, H, fieldInfo_, fieldInfo_);
+  }
+
+  void Hessian_12(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fv = feVel_->gradN().extent_int(1);
+    const int fp = fePrs_->gradN().extent_int(1);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int p  = feVel_->gradN().extent_int(2);
+    const int d  = feVel_->gradN().extent_int(3);
+    // Initialize hessians.
+    std::vector<std::vector<scalar_view>> H(1);
+    H[0].resize(numFields_);
+    for (int i = 0; i < d; ++i)
+      H[0][i] = scalar_view("hess", c, fc, fv);
+    H[0][d] = scalar_view("hess", c, fc, fp);
+    // Split coefficients into components.
+    std::vector<scalar_view> L, Z;
+    FieldUtils::splitFieldCoeff<Real>(L, l_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoCtrl_);
+    // Apply Dirichlet conditions
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        if (i==1 /* in flow */ || i==2 /* no slip */ ) { // || i==0 /* out flow */) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numBdryDofs = fvidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                //std::cout << "\n   j=" << j << "  l=" << l << "  " << fidx[j][l];
+                for (int m=0; m < d; ++m) {
+                  (L[m])(cidx,fvidx_[j][l]) = static_cast<Real>(0);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    // Evaluate/interpolate finite element fields on cells.
+    scalar_view Z0("Z0", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view Lval("Lval", c, p);
+    scalar_view alphaL("alphaL", c, fc, p);
+    feCtrl_->evaluateValue(Z0, Z[0]);
+    imp_->compute(alpha, Z0, feVel_->cubPts(), 1);
+    /*** Evaluate weak form of the Hessian. ***/
+    // Velocity equation.
+    for (int i = 0; i < d; ++i) {
+      // Multiply velocity with alpha
+      Kokkos::deep_copy(Lval,static_cast<Real>(0));
+      Kokkos::deep_copy(alphaL,static_cast<Real>(0));
+      feVel_->evaluateValue(Lval, L[i]);
+      for (int j = 0; j < c; ++j) {
+        for (int k = 0; k < fc; ++k) {
+          for (int l = 0; l < p; ++l)
+            alphaL(j,k,l) = alpha(j,l) * (feCtrl_->N())(j,k,l) * Lval(j,l);
+        }
+      }
+      fst::integrate(H[0][i],alphaL,feVel_->NdetJ(),false);
+    }
+    // Combine the Hessians.
+    FieldUtils::combineFieldCoeff<Real>(hess, H, fieldInfoCtrl_, fieldInfo_);
+  }
+
+  void Hessian_21(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fv = feVel_->gradN().extent_int(1);
+    const int fp = fePrs_->gradN().extent_int(1);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int p  = feVel_->gradN().extent_int(2);
+    const int d  = feVel_->gradN().extent_int(3);
+    // Initialize hessians.
+    std::vector<std::vector<scalar_view>> H(numFields_);
+    for (int i = 0; i < d; ++i) {
+      H[i].resize(1);
+      H[i][0] = scalar_view("hess", c, fv, fc);
+    }
+    H[d].resize(1);
+    H[d][0] = scalar_view("hess", c, fp, fc);
+    // Split coefficients into components.
+    std::vector<scalar_view> L, Z;
+    FieldUtils::splitFieldCoeff<Real>(L, l_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoCtrl_);
+    // Apply Dirichlet conditions
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        if (i==1 /* in flow */ || i==2 /* no slip */ ) { // || i==0 /* out flow */) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numBdryDofs = fvidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                //std::cout << "\n   j=" << j << "  l=" << l << "  " << fidx[j][l];
+                for (int m=0; m < d; ++m)
+                  (L[m])(cidx,fvidx_[j][l]) = static_cast<Real>(0);
+              }
+            }
+          }
+        }
+      }
+    }
+    // Evaluate/interpolate finite element fields on cells.
+    scalar_view Z0("Z0", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view Lval("Lval", c, p);
+    scalar_view alphaL("alphaL", c, fc, p);
+    feCtrl_->evaluateValue(Z0, Z[0]);
+    imp_->compute(alpha, Z0, feVel_->cubPts(), 1);
+    /*** Evaluate weak form of the Hessian. ***/
+    // Velocity equation.
+    for (int i = 0; i < d; ++i) {
+      Kokkos::deep_copy(Lval,static_cast<Real>(0));
+      Kokkos::deep_copy(alphaL,static_cast<Real>(0));
+      // Multiply velocity with alpha
+      feVel_->evaluateValue(Lval, L[i]);
+      for (int j = 0; j < c; ++j) {
+        for (int k = 0; k < fc; ++k) {
+          for (int l = 0; l < p; ++l)
+            alphaL(j,k,l) = alpha(j,l) * (feCtrl_->N())(j,k,l) * Lval(j,l);
+        }
+      }
+      fst::integrate(H[i][0],feVel_->NdetJ(),alphaL,false);
+    }
+    // Combine the Hessians.
+    FieldUtils::combineFieldCoeff<Real>(hess, H, fieldInfo_, fieldInfoCtrl_);
+  }
+
+  void Hessian_22(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int p  = feVel_->gradN().extent_int(2);
+    const int d  = feVel_->gradN().extent_int(3);
+    // Initialize hessians.
+    std::vector<std::vector<scalar_view>> H(1);
+    H[0].resize(1);
+    H[0][0] = scalar_view("hess", c, fc, fc);
+    // Split coefficients into components.
+    std::vector<scalar_view> L, U, Z;
+    FieldUtils::splitFieldCoeff<Real>(L, l_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoCtrl_);
+    // Apply Dirichlet conditions
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        if (i==1 /* in flow */ || i==2 /* no slip */ ) { // || i==0 /* out flow */) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numBdryDofs = fvidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                //std::cout << "\n   j=" << j << "  l=" << l << "  " << fidx[j][l];
+                for (int m=0; m < d; ++m) {
+                  (L[m])(cidx,fvidx_[j][l]) = static_cast<Real>(0);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    // Evaluate/interpolate finite element fields on cells.
+    std::vector<scalar_view> Lval(d), Uval(d);
+    scalar_view Z0("Z0", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view alphaUL("alphaUL", c, fc, p);
+    for (int i = 0; i < d; ++i) {
+      Lval[i] = scalar_view("Lval", c, p);
+      Uval[i] = scalar_view("Uval", c, p);
+      feVel_->evaluateValue(Lval[i], L[i]);
+      feVel_->evaluateValue(Uval[i], U[i]);
+    }
+    feCtrl_->evaluateValue(Z0, Z[0]);
+    imp_->compute(alpha, Z0, feVel_->cubPts(), 2);
+    // Multiply velocity with alpha
+    Real dot(0);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < fc; ++j) {
+        for (int k = 0; k < p; ++k) {
+          dot = static_cast<Real>(0);
+          for (int l = 0; l < d; ++l)
+            dot += (Uval[l])(i,k) * (Lval[l])(i,k);
+          alphaUL(i,j,k) = alpha(i,k) * (feCtrl_->N())(i,j,k) * dot;
+        }
+      }
+    }
+    /*** Evaluate weak form of the Hessian. ***/
+    // Velocity equation.
+    fst::integrate(H[0][0],alphaUL,feCtrl_->NdetJ(),false);
+    // Combine hessians.
+    FieldUtils::combineFieldCoeff<Real>(hess, H, fieldInfoCtrl_, fieldInfoCtrl_);
+  }
+
+  void RieszMap_1(scalar_view & riesz) override {
+    // Retrieve dimensions.
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fv = feVel_->gradN().extent_int(1);
+    const int fp = fePrs_->gradN().extent_int(1);
+    const int d  = feVel_->gradN().extent_int(3);
+    // Initialize Riesz maps.
+    std::vector<std::vector<scalar_view>> J(numFields_);
+    J[d].resize(numFields_);
+    for (int i = 0; i < d; ++i) {
+      J[i].resize(numFields_);
+      for (int j = 0; j < d; ++j)
+        J[i][j] = scalar_view("riesz1", c, fv, fv);
+      J[i][d] = scalar_view("riesz1", c, fv, fp);
+      J[d][i] = scalar_view("riesz1", c, fp, fv);
+    }
+    J[d][d] = scalar_view("riesz1", c, fp, fp);
+    // Build Riesz maps.
+    for (int i = 0; i < d; ++i) {
+      Kokkos::deep_copy(J[i][i],feVel_->stiffMat());
+      rst::add(J[i][i],feVel_->massMat());
+    }
+    Kokkos::deep_copy(J[d][d],fePrs_->massMat());
+    // Combine Riesz maps.
+    FieldUtils::combineFieldCoeff<Real>(riesz, J, fieldInfo_, fieldInfo_);
+  }
+
+  void RieszMap_2(scalar_view & riesz) override {
+    // Retrieve dimensions.
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    // Initialize residuals.
+    std::vector<std::vector<scalar_view>> J(1);
+    J[0].resize(1);
+    J[0][0] = scalar_view("riesz2", c, fc, fc);
+    Kokkos::deep_copy(J[0][0],feCtrl_->massMat());
+    // Combine Riesz maps.
+    FieldUtils::combineFieldCoeff<Real>(riesz, J, fieldInfoCtrl_, fieldInfoCtrl_);
+  }
+
+  std::vector<basis_ptr> getFields() override {
+    return basisPtrs_;
+  }
+
+  std::vector<basis_ptr> getFields2() override {
+    return basisPtrsCtrl_;
+  }
+
+  void setCellNodes(const scalar_view &volCellNodes,
+                    const std::vector<std::vector<scalar_view>> &bdryCellNodes,
+                    const std::vector<std::vector<std::vector<int>>> &bdryCellLocIds) override {
+    volCellNodes_ = volCellNodes;
+    bdryCellNodes_ = bdryCellNodes;
+    bdryCellLocIds_ = bdryCellLocIds;
+    // Finite element definition.
+    feVel_  = ROL::makePtr<fe_type>(volCellNodes_,basisPtrVel_,cellCub_);
+    fePrs_  = ROL::makePtr<fe_type>(volCellNodes_,basisPtrPrs_,cellCub_);
+    feCtrl_ = ROL::makePtr<fe_type>(volCellNodes_,basisPtrCtrl_,cellCub_,false);
+    fvidx_  = feVel_->getBoundaryDofs();
+    fpidx_  = fePrs_->getBoundaryDofs();
+    // Construct control boundary FE
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      feVelBdry_.resize(numSideSets);
+      for (int i = 0; i < numSideSets; ++i) {
+        int numLocSides = bdryCellNodes[i].size();
+        feVelBdry_[i].resize(numLocSides);
+        for (int j = 0; j < numLocSides; ++j) {
+          if (bdryCellNodes[i][j] != scalar_view())
+            feVelBdry_[i][j] = ROL::makePtr<fe_type>(bdryCellNodes[i][j],basisPtrVel_,bdryCub_,j);
+        }
+      }
+    }
+  }
+
+  void setFieldPattern(const std::vector<std::vector<int>> &fieldPattern,
+                       const std::vector<std::vector<int>> &fieldPattern2) override {
+    fieldPattern_     = fieldPattern;
+    fieldInfo_        = ROL::makePtr<FieldUtils::FieldInfo>(numFields_,numDofs_,numFieldDofs_,fieldPattern_);
+    fieldPatternCtrl_ = fieldPattern2;
+    fieldInfoCtrl_    = ROL::makePtr<FieldUtils::FieldInfo>(numFieldsCtrl_,numDofsCtrl_,numFieldDofsCtrl_,fieldPatternCtrl_);
+    fieldHelper_      = ROL::makePtr<FieldHelper<Real,DeviceType>>(numFields_, numDofs_, numFieldDofs_, fieldPattern_);
+  }
+
+  const ROL::Ptr<fe_type> getVelocityFE(void) const {
+    return feVel_;
+  }
+
+  const ROL::Ptr<fe_type> getPressureFE(void) const {
+    return fePrs_;
+  }
+
+  const ROL::Ptr<fe_type> getControlFE(void) const {
+    return feCtrl_;
+  }
+
+  const std::vector<ROL::Ptr<fe_type>> getVelocityBdryFE(const int sideset = -1) const {
+    int side = sideset;
+    if ( sideset < 0 || sideset > 2 ) {
+      //side = (useDirichletControl_) ? 6 : 10;
+      side = 0;
+    }
+    return feVelBdry_[side];
+  }
+
+  const std::vector<std::vector<int>> getBdryCellLocIds(const int sideset = -1) const {
+    int side = sideset;
+    if ( sideset < 0 || sideset > 2 ) {
+      //side = (useDirichletControl_) ? 6 : 10;
+      side = 0;
+    }
+    return bdryCellLocIds_[side];
+  }
+
+  const ROL::Ptr<FieldHelper<Real,DeviceType>> getFieldHelper(void) const {
+    return fieldHelper_;
+  }
+
+  const ROL::Ptr<const FieldUtils::FieldInfo> getStateFieldInfo(void) const {
+    return fieldInfo_;
+  }
+
+  const ROL::Ptr<const FieldUtils::FieldInfo> getControlFieldInfo(void) const {
+    return fieldInfoCtrl_;
+  }
+
+}; // PDE_NavierStokes
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/simpreg/CMakeLists.txt b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/simpreg/CMakeLists.txt
index bce71a6a0500..d51ec64e4bfa 100644
--- a/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/simpreg/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/simpreg/CMakeLists.txt
@@ -11,13 +11,13 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_01
     SOURCES example_01.cpp
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     TRSPG_SimplexRegressionDataCopy
     SOURCE_FILES
       input.xml
diff --git a/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/simpreg/example_01.cpp b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/simpreg/example_01.cpp
index 4c80e8184d27..3dccac517352 100644
--- a/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/simpreg/example_01.cpp
+++ b/packages/rol/example/PDE-OPT/published/TRSPG_Kouri2021/simpreg/example_01.cpp
@@ -18,8 +18,7 @@
 #include "ROL_Stream.hpp"
 
 #include "Teuchos_Time.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 #include <random>
@@ -133,7 +132,7 @@ class SimpRegConstraint : public ROL::StdConstraint<Real> {
 };
 
 int main(int argc, char *argv[]) {
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/example/PDE-OPT/stefan-boltzmann/CMakeLists.txt b/packages/rol/example/PDE-OPT/stefan-boltzmann/CMakeLists.txt
index ce96ef26e897..a4df0d42a2e2 100644
--- a/packages/rol/example/PDE-OPT/stefan-boltzmann/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/stefan-boltzmann/CMakeLists.txt
@@ -11,46 +11,43 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
 
-  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  ROL_ADD_EXECUTABLE_AND_TEST(
     example_01
     SOURCES example_01.cpp
     ARGS PrintItAll
     NUM_MPI_PROCS 4
     NUM_TOTAL_CORES_USED 4
-    CATEGORIES CONTINUOUS
     PASS_REGULAR_EXPRESSION "TEST PASSED"
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  ROL_ADD_EXECUTABLE_AND_TEST(
     example_02
     SOURCES example_02.cpp
     ARGS PrintItAll
     NUM_MPI_PROCS 4
     NUM_TOTAL_CORES_USED 4
-    CATEGORIES CONTINUOUS
     PASS_REGULAR_EXPRESSION "TEST PASSED"
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  ROL_ADD_EXECUTABLE_AND_TEST(
     example_03
     SOURCES example_03.cpp
     ARGS PrintItAll
     NUM_MPI_PROCS 4
     NUM_TOTAL_CORES_USED 4
-    CATEGORIES CONTINUOUS
     PASS_REGULAR_EXPRESSION "TEST PASSED"
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_04
     SOURCES example_04.cpp
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     StochasticStefanBoltzmannDataCopy
     SOURCE_FILES
       input.xml
@@ -64,5 +61,68 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
   )
 
+ENDIF()
+
+IF(${PROJECT_NAME}_ENABLE_Intrepid2 AND
+   ${PROJECT_NAME}_ENABLE_Ifpack2   AND
+   ${PROJECT_NAME}_ENABLE_MueLu     AND
+   ${PROJECT_NAME}_ENABLE_Amesos2   AND
+   ${PROJECT_NAME}_ENABLE_Tpetra )
+
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/teuchos/src/sol)
+
+  # Need ROL_TpetraMultiVector.hpp
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
+
+  ROL_ADD_EXECUTABLE_AND_TEST(
+    example_01_Kokkos
+    SOURCES example_01K.cpp
+    ARGS PrintItAll
+    NUM_MPI_PROCS 4
+    NUM_TOTAL_CORES_USED 4
+    PASS_REGULAR_EXPRESSION "TEST PASSED"
+    ADD_DIR_TO_NAME
+  )
+
+  ROL_ADD_EXECUTABLE_AND_TEST(
+    example_02_Kokkos
+    SOURCES example_02K.cpp
+    ARGS PrintItAll
+    NUM_MPI_PROCS 4
+    NUM_TOTAL_CORES_USED 4
+    PASS_REGULAR_EXPRESSION "TEST PASSED"
+    ADD_DIR_TO_NAME
+  )
+
+  ROL_ADD_EXECUTABLE_AND_TEST(
+    example_03_Kokkos
+    SOURCES example_03K.cpp
+    ARGS PrintItAll
+    NUM_MPI_PROCS 4
+    NUM_TOTAL_CORES_USED 4
+    PASS_REGULAR_EXPRESSION "TEST PASSED"
+    ADD_DIR_TO_NAME
+  )
+
+  ROL_ADD_EXECUTABLE(
+    example_04_Kokkos
+    SOURCES example_04K.cpp
+    ADD_DIR_TO_NAME
+  )
+
+  ROL_COPY_FILES_TO_BINARY_DIR(
+    StochasticStefanBoltzmannDataCopyK
+    SOURCE_FILES
+      input.xml
+      input_app.xml
+      input_ex04.xml
+      SROMinput.xml
+      plotresults.m
+      plotstats.m
+      process.sh
+    SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}"
+    DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
+  )
 
 ENDIF()
diff --git a/packages/rol/example/PDE-OPT/stefan-boltzmann/example_01.cpp b/packages/rol/example/PDE-OPT/stefan-boltzmann/example_01.cpp
index 2282edf07e3d..cae71a6e2103 100644
--- a/packages/rol/example/PDE-OPT/stefan-boltzmann/example_01.cpp
+++ b/packages/rol/example/PDE-OPT/stefan-boltzmann/example_01.cpp
@@ -13,8 +13,7 @@
 
 #include "Teuchos_Comm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -62,7 +61,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int> > comm
     = Tpetra::getDefaultComm();
   ROL::Ptr<const Teuchos::Comm<int> > serial_comm
diff --git a/packages/rol/example/PDE-OPT/stefan-boltzmann/example_01K.cpp b/packages/rol/example/PDE-OPT/stefan-boltzmann/example_01K.cpp
new file mode 100644
index 000000000000..de8f6c7101d3
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/stefan-boltzmann/example_01K.cpp
@@ -0,0 +1,291 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  example_01.cpp
+    \brief Shows how to solve the stochastic Stefan-Boltzmann problem.
+*/
+
+#include "Teuchos_Comm.hpp"
+#include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
+
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Version.hpp"
+
+#include <iostream>
+#include <algorithm>
+//#include <fenv.h>
+
+#include "ROL_Bounds.hpp"
+#include "ROL_Reduced_Objective_SimOpt.hpp"
+#include "ROL_MonteCarloGenerator.hpp"
+#include "ROL_StochasticProblem.hpp"
+#include "ROL_Solver.hpp"
+#include "ROL_TpetraTeuchosBatchManager.hpp"
+
+#include "../TOOLS/meshmanagerK.hpp"
+#include "../TOOLS/pdeconstraintK.hpp"
+#include "../TOOLS/pdeobjectiveK.hpp"
+#include "../TOOLS/pdevectorK.hpp"
+//#include "../TOOLS/batchmanagerK.hpp"
+#include "pde_stoch_stefan_boltzmannK.hpp"
+#include "obj_stoch_stefan_boltzmannK.hpp"
+#include "mesh_stoch_stefan_boltzmannK.hpp"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+
+template<class Real>
+Real random(const Teuchos::Comm<int> &comm,
+            Real a = -1, Real b = 1) {
+  Real val(0), u(0);
+  if ( Teuchos::rank<int>(comm)==0 ) {
+    u   = static_cast<Real>(rand())/static_cast<Real>(RAND_MAX);
+    val = (b-a)*u + a;
+  }
+  Teuchos::broadcast<int,Real>(comm,0,1,&val);
+  return val;
+}
+
+int main(int argc, char *argv[]) {
+  //feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
+
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+
+  /*** Initialize communicator. ***/
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  Kokkos::ScopeGuard kokkosScope (argc, argv);
+  auto comm = Tpetra::getDefaultComm();
+  auto serial_comm = ROL::makePtr<Teuchos::SerialComm<int>>();
+  const int myRank = comm->getRank();
+  if ((iprint > 0) && (myRank == 0))
+    outStream = ROL::makePtrFromRef(std::cout);
+  else
+    outStream = ROL::makePtrFromRef(bhs);
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+
+    /*** Read in XML input ***/
+    std::string filename = "input.xml";
+    auto parlist = ROL::getParametersFromXmlFile(filename);
+
+    // Problem dimensions
+    const int stochDim = 32;
+    const RealT one(1); 
+
+    /*************************************************************************/
+    /***************** BUILD GOVERNING PDE ***********************************/
+    /*************************************************************************/
+    /*** Initialize main data structure. ***/
+    auto meshMgr = ROL::makePtr<MeshManager_BackwardFacingStepChannel<RealT,DeviceT>>(*parlist);
+    //  = ROL::makePtr<StochasticStefanBoltzmannMeshManager<RealT>>(*parlist);
+    // Initialize Stefan-Boltzmann PDE.
+    auto pde = ROL::makePtr<StochasticStefanBoltzmannPDE<RealT,DeviceT>>(*parlist);
+    auto con = ROL::makePtr<PDE_Constraint<RealT,DeviceT>>(pde,meshMgr,serial_comm,*parlist,*outStream);
+    con->setSolveParameters(*parlist);
+    // Cast the constraint and get the assembler.
+    auto assembler = con->getAssembler();
+
+    /*************************************************************************/
+    /***************** BUILD VECTORS *****************************************/
+    /*************************************************************************/
+    auto  u_ptr = assembler->createStateVector();
+    auto  p_ptr = assembler->createStateVector();
+    auto du_ptr = assembler->createStateVector();
+    u_ptr->randomize();  //u_ptr->putScalar(static_cast<RealT>(1));
+    p_ptr->randomize();  //p_ptr->putScalar(static_cast<RealT>(1));
+    du_ptr->randomize(); //du_ptr->putScalar(static_cast<RealT>(0));
+    auto up  = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(u_ptr,pde,assembler);
+    auto pp  = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(p_ptr,pde,assembler);
+    auto dup = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(du_ptr,pde,assembler);
+    // Create residual vectors
+    auto r_ptr = assembler->createResidualVector();
+    r_ptr->randomize(); //r_ptr->putScalar(static_cast<RealT>(1));
+    auto rp = ROL::makePtr<PDE_DualSimVector<RealT,DeviceT>>(r_ptr,pde,assembler);
+    // Create control vector and set to ones
+    auto  z_ptr = assembler->createControlVector();
+    auto dz_ptr = assembler->createControlVector();
+    auto yz_ptr = assembler->createControlVector();
+    z_ptr->randomize();  z_ptr->putScalar(static_cast<RealT>(280));
+    dz_ptr->randomize(); //dz_ptr->putScalar(static_cast<RealT>(0));
+    yz_ptr->randomize(); //yz_ptr->putScalar(static_cast<RealT>(0));
+    auto zp  = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(z_ptr,pde,assembler);
+    auto dzp = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(dz_ptr,pde,assembler);
+    auto yzp = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(yz_ptr,pde,assembler);
+    //auto zp  = ROL::makePtr<PDE_OptVector<RealT>>(zpde);
+    //auto dzp = ROL::makePtr<PDE_OptVector<RealT>>(dzpde);
+    //auto yzp = ROL::makePtr<PDE_OptVector<RealT>>(yzpde);
+    // Create ROL SimOpt vectors
+    ROL::Vector_SimOpt<RealT> x(up,zp);
+    ROL::Vector_SimOpt<RealT> d(dup,dzp);
+
+    /*************************************************************************/
+    /***************** BUILD COST FUNCTIONAL *********************************/
+    /*************************************************************************/
+    std::vector<ROL::Ptr<QoI<RealT,DeviceT>>> qoi_vec(2,ROL::nullPtr);
+    qoi_vec[0] = ROL::makePtr<QoI_StateCost<RealT,DeviceT>>(pde->getVolFE(),*parlist);
+    qoi_vec[1] = ROL::makePtr<QoI_ControlCost<RealT,DeviceT>>(
+      pde->getVolFE(),pde->getBdryFE(0),pde->getBdryCellLocIds(0),*parlist);
+    auto std_obj = ROL::makePtr<StochasticStefanBoltzmannStdObjective<RealT>>(*parlist);
+    auto obj = ROL::makePtr<PDE_Objective<RealT,DeviceT>>(qoi_vec,std_obj,assembler);
+    auto objReduced = ROL::makePtr<ROL::Reduced_Objective_SimOpt<RealT>>(obj, con, up, zp, pp, true, false);
+
+    /*************************************************************************/
+    /***************** BUILD BOUND CONSTRAINT ********************************/
+    /*************************************************************************/
+    auto zlo_ptr = assembler->createControlVector();
+    auto zhi_ptr = assembler->createControlVector();
+    zlo_ptr->putScalar(static_cast<RealT>(280));
+    zhi_ptr->putScalar(static_cast<RealT>(370));
+    auto zlop = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(zlo_ptr,pde,assembler);
+    auto zhip = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(zhi_ptr,pde,assembler);
+    //auto zlop = ROL::makePtr<PDE_OptVector<RealT>>(zlopde);
+    //auto zhip = ROL::makePtr<PDE_OptVector<RealT>>(zhipde);
+    auto bnd = ROL::makePtr<ROL::Bounds<RealT>>(zlop,zhip);
+
+    /*************************************************************************/
+    /***************** BUILD SAMPLER *****************************************/
+    /*************************************************************************/
+    int nsamp = parlist->sublist("Problem").get("Number of Samples",100);
+    std::vector<RealT> tmp = {-one,one};
+    std::vector<std::vector<RealT>> bounds(stochDim,tmp);
+    auto bman = ROL::makePtr<ROL::TpetraTeuchosBatchManager<RealT>>(comm);
+    //auto bman = ROL::makePtr<PDE_OptVector_BatchManager<RealT,DeviceT>>(comm);
+    auto sampler = ROL::makePtr<ROL::MonteCarloGenerator<RealT>>(nsamp,bounds,bman);
+
+    /*************************************************************************/
+    /***************** BUILD STOCHASTIC PROBLEM ******************************/
+    /*************************************************************************/
+    auto opt = ROL::makePtr<ROL::StochasticProblem<RealT>>(objReduced,zp);
+    opt->addBoundConstraint(bnd);
+    parlist->sublist("SOL").set("Initial Statistic", one);
+    opt->makeObjectiveStochastic(*parlist,sampler);
+    opt->finalize(false,true,*outStream);
+
+    /*************************************************************************/
+    /***************** RUN VECTOR AND DERIVATIVE CHECKS **********************/
+    /*************************************************************************/
+    bool checkDeriv = parlist->sublist("Problem").get("Check Derivatives",false);
+    if ( checkDeriv ) {
+      up->checkVector(*pp,*dup,true,*outStream);
+      zp->checkVector(*yzp,*dzp,true,*outStream);
+      std::vector<RealT> param(stochDim,0);
+      objReduced->setParameter(param);
+      obj->checkGradient(x,d,true,*outStream);
+      obj->checkHessVec(x,d,true,*outStream);
+      con->setParameter(param);
+      con->checkApplyJacobian(x,d,*up,true,*outStream);
+      con->checkApplyAdjointHessian(x,*dup,d,x,true,*outStream);
+      con->checkAdjointConsistencyJacobian(*dup,d,x,true,*outStream);
+      con->checkInverseJacobian_1(*up,*up,*up,*zp,true,*outStream);
+      con->checkInverseAdjointJacobian_1(*up,*up,*up,*zp,true,*outStream);
+      objReduced->checkGradient(*zp,*dzp,true,*outStream);
+      objReduced->checkHessVec(*zp,*dzp,true,*outStream);
+      opt->check(true,*outStream);
+    }
+
+    /*************************************************************************/
+    /***************** SOLVE OPTIMIZATION PROBLEM ****************************/
+    /*************************************************************************/
+    parlist->sublist("Step").set("Type","Trust Region");
+    ROL::Solver<RealT> solver(opt,*parlist);
+    //zp->zero();
+    std::clock_t timer = std::clock();
+    solver.solve(*outStream);
+    *outStream << "Optimization time: "
+               << static_cast<RealT>(std::clock()-timer)/static_cast<RealT>(CLOCKS_PER_SEC)
+               << " seconds." << std::endl << std::endl;
+
+    /*************************************************************************/
+    /***************** OUTPUT RESULTS ****************************************/
+    /*************************************************************************/
+    std::clock_t timer_print = std::clock();
+    assembler->printMeshData(*outStream);
+    // Output control to file
+    con->outputTpetraVector(z_ptr,"control.txt");
+    // Output expected state and samples to file
+    up->zero(); pp->zero(); dup->zero();
+    RealT tol(1.e-8);
+    auto bman_Eu = ROL::makePtr<ROL::TpetraTeuchosBatchManager<RealT>>(comm);
+    std::vector<RealT> sample(stochDim);
+    std::stringstream name_samp;
+    name_samp << "samples_" << bman->batchID() << ".txt";
+    std::ofstream file_samp;
+    file_samp.open(name_samp.str());
+    file_samp << std::scientific << std::setprecision(15);
+    for (int i = 0; i < sampler->numMySamples(); ++i) {
+      sample = sampler->getMyPoint(i);
+      con->setParameter(sample);
+      con->solve(*rp,*dup,*zp,tol);
+      up->axpy(sampler->getMyWeight(i),*dup);
+      for (int j = 0; j < stochDim; ++j) {
+        file_samp << std::setw(25) << std::left << sample[j];
+      }
+      file_samp << std::endl;
+    }
+    file_samp.close();
+    bman_Eu->sumAll(*up,*pp);
+    con->outputTpetraVector(p_ptr,"mean_state.txt");
+    // Build objective function distribution
+    RealT val1(0), val2(0);
+    int nsamp_dist = parlist->sublist("Problem").get("Number of Output Samples",100);
+    auto stateCost = ROL::makePtr<IntegralObjective<RealT,DeviceT>>(qoi_vec[0],assembler);
+    auto redStateCost = ROL::makePtr<ROL::Reduced_Objective_SimOpt<RealT>>(stateCost, con, up, zp, pp, true, false);
+    auto ctrlCost = ROL::makePtr<IntegralObjective<RealT,DeviceT>>(qoi_vec[1],assembler);
+    auto redCtrlCost = ROL::makePtr<ROL::Reduced_Objective_SimOpt<RealT>>(ctrlCost, con, up, zp, pp, true, false);
+    auto sampler_dist = ROL::makePtr<ROL::MonteCarloGenerator<RealT>>(nsamp_dist,bounds,bman);
+    std::stringstream name;
+    name << "obj_samples_" << bman->batchID() << ".txt";
+    std::ofstream file;
+    file.open(name.str());
+    file << std::scientific << std::setprecision(15);
+    for (int i = 0; i < sampler_dist->numMySamples(); ++i) {
+      sample = sampler_dist->getMyPoint(i);
+      redStateCost->setParameter(sample);
+      val1 = redStateCost->value(*zp,tol);
+      redCtrlCost->setParameter(sample);
+      val2 = redCtrlCost->value(*zp,tol);
+      for (int j = 0; j < stochDim; ++j) {
+        file << std::setw(25) << std::left << sample[j];
+      }
+      file << std::setw(25) << std::left << val1;
+      file << std::setw(25) << std::left << val2 << std::endl;
+    }
+    file.close();
+    *outStream << "Output time: "
+               << static_cast<RealT>(std::clock()-timer_print)/static_cast<RealT>(CLOCKS_PER_SEC)
+               << " seconds." << std::endl << std::endl;
+
+    Teuchos::Array<RealT> res(1,0);
+    con->solve(*rp,*up,*zp,tol);
+    r_ptr->norm2(res.view(0,1));
+
+    /*************************************************************************/
+    /***************** CHECK RESIDUAL NORM ***********************************/
+    /*************************************************************************/
+    *outStream << "Residual Norm: " << res[0] << std::endl << std::endl;
+    errorFlag += (res[0] > 1.e-6 ? 1 : 0);
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/stefan-boltzmann/example_02.cpp b/packages/rol/example/PDE-OPT/stefan-boltzmann/example_02.cpp
index ccc09e830028..dc310c662445 100644
--- a/packages/rol/example/PDE-OPT/stefan-boltzmann/example_02.cpp
+++ b/packages/rol/example/PDE-OPT/stefan-boltzmann/example_02.cpp
@@ -13,8 +13,7 @@
 
 #include "Teuchos_Comm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -61,7 +60,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int> > comm
     = Tpetra::getDefaultComm();
   ROL::Ptr<const Teuchos::Comm<int> > serial_comm
diff --git a/packages/rol/example/PDE-OPT/stefan-boltzmann/example_02K.cpp b/packages/rol/example/PDE-OPT/stefan-boltzmann/example_02K.cpp
new file mode 100644
index 000000000000..ad4d3d8b3545
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/stefan-boltzmann/example_02K.cpp
@@ -0,0 +1,288 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  example_01.cpp
+    \brief Shows how to solve the stochastic Stefan-Boltzmann problem.
+*/
+
+#include "Teuchos_Comm.hpp"
+#include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
+
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Version.hpp"
+
+#include <iostream>
+#include <algorithm>
+//#include <fenv.h>
+
+#include "ROL_Bounds.hpp"
+#include "ROL_Reduced_Objective_SimOpt.hpp"
+#include "ROL_MonteCarloGenerator.hpp"
+#include "ROL_StochasticProblem.hpp"
+#include "ROL_Solver.hpp"
+#include "ROL_TpetraTeuchosBatchManager.hpp"
+
+#include "../TOOLS/meshmanagerK.hpp"
+#include "../TOOLS/pdeconstraintK.hpp"
+#include "../TOOLS/pdeobjectiveK.hpp"
+#include "../TOOLS/pdevectorK.hpp"
+#include "../TOOLS/batchmanagerK.hpp"
+#include "pde_stoch_stefan_boltzmannK.hpp"
+#include "obj_stoch_stefan_boltzmannK.hpp"
+#include "mesh_stoch_stefan_boltzmannK.hpp"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+
+template<class Real>
+Real random(const Teuchos::Comm<int> &comm,
+            Real a = -1, Real b = 1) {
+  Real val(0), u(0);
+  if ( Teuchos::rank<int>(comm)==0 ) {
+    u   = static_cast<Real>(rand())/static_cast<Real>(RAND_MAX);
+    val = (b-a)*u + a;
+  }
+  Teuchos::broadcast<int,Real>(comm,0,1,&val);
+  return val;
+}
+
+int main(int argc, char *argv[]) {
+  //feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
+
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+
+  /*** Initialize communicator. ***/
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  Kokkos::ScopeGuard kokkosScope (argc, argv);
+  auto comm = Tpetra::getDefaultComm();
+  auto serial_comm = ROL::makePtr<Teuchos::SerialComm<int>>();
+  const int myRank = comm->getRank();
+  if ((iprint > 0) && (myRank == 0))
+    outStream = ROL::makePtrFromRef(std::cout);
+  else
+    outStream = ROL::makePtrFromRef(bhs);
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+
+    /*** Read in XML input ***/
+    std::string filename = "input.xml";
+    auto parlist = ROL::getParametersFromXmlFile(filename);
+
+    // Problem dimensions
+    const int stochDim = 32, controlDim = 1;
+    const RealT one(1); 
+
+    /*************************************************************************/
+    /***************** BUILD GOVERNING PDE ***********************************/
+    /*************************************************************************/
+    /*** Initialize main data structure. ***/
+    auto meshMgr = ROL::makePtr<MeshManager_BackwardFacingStepChannel<RealT,DeviceT>>(*parlist);
+    //  = ROL::makePtr<StochasticStefanBoltzmannMeshManager<RealT,DeviceT>>(*parlist);
+    // Initialize Stefan-Boltzmann PDE.
+    auto pde = ROL::makePtr<StochasticStefanBoltzmannPDE<RealT,DeviceT>>(*parlist);
+    auto con = ROL::makePtr<PDE_Constraint<RealT,DeviceT>>(pde,meshMgr,serial_comm,*parlist,*outStream);
+    // Cast the constraint and get the assembler.
+    auto assembler = con->getAssembler();
+    con->setSolveParameters(*parlist);
+
+    /*************************************************************************/
+    /***************** BUILD VECTORS *****************************************/
+    /*************************************************************************/
+    auto  u_ptr = assembler->createStateVector();
+    auto  p_ptr = assembler->createStateVector();
+    auto du_ptr = assembler->createStateVector();
+    u_ptr->randomize();  //u_ptr->putScalar(static_cast<RealT>(1));
+    p_ptr->randomize();  //p_ptr->putScalar(static_cast<RealT>(1));
+    du_ptr->randomize(); //du_ptr->putScalar(static_cast<RealT>(0));
+    auto up  = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(u_ptr,pde,assembler);
+    auto pp  = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(p_ptr,pde,assembler);
+    auto dup = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(du_ptr,pde,assembler);
+    // Create residual vectors
+    auto r_ptr = assembler->createResidualVector();
+    r_ptr->randomize(); //r_ptr->putScalar(static_cast<RealT>(1));
+    auto rp = ROL::makePtr<PDE_DualSimVector<RealT,DeviceT>>(r_ptr,pde,assembler);
+    // Create control vector and set to ones
+    auto  z_ptr = ROL::makePtr<std::vector<RealT>>(controlDim);
+    auto dz_ptr = ROL::makePtr<std::vector<RealT>>(controlDim);
+    auto yz_ptr = ROL::makePtr<std::vector<RealT>>(controlDim);
+    // Create control direction vector and set to random
+    for (int i = 0; i < controlDim; ++i) {
+      (*z_ptr)[i]  = random<RealT>(*comm);
+      (*dz_ptr)[i] = 1e-3*random<RealT>(*comm);
+      (*yz_ptr)[i] = random<RealT>(*comm);
+    }
+    auto zp  = ROL::makePtr<PDE_OptVector<RealT>>(ROL::makePtr<ROL::StdVector<RealT>>(z_ptr));
+    auto dzp = ROL::makePtr<PDE_OptVector<RealT>>(ROL::makePtr<ROL::StdVector<RealT>>(dz_ptr));
+    auto yzp = ROL::makePtr<PDE_OptVector<RealT>>(ROL::makePtr<ROL::StdVector<RealT>>(yz_ptr));
+    // Create ROL SimOpt vectors
+    ROL::Vector_SimOpt<RealT> x(up,zp);
+    ROL::Vector_SimOpt<RealT> d(dup,dzp);
+
+    /*************************************************************************/
+    /***************** BUILD COST FUNCTIONAL *********************************/
+    /*************************************************************************/
+    std::vector<ROL::Ptr<QoI<RealT,DeviceT>>> qoi_vec(2,ROL::nullPtr);
+    qoi_vec[0] = ROL::makePtr<QoI_StateCost<RealT,DeviceT>>(pde->getVolFE(),*parlist);
+    qoi_vec[1] = ROL::makePtr<QoI_AdvectionCost<RealT,DeviceT>>();
+    auto std_obj = ROL::makePtr<StochasticStefanBoltzmannStdObjective<RealT>>(*parlist);
+    auto obj = ROL::makePtr<PDE_Objective<RealT,DeviceT>>(qoi_vec,std_obj,assembler);
+    auto objReduced = ROL::makePtr<ROL::Reduced_Objective_SimOpt<RealT>>(obj, con, up, zp, pp, true, false);
+
+    /*************************************************************************/
+    /***************** BUILD BOUND CONSTRAINT ********************************/
+    /*************************************************************************/
+    RealT upper = parlist->sublist("Problem").get("Upper Advection Bound", 100.0);
+    RealT lower = parlist->sublist("Problem").get("Lower Advection Bound",-100.0);
+    auto zlo_ptr = ROL::makePtr<std::vector<RealT>>(controlDim,lower);
+    auto zhi_ptr = ROL::makePtr<std::vector<RealT>>(controlDim,upper);
+    auto zlop = ROL::makePtr<PDE_OptVector<RealT>>(ROL::makePtr<ROL::StdVector<RealT>>(zlo_ptr));
+    auto zhip = ROL::makePtr<PDE_OptVector<RealT>>(ROL::makePtr<ROL::StdVector<RealT>>(zhi_ptr));
+    auto bnd = ROL::makePtr<ROL::Bounds<RealT>>(zlop,zhip);
+
+    /*************************************************************************/
+    /***************** BUILD SAMPLER *****************************************/
+    /*************************************************************************/
+    int nsamp = parlist->sublist("Problem").get("Number of Samples",100);
+    std::vector<RealT> tmp = {-one,one};
+    std::vector<std::vector<RealT>> bounds(stochDim,tmp);
+    auto bman = ROL::makePtr<PDE_OptVector_BatchManager<RealT>>(comm);
+    auto sampler = ROL::makePtr<ROL::MonteCarloGenerator<RealT>>(nsamp,bounds,bman);
+
+    /*************************************************************************/
+    /***************** BUILD STOCHASTIC PROBLEM ******************************/
+    /*************************************************************************/
+    auto opt = ROL::makePtr<ROL::StochasticProblem<RealT>>(objReduced,zp);
+    opt->addBoundConstraint(bnd);
+    parlist->sublist("SOL").set("Initial Statistic", one);
+    opt->makeObjectiveStochastic(*parlist,sampler);
+    opt->finalize(false,true,*outStream);
+
+    /*************************************************************************/
+    /***************** RUN VECTOR AND DERIVATIVE CHECKS **********************/
+    /*************************************************************************/
+    bool checkDeriv = parlist->sublist("Problem").get("Check Derivatives",false);
+    if ( checkDeriv ) {
+      up->checkVector(*pp,*dup,true,*outStream);
+      zp->checkVector(*yzp,*dzp,true,*outStream);
+      obj->checkGradient(x,d,true,*outStream);
+      obj->checkHessVec(x,d,true,*outStream);
+      con->checkApplyJacobian(x,d,*up,true,*outStream);
+      con->checkApplyAdjointHessian(x,*dup,d,x,true,*outStream);
+      con->checkAdjointConsistencyJacobian(*dup,d,x,true,*outStream);
+      con->checkInverseJacobian_1(*up,*up,*up,*zp,true,*outStream);
+      con->checkInverseAdjointJacobian_1(*up,*up,*up,*zp,true,*outStream);
+      objReduced->checkGradient(*zp,*dzp,true,*outStream);
+      objReduced->checkHessVec(*zp,*dzp,true,*outStream);
+      opt->check(true,*outStream);
+    }
+
+    /*************************************************************************/
+    /***************** SOLVE OPTIMIZATION PROBLEM ****************************/
+    /*************************************************************************/
+    parlist->sublist("Step").set("Type","Trust Region");
+    ROL::Solver<RealT> solver(opt,*parlist);
+    (*z_ptr)[0] = parlist->sublist("Problem").get("Advection Magnitude",0.0);
+    u_ptr->putScalar(450.0);
+    std::clock_t timer = std::clock();
+    solver.solve(*outStream);
+    *outStream << "Optimization time: "
+               << static_cast<RealT>(std::clock()-timer)/static_cast<RealT>(CLOCKS_PER_SEC)
+               << " seconds." << std::endl << std::endl;
+
+    /*************************************************************************/
+    /***************** OUTPUT RESULTS ****************************************/
+    /*************************************************************************/
+    std::clock_t timer_print = std::clock();
+    assembler->printMeshData(*outStream);
+    // Output control to file
+    *outStream << std::endl << "Advection value: " << (*z_ptr)[0] << std::endl;
+    // Output expected state and samples to file
+    *outStream << std::endl << "Print Expected Value of State" << std::endl;
+    up->zero(); pp->zero(); dup->zero();
+    RealT tol(1.e-8);
+    auto bman_Eu = ROL::makePtr<ROL::TpetraTeuchosBatchManager<RealT>>(comm);
+    std::vector<RealT> sample(stochDim);
+    std::stringstream name_samp;
+    name_samp << "samples_" << bman->batchID() << ".txt";
+    std::ofstream file_samp;
+    file_samp.open(name_samp.str());
+    file_samp << std::scientific << std::setprecision(15);
+    for (int i = 0; i < sampler->numMySamples(); ++i) {
+      *outStream << "Sample i = " << i << std::endl;
+      sample = sampler->getMyPoint(i);
+      con->setParameter(sample);
+      con->solve(*rp,*dup,*zp,tol);
+      up->axpy(sampler->getMyWeight(i),*dup);
+      for (int j = 0; j < stochDim; ++j) {
+        file_samp << std::setw(25) << std::left << sample[j];
+      }
+      file_samp << std::endl;
+    }
+    file_samp.close();
+    bman_Eu->sumAll(*up,*pp);
+    con->outputTpetraVector(p_ptr,"mean_state.txt");
+    // Build objective function distribution
+    *outStream << std::endl << "Print Objective CDF" << std::endl;
+    RealT val1(0), val2(0);
+    int nsamp_dist = parlist->sublist("Problem").get("Number of Output Samples",100);
+    auto stateCost = ROL::makePtr<IntegralObjective<RealT,DeviceT>>(qoi_vec[0],assembler);
+    auto redStateCost = ROL::makePtr<ROL::Reduced_Objective_SimOpt<RealT>>(stateCost, con, up, zp, pp, true, false);
+    auto ctrlCost = ROL::makePtr<IntegralObjective<RealT,DeviceT>>(qoi_vec[1],assembler);
+    auto redCtrlCost = ROL::makePtr<ROL::Reduced_Objective_SimOpt<RealT>>(ctrlCost, con, up, zp, pp, true, false);
+    auto sampler_dist = ROL::makePtr<ROL::MonteCarloGenerator<RealT>>(nsamp_dist,bounds,bman);
+    std::stringstream name;
+    name << "obj_samples_" << bman->batchID() << ".txt";
+    std::ofstream file;
+    file.open(name.str());
+    file << std::scientific << std::setprecision(15);
+    for (int i = 0; i < sampler_dist->numMySamples(); ++i) {
+      sample = sampler_dist->getMyPoint(i);
+      redStateCost->setParameter(sample);
+      val1 = redStateCost->value(*zp,tol);
+      redCtrlCost->setParameter(sample);
+      val2 = redCtrlCost->value(*zp,tol);
+      for (int j = 0; j < stochDim; ++j) {
+        file << std::setw(25) << std::left << sample[j];
+      }
+      file << std::setw(25) << std::left << val1;
+      file << std::setw(25) << std::left << val2 << std::endl;
+    }
+    file.close();
+    *outStream << "Output time: "
+               << static_cast<RealT>(std::clock()-timer_print)/static_cast<RealT>(CLOCKS_PER_SEC)
+               << " seconds." << std::endl << std::endl;
+
+    Teuchos::Array<RealT> res(1,0);
+    con->solve(*rp,*up,*zp,tol);
+    r_ptr->norm2(res.view(0,1));
+
+    /*************************************************************************/
+    /***************** CHECK RESIDUAL NORM ***********************************/
+    /*************************************************************************/
+    *outStream << "Residual Norm: " << res[0] << std::endl << std::endl;
+    errorFlag += (res[0] > 1.e-6 ? 1 : 0);
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/stefan-boltzmann/example_03.cpp b/packages/rol/example/PDE-OPT/stefan-boltzmann/example_03.cpp
index 3e712756f26a..f1dd1093646e 100644
--- a/packages/rol/example/PDE-OPT/stefan-boltzmann/example_03.cpp
+++ b/packages/rol/example/PDE-OPT/stefan-boltzmann/example_03.cpp
@@ -13,8 +13,7 @@
 
 #include "Teuchos_Comm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -61,7 +60,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int> > comm
     = Tpetra::getDefaultComm();
   ROL::Ptr<const Teuchos::Comm<int> > serial_comm
diff --git a/packages/rol/example/PDE-OPT/stefan-boltzmann/example_03K.cpp b/packages/rol/example/PDE-OPT/stefan-boltzmann/example_03K.cpp
new file mode 100644
index 000000000000..0cd439d8e300
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/stefan-boltzmann/example_03K.cpp
@@ -0,0 +1,317 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  example_01.cpp
+    \brief Shows how to solve the stochastic Stefan-Boltzmann problem.
+*/
+
+#include "Teuchos_Comm.hpp"
+#include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
+
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Version.hpp"
+
+#include <iostream>
+#include <algorithm>
+//#include <fenv.h>
+
+#include "ROL_Bounds.hpp"
+#include "ROL_Reduced_Objective_SimOpt.hpp"
+#include "ROL_MonteCarloGenerator.hpp"
+#include "ROL_StochasticProblem.hpp"
+#include "ROL_Solver.hpp"
+#include "ROL_TpetraTeuchosBatchManager.hpp"
+
+#include "../TOOLS/meshmanagerK.hpp"
+#include "../TOOLS/pdeconstraintK.hpp"
+#include "../TOOLS/pdeobjectiveK.hpp"
+#include "../TOOLS/pdevectorK.hpp"
+#include "../TOOLS/batchmanagerK.hpp"
+#include "pde_stoch_stefan_boltzmannK.hpp"
+#include "obj_stoch_stefan_boltzmannK.hpp"
+#include "mesh_stoch_stefan_boltzmannK.hpp"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+
+template<class Real>
+Real random(const Teuchos::Comm<int> &comm,
+            Real a = -1, Real b = 1) {
+  Real val(0), u(0);
+  if ( Teuchos::rank<int>(comm)==0 ) {
+    u   = static_cast<Real>(rand())/static_cast<Real>(RAND_MAX);
+    val = (b-a)*u + a;
+  }
+  Teuchos::broadcast<int,Real>(comm,0,1,&val);
+  return val;
+}
+
+int main(int argc, char *argv[]) {
+  //feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
+
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+
+  /*** Initialize communicator. ***/
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  Kokkos::ScopeGuard kokkosScope (argc, argv);
+  auto comm = Tpetra::getDefaultComm();
+  auto serial_comm = ROL::makePtr<Teuchos::SerialComm<int>>();
+  const int myRank = comm->getRank();
+  if ((iprint > 0) && (myRank == 0))
+    outStream = ROL::makePtrFromRef(std::cout);
+  else
+    outStream = ROL::makePtrFromRef(bhs);
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+
+    /*** Read in XML input ***/
+    std::string filename = "input.xml";
+    auto parlist = ROL::getParametersFromXmlFile(filename);
+
+    // Problem dimensions
+    const int stochDim = 32, controlDim = 1;
+    const RealT one(1); 
+
+    /*************************************************************************/
+    /***************** BUILD GOVERNING PDE ***********************************/
+    /*************************************************************************/
+    /*** Initialize main data structure. ***/
+    auto meshMgr = ROL::makePtr<MeshManager_BackwardFacingStepChannel<RealT,DeviceT>>(*parlist);
+    //  = ROL::makePtr<StochasticStefanBoltzmannMeshManager<RealT,DeviceT>>(*parlist);
+    // Initialize PDE describing advection-diffusion equation
+    auto pde = ROL::makePtr<StochasticStefanBoltzmannPDE<RealT,DeviceT>>(*parlist);
+    auto con = ROL::makePtr<PDE_Constraint<RealT,DeviceT>>(pde,meshMgr,serial_comm,*parlist,*outStream);
+    // Cast the constraint and get the assembler.
+    auto assembler = con->getAssembler();
+    con->setSolveParameters(*parlist);
+
+    /*************************************************************************/
+    /***************** BUILD VECTORS *****************************************/
+    /*************************************************************************/
+    auto  u_ptr = assembler->createStateVector();
+    auto  p_ptr = assembler->createStateVector();
+    auto du_ptr = assembler->createStateVector();
+    u_ptr->randomize();  //u_ptr->putScalar(static_cast<RealT>(1));
+    p_ptr->randomize();  //p_ptr->putScalar(static_cast<RealT>(1));
+    du_ptr->randomize(); //du_ptr->putScalar(static_cast<RealT>(0));
+    auto up  = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(u_ptr,pde,assembler);
+    auto pp  = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(p_ptr,pde,assembler);
+    auto dup = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(du_ptr,pde,assembler);
+    // Create residual vectors
+    auto r_ptr = assembler->createResidualVector();
+    r_ptr->randomize(); //r_ptr->putScalar(static_cast<RealT>(1));
+    auto rp = ROL::makePtr<PDE_DualSimVector<RealT,DeviceT>>(r_ptr,pde,assembler);
+    // Create control vector --- FIELD
+    auto  zbc_ptr = assembler->createControlVector();
+    auto dzbc_ptr = assembler->createControlVector();
+    auto yzbc_ptr = assembler->createControlVector();
+    zbc_ptr->randomize();  zbc_ptr->putScalar(static_cast<RealT>(280));
+    dzbc_ptr->randomize(); //dzbc_ptr->putScalar(static_cast<RealT>(0));
+    yzbc_ptr->randomize(); //yzbc_ptr->putScalar(static_cast<RealT>(0));
+    auto  zbc = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(zbc_ptr,pde,assembler);
+    auto dzbc = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(dzbc_ptr,pde,assembler);
+    auto yzbc = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(yzbc_ptr,pde,assembler);
+    // Create control vector --- PARAMETER
+    auto  zp_ptr = ROL::makePtr<std::vector<RealT>>(controlDim);
+    auto dzp_ptr = ROL::makePtr<std::vector<RealT>>(controlDim);
+    auto yzp_ptr = ROL::makePtr<std::vector<RealT>>(controlDim);
+    for (int i = 0; i < controlDim; ++i) {
+      (*zp_ptr)[i]  = random<RealT>(*comm);
+      (*dzp_ptr)[i] = random<RealT>(*comm);
+      (*yzp_ptr)[i] = random<RealT>(*comm);
+    }
+    auto  zparam = ROL::makePtr<ROL::StdVector<RealT>>(zp_ptr);
+    auto dzparam = ROL::makePtr<ROL::StdVector<RealT>>(dzp_ptr);
+    auto yzparam = ROL::makePtr<ROL::StdVector<RealT>>(yzp_ptr);
+    // Create OptVectors
+    auto  zp = ROL::makePtr<PDE_OptVector<RealT>>(zbc, zparam);
+    auto dzp = ROL::makePtr<PDE_OptVector<RealT>>(dzbc, dzparam);
+    auto yzp = ROL::makePtr<PDE_OptVector<RealT>>(yzbc, yzparam);
+    // Create ROL SimOpt vectors
+    ROL::Vector_SimOpt<RealT> x(up,zp);
+    ROL::Vector_SimOpt<RealT> d(dup,dzp);
+
+    /*************************************************************************/
+    /***************** BUILD COST FUNCTIONAL *********************************/
+    /*************************************************************************/
+    std::vector<ROL::Ptr<QoI<RealT,DeviceT>>> qoi_vec(3,ROL::nullPtr);
+    qoi_vec[0] = ROL::makePtr<QoI_StateCost<RealT,DeviceT>>(pde->getVolFE(),*parlist);
+    qoi_vec[1] = ROL::makePtr<QoI_ControlCost<RealT,DeviceT>>(
+      pde->getVolFE(),pde->getBdryFE(0),pde->getBdryCellLocIds(0),*parlist);
+    qoi_vec[2] = ROL::makePtr<QoI_AdvectionCost<RealT,DeviceT>>();
+    auto std_obj = ROL::makePtr<StochasticStefanBoltzmannStdObjective3<RealT>>(*parlist);
+    auto obj = ROL::makePtr<PDE_Objective<RealT,DeviceT>>(qoi_vec,std_obj,assembler);
+    auto objReduced = ROL::makePtr<ROL::Reduced_Objective_SimOpt<RealT>>(obj, con, up, zp, pp, true, false);
+
+    /*************************************************************************/
+    /***************** BUILD BOUND CONSTRAINT ********************************/
+    /*************************************************************************/
+    // Bounds for boundary control
+    RealT lower_bc = parlist->sublist("Problem").get("Lower Control Bound", 280.0);
+    RealT upper_bc = parlist->sublist("Problem").get("Upper Control Bound", 370.0);
+    auto zlo_bc_ptr = assembler->createControlVector();
+    auto zhi_bc_ptr = assembler->createControlVector();
+    zlo_bc_ptr->putScalar(static_cast<RealT>(lower_bc));
+    zhi_bc_ptr->putScalar(static_cast<RealT>(upper_bc));
+    auto zlo_bc = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(zlo_bc_ptr,pde,assembler);
+    auto zhi_bc = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(zhi_bc_ptr,pde,assembler);
+    // Bounds for advection control
+    RealT lower = parlist->sublist("Problem").get("Lower Advection Bound",-100.0);
+    RealT upper = parlist->sublist("Problem").get("Upper Advection Bound", 100.0);
+    auto zlo_param_ptr = ROL::makePtr<std::vector<RealT>>(controlDim,lower);
+    auto zhi_param_ptr = ROL::makePtr<std::vector<RealT>>(controlDim,upper);
+    auto zlo_adv = ROL::makePtr<ROL::StdVector<RealT>>(zlo_param_ptr);
+    auto zhi_adv = ROL::makePtr<ROL::StdVector<RealT>>(zhi_param_ptr);
+    // Combined bounds
+    auto zlop = ROL::makePtr<PDE_OptVector<RealT>>(zlo_bc, zlo_adv);
+    auto zhip = ROL::makePtr<PDE_OptVector<RealT>>(zhi_bc, zhi_adv);
+    auto bnd = ROL::makePtr<ROL::Bounds<RealT>>(zlop,zhip);
+
+    /*************************************************************************/
+    /***************** BUILD SAMPLER *****************************************/
+    /*************************************************************************/
+    int nsamp = parlist->sublist("Problem").get("Number of Samples",100);
+    std::vector<RealT> tmp = {-one,one};
+    std::vector<std::vector<RealT>> bounds(stochDim,tmp);
+    auto bman = ROL::makePtr<PDE_OptVector_BatchManager<RealT>>(comm);
+    auto sampler = ROL::makePtr<ROL::MonteCarloGenerator<RealT>>(nsamp,bounds,bman);
+
+    /*************************************************************************/
+    /***************** BUILD STOCHASTIC PROBLEM ******************************/
+    /*************************************************************************/
+    auto opt = ROL::makePtr<ROL::StochasticProblem<RealT>>(objReduced,zp);
+    opt->addBoundConstraint(bnd);
+    parlist->sublist("SOL").set("Initial Statistic", one);
+    opt->makeObjectiveStochastic(*parlist,sampler);
+    opt->finalize(false,true,*outStream);
+
+    /*************************************************************************/
+    /***************** RUN VECTOR AND DERIVATIVE CHECKS **********************/
+    /*************************************************************************/
+    bool checkDeriv = parlist->sublist("Problem").get("Check Derivatives",false);
+    if ( checkDeriv ) {
+      up->checkVector(*pp,*dup,true,*outStream);
+      zp->checkVector(*yzp,*dzp,true,*outStream);
+      obj->checkGradient(x,d,true,*outStream);
+      obj->checkHessVec(x,d,true,*outStream);
+      con->checkApplyJacobian(x,d,*up,true,*outStream);
+      con->checkApplyAdjointHessian(x,*dup,d,x,true,*outStream);
+      con->checkAdjointConsistencyJacobian(*dup,d,x,true,*outStream);
+      con->checkInverseJacobian_1(*up,*up,*up,*zp,true,*outStream);
+      con->checkInverseAdjointJacobian_1(*up,*up,*up,*zp,true,*outStream);
+      objReduced->checkGradient(*zp,*dzp,true,*outStream);
+      objReduced->checkHessVec(*zp,*dzp,true,*outStream);
+      opt->check(true,*outStream);
+    }
+
+    /*************************************************************************/
+    /***************** SOLVE OPTIMIZATION PROBLEM ****************************/
+    /*************************************************************************/
+    parlist->sublist("Step").set("Type","Trust Region");
+    ROL::Solver<RealT> solver(opt,*parlist);
+    (*zp_ptr)[0] = parlist->sublist("Problem").get("Advection Magnitude",0.0);
+    u_ptr->putScalar(450.0);
+    std::clock_t timer = std::clock();
+    solver.solve(*outStream);
+    *outStream << "Optimization time: "
+               << static_cast<RealT>(std::clock()-timer)/static_cast<RealT>(CLOCKS_PER_SEC)
+               << " seconds." << std::endl << std::endl;
+
+    /*************************************************************************/
+    /***************** OUTPUT RESULTS ****************************************/
+    /*************************************************************************/
+    std::clock_t timer_print = std::clock();
+    assembler->printMeshData(*outStream);
+    // Output control to file
+    con->outputTpetraVector(zbc_ptr,"control.txt");
+    *outStream << std::endl << "Advection value: " << (*zp_ptr)[0] << std::endl;
+    // Output expected state and samples to file
+    *outStream << std::endl << "Print Expected Value of State" << std::endl;
+    up->zero(); pp->zero(); dup->zero();
+    RealT tol(1.e-8);
+    auto bman_Eu = ROL::makePtr<ROL::TpetraTeuchosBatchManager<RealT>>(comm);
+    std::vector<RealT> sample(stochDim);
+    std::stringstream name_samp;
+    name_samp << "samples_" << bman->batchID() << ".txt";
+    std::ofstream file_samp;
+    file_samp.open(name_samp.str());
+    file_samp << std::scientific << std::setprecision(15);
+    for (int i = 0; i < sampler->numMySamples(); ++i) {
+      *outStream << "Sample i = " << i << std::endl;
+      sample = sampler->getMyPoint(i);
+      con->setParameter(sample);
+      con->solve(*rp,*dup,*zp,tol);
+      up->axpy(sampler->getMyWeight(i),*dup);
+      for (int j = 0; j < stochDim; ++j) {
+        file_samp << std::setw(25) << std::left << sample[j];
+      }
+      file_samp << std::endl;
+    }
+    file_samp.close();
+    bman_Eu->sumAll(*up,*pp);
+    con->outputTpetraVector(p_ptr,"mean_state.txt");
+    // Build objective function distribution
+    *outStream << std::endl << "Print Objective CDF" << std::endl;
+    RealT val1(0), val2(0);
+    int nsamp_dist = parlist->sublist("Problem").get("Number of Output Samples",100);
+    auto stateCost = ROL::makePtr<IntegralObjective<RealT,DeviceT>>(qoi_vec[0],assembler);
+    auto redStateCost = ROL::makePtr<ROL::Reduced_Objective_SimOpt<RealT>>(stateCost, con, up, zp, pp, true, false);
+    auto ctrlCost = ROL::makePtr<IntegralObjective<RealT,DeviceT>>(qoi_vec[1],assembler);
+    auto redCtrlCost = ROL::makePtr<ROL::Reduced_Objective_SimOpt<RealT>>(ctrlCost, con, up, zp, pp, true, false);
+    auto sampler_dist = ROL::makePtr<ROL::MonteCarloGenerator<RealT>>(nsamp_dist,bounds,bman);
+    std::stringstream name;
+    name << "obj_samples_" << bman->batchID() << ".txt";
+    std::ofstream file;
+    file.open(name.str());
+    file << std::scientific << std::setprecision(15);
+    for (int i = 0; i < sampler_dist->numMySamples(); ++i) {
+      sample = sampler_dist->getMyPoint(i);
+      redStateCost->setParameter(sample);
+      val1 = redStateCost->value(*zp,tol);
+      redCtrlCost->setParameter(sample);
+      val2 = redCtrlCost->value(*zp,tol);
+      for (int j = 0; j < stochDim; ++j) {
+        file << std::setw(25) << std::left << sample[j];
+      }
+      file << std::setw(25) << std::left << val1;
+      file << std::setw(25) << std::left << val2 << std::endl;
+    }
+    file.close();
+    *outStream << "Output time: "
+               << static_cast<RealT>(std::clock()-timer_print)/static_cast<RealT>(CLOCKS_PER_SEC)
+               << " seconds." << std::endl << std::endl;
+
+    Teuchos::Array<RealT> res(1,0);
+    con->solve(*rp,*up,*zp,tol);
+    r_ptr->norm2(res.view(0,1));
+
+    /*************************************************************************/
+    /***************** CHECK RESIDUAL NORM ***********************************/
+    /*************************************************************************/
+    *outStream << "Residual Norm: " << res[0] << std::endl << std::endl;
+    errorFlag += (res[0] > 1.e-6 ? 1 : 0);
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/stefan-boltzmann/example_04.cpp b/packages/rol/example/PDE-OPT/stefan-boltzmann/example_04.cpp
index 8394173f671a..6d89e297af79 100644
--- a/packages/rol/example/PDE-OPT/stefan-boltzmann/example_04.cpp
+++ b/packages/rol/example/PDE-OPT/stefan-boltzmann/example_04.cpp
@@ -13,8 +13,7 @@
 
 #include "Teuchos_Comm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -47,7 +46,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int> > comm
     = Tpetra::getDefaultComm();
   const int myRank = comm->getRank();
diff --git a/packages/rol/example/PDE-OPT/stefan-boltzmann/example_04K.cpp b/packages/rol/example/PDE-OPT/stefan-boltzmann/example_04K.cpp
new file mode 100644
index 000000000000..7d5a9c6b7e78
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/stefan-boltzmann/example_04K.cpp
@@ -0,0 +1,210 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  example_04.cpp
+    \brief Shows how to solve the Stefan-Boltzmann problem.
+*/
+
+#include "Teuchos_Comm.hpp"
+#include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
+
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Version.hpp"
+
+#include <iostream>
+#include <algorithm>
+//#include <fenv.h>
+//
+#include "ROL_Solver.hpp"
+#include "ROL_Bounds.hpp"
+#include "ROL_BoundConstraint_SimOpt.hpp"
+#include "ROL_Reduced_Objective_SimOpt.hpp"
+
+#include "../TOOLS/meshmanagerK.hpp"
+#include "../TOOLS/pdeconstraintK.hpp"
+#include "../TOOLS/pdeobjectiveK.hpp"
+#include "../TOOLS/pdevectorK.hpp"
+#include "pde_stoch_stefan_boltzmannK.hpp"
+#include "obj_stoch_stefan_boltzmannK.hpp"
+#include "mesh_stoch_stefan_boltzmannK.hpp"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+
+int main(int argc, char *argv[]) {
+  //feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
+
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+
+  /*** Initialize communicator. ***/
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  Kokkos::ScopeGuard kokkosScope (argc, argv);
+  auto comm = Tpetra::getDefaultComm();
+  const int myRank = comm->getRank();
+  if ((iprint > 0) && (myRank == 0))
+    outStream = ROL::makePtrFromRef(std::cout);
+  else
+    outStream = ROL::makePtrFromRef(bhs);
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+
+    /*** Read in XML input ***/
+    std::string filename = "input_ex04.xml";
+    auto parlist = ROL::getParametersFromXmlFile(filename);
+
+    // Problem dimensions
+    const int controlDim = 1;
+    RealT tol(1e-8);
+
+    /*************************************************************************/
+    /***************** BUILD GOVERNING PDE ***********************************/
+    /*************************************************************************/
+    /*** Initialize main data structure. ***/
+    auto meshMgr = ROL::makePtr<MeshManager_BackwardFacingStepChannel<RealT,DeviceT>>(*parlist);
+    // Initialize PDE describing advection-diffusion equation
+    auto pde = ROL::makePtr<StochasticStefanBoltzmannPDE<RealT,DeviceT>>(*parlist);
+    auto con = ROL::makePtr<PDE_Constraint<RealT,DeviceT>>(pde,meshMgr,comm,*parlist,*outStream);
+    // Cast the constraint and get the assembler.
+    auto assembler = con->getAssembler();
+    con->setSolveParameters(*parlist);
+
+    /*************************************************************************/
+    /***************** BUILD VECTORS *****************************************/
+    /*************************************************************************/
+    auto u_ptr   = assembler->createStateVector();    u_ptr->putScalar(0.0);
+    auto p_ptr   = assembler->createStateVector();    p_ptr->putScalar(0.0);
+    auto r_ptr   = assembler->createResidualVector(); r_ptr->putScalar(0.0);
+    auto zbc_ptr = assembler->createControlVector();  zbc_ptr->putScalar(280.0);
+    auto zp_ptr  = ROL::makePtr<std::vector<RealT>>(controlDim,0.0);
+    auto up      = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(u_ptr,pde,assembler);
+    auto pp      = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(p_ptr,pde,assembler);
+    auto rp      = ROL::makePtr<PDE_DualSimVector<RealT,DeviceT>>(r_ptr,pde,assembler);
+    auto zbc     = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(zbc_ptr,pde,assembler);
+    auto zparam  = ROL::makePtr<ROL::StdVector<RealT>>(zp_ptr);
+    auto zp      = ROL::makePtr<PDE_OptVector<RealT>>(zbc, zparam);
+    auto xp      = ROL::makePtr<ROL::Vector_SimOpt<RealT>>(up,zp);
+
+    /*************************************************************************/
+    /***************** BUILD COST FUNCTIONAL *********************************/
+    /*************************************************************************/
+    std::vector<ROL::Ptr<QoI<RealT,DeviceT>>> qoi_vec(3,ROL::nullPtr);
+    qoi_vec[0] = ROL::makePtr<QoI_StateCost<RealT,DeviceT>>(pde->getVolFE(),*parlist);
+    qoi_vec[1] = ROL::makePtr<QoI_ControlCost<RealT,DeviceT>>(
+      pde->getVolFE(),pde->getBdryFE(0),pde->getBdryCellLocIds(0),*parlist);
+    qoi_vec[2] = ROL::makePtr<QoI_AdvectionCost<RealT,DeviceT>>();
+    auto std_obj = ROL::makePtr<StochasticStefanBoltzmannStdObjective3<RealT>>(*parlist);
+    auto obj = ROL::makePtr<PDE_Objective<RealT,DeviceT>>(qoi_vec,std_obj,assembler);
+    auto objReduced = ROL::makePtr<ROL::Reduced_Objective_SimOpt<RealT>>(obj, con, up, zp, pp, true, false);
+
+    /*************************************************************************/
+    /***************** BUILD BOUND CONSTRAINT ********************************/
+    /*************************************************************************/
+    // Bounds for boundary control
+    RealT lower_bc = parlist->sublist("Problem").get("Lower Control Bound", 280.0);
+    RealT upper_bc = parlist->sublist("Problem").get("Upper Control Bound", 370.0);
+    auto zlo_bc_ptr = assembler->createControlVector();
+    auto zhi_bc_ptr = assembler->createControlVector();
+    zlo_bc_ptr->putScalar(static_cast<RealT>(lower_bc));
+    zhi_bc_ptr->putScalar(static_cast<RealT>(upper_bc));
+    auto zlo_bc = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(zlo_bc_ptr,pde,assembler);
+    auto zhi_bc = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(zhi_bc_ptr,pde,assembler);
+    // Bounds for advection control
+    RealT lower = parlist->sublist("Problem").get("Lower Advection Bound",-100.0);
+    RealT upper = parlist->sublist("Problem").get("Upper Advection Bound", 100.0);
+    auto zlo_param_ptr = ROL::makePtr<std::vector<RealT>>(controlDim,lower);
+    auto zhi_param_ptr = ROL::makePtr<std::vector<RealT>>(controlDim,upper);
+    auto zlo_adv = ROL::makePtr<ROL::StdVector<RealT>>(zlo_param_ptr);
+    auto zhi_adv = ROL::makePtr<ROL::StdVector<RealT>>(zhi_param_ptr);
+    // Combined bounds
+    auto zlop = ROL::makePtr<PDE_OptVector<RealT>>(zlo_bc, zlo_adv);
+    auto zhip = ROL::makePtr<PDE_OptVector<RealT>>(zhi_bc, zhi_adv);
+    auto zbnd = ROL::makePtr<ROL::Bounds<RealT>>(zlop,zhip);
+    auto ulo_ptr = assembler->createStateVector();
+    auto uhi_ptr = assembler->createStateVector();
+    ulo_ptr->putScalar(ROL::ROL_NINF<RealT>()); uhi_ptr->putScalar(ROL::ROL_INF<RealT>());
+    auto ulop = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(ulo_ptr,pde,assembler,*parlist);
+    auto uhip = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(uhi_ptr,pde,assembler,*parlist);
+    auto ubnd = ROL::makePtr<ROL::Bounds<RealT>>(ulop,uhip);
+    auto bnd = ROL::makePtr<ROL::BoundConstraint_SimOpt<RealT> >(ubnd,zbnd);
+
+    /*************************************************************************/
+    /***************** BUILD OPTIMIZATION PROBLEM ****************************/
+    /*************************************************************************/
+    ROL::Ptr<ROL::Problem<RealT>> problem;
+    bool useReducedSpace = parlist->sublist("Problem").get("Use Reduced Space",true);
+    if (useReducedSpace) {
+      problem = ROL::makePtr<ROL::Problem<RealT>>(objReduced,zp);
+      problem->addBoundConstraint(zbnd);
+    }
+    else {
+      problem = ROL::makePtr<ROL::Problem<RealT>>(obj,xp);
+      problem->addBoundConstraint(bnd);
+      problem->addConstraint("PDE",con,pp);
+    }
+    problem->finalize(false,true,*outStream);
+    bool checkDeriv = parlist->sublist("Problem").get("Check Derivatives",false);
+    if ( checkDeriv ) problem->check(true,*outStream);
+
+    /*************************************************************************/
+    /***************** SOLVE OPTIMIZATION PROBLEM ****************************/
+    /*************************************************************************/
+    ROL::Solver<RealT> solver(problem,*parlist);
+    (*zp_ptr)[0] = parlist->sublist("Problem").get("Advection Magnitude",0.0);
+    u_ptr->putScalar(450.0);
+
+    bool solveFS = parlist->sublist("Problem").get("Initial Solve for Full Space",true); 
+    if (solveFS) con->solve(*rp,*up,*zp,tol);
+
+    std::clock_t timer = std::clock();
+    solver.solve(*outStream);
+    *outStream << "Optimization time: "
+               << static_cast<RealT>(std::clock()-timer)/static_cast<RealT>(CLOCKS_PER_SEC)
+               << " seconds." << std::endl << std::endl;
+
+    /*************************************************************************/
+    /***************** OUTPUT RESULTS ****************************************/
+    /*************************************************************************/
+    std::clock_t timer_print = std::clock();
+    assembler->printMeshData(*outStream);
+    // Output control to file
+    con->outputTpetraVector(zbc_ptr,"control.txt");
+    con->outputTpetraVector(u_ptr,"state.txt");
+    *outStream << std::endl << "Advection value: " << (*zp_ptr)[0] << std::endl;
+    *outStream << "Output time: "
+               << static_cast<RealT>(std::clock()-timer_print)/static_cast<RealT>(CLOCKS_PER_SEC)
+               << " seconds." << std::endl << std::endl;
+
+    Teuchos::Array<RealT> res(1,0);
+    con->solve(*rp,*up,*zp,tol);
+    r_ptr->norm2(res.view(0,1));
+
+    /*************************************************************************/
+    /***************** CHECK RESIDUAL NORM ***********************************/
+    /*************************************************************************/
+    *outStream << "Residual Norm: " << res[0] << std::endl << std::endl;
+    errorFlag += (res[0] > 1.e-6 ? 1 : 0);
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/stefan-boltzmann/input.xml b/packages/rol/example/PDE-OPT/stefan-boltzmann/input.xml
index 397afbd7b79f..f38cf6b00548 100644
--- a/packages/rol/example/PDE-OPT/stefan-boltzmann/input.xml
+++ b/packages/rol/example/PDE-OPT/stefan-boltzmann/input.xml
@@ -28,7 +28,7 @@
     <Parameter name="Statistic Initial Guess"             type="double"  value="5.0"     />
     <Parameter name="Number of Samples"                   type="int"     value="4"       />
     <!-- OUTPUT -->
-    <Parameter name="Check Derivatives"                   type="bool"    value="true"    />
+    <Parameter name="Check Derivatives"                   type="bool"    value="false"    />
     <Parameter name="Print Mean Value State"              type="bool"    value="false"   />
     <Parameter name="Number of Output Samples"            type="int"     value="4"       />
     <!-- FINITE ELEMENT INFORMATION -->
@@ -65,10 +65,16 @@
   <ParameterList name="SOL">
     <Parameter name="Type" type="string" value="Mean Value"   />
     <!--
-      <Parameter name="Type" type="string" value="Risk Averse"  />
       <Parameter name="Type" type="string" value="Risk Neutral" />
+      <Parameter name="Type" type="string" value="Risk Averse"  />
       <Parameter name="Type" type="string" value="Probability"  />
     -->
+    <ParameterList name="Objective">
+      <Parameter name="Type" type="string" value="Mean Value"   />
+      <!--
+      <Parameter name="Type" type="string" value="Risk Neutral" />
+      -->
+    </ParameterList>
     <ParameterList name="Risk Measure">
       <Parameter name="Name" type="string" value="Entropic Risk" />
       <!--
diff --git a/packages/rol/example/PDE-OPT/stefan-boltzmann/mesh_stoch_stefan_boltzmannK.hpp b/packages/rol/example/PDE-OPT/stefan-boltzmann/mesh_stoch_stefan_boltzmannK.hpp
new file mode 100644
index 000000000000..493b333d6fe7
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/stefan-boltzmann/mesh_stoch_stefan_boltzmannK.hpp
@@ -0,0 +1,102 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+#include "../TOOLS/meshmanagerK.hpp"
+
+template <class Real, class DeviceType>
+class StochasticStefanBoltzmannMeshManager : public MeshManager_Rectangle<Real,DeviceType> {
+private:
+
+  int nx_, ny_;
+  ROL::Ptr<std::vector<std::vector<std::vector<int>>>>  meshSideSets_;
+
+public: 
+
+  StochasticStefanBoltzmannMeshManager(ROL::ParameterList &parlist)
+    : MeshManager_Rectangle<Real, DeviceType>(parlist) {
+    nx_ = parlist.sublist("Geometry").get("NX", 3);
+    ny_ = parlist.sublist("Geometry").get("NY", 3);
+    computeSideSets();
+  }
+
+
+  void computeSideSets(void) {
+
+    int numSideSets = 7;
+    meshSideSets_ = ROL::makePtr<std::vector<std::vector<std::vector<int>>>>(numSideSets);
+
+    // Dirichlet Control
+    (*meshSideSets_)[0].resize(4);
+    (*meshSideSets_)[0][0].resize(nx_);
+    (*meshSideSets_)[0][1].resize(0);
+    (*meshSideSets_)[0][2].resize(0);
+    (*meshSideSets_)[0][3].resize(0);
+    // Neumann
+    (*meshSideSets_)[1].resize(4);
+    (*meshSideSets_)[1][0].resize(0);
+    (*meshSideSets_)[1][1].resize(ny_);
+    (*meshSideSets_)[1][2].resize(0);
+    (*meshSideSets_)[1][3].resize(0);
+    // Stefan-Boltzmann
+    (*meshSideSets_)[2].resize(4);
+    (*meshSideSets_)[2][0].resize(0);
+    (*meshSideSets_)[2][1].resize(0);
+    (*meshSideSets_)[2][2].resize(0);
+    (*meshSideSets_)[2][3].resize(0);
+    // Stefan-Boltzmann
+    (*meshSideSets_)[3].resize(4);
+    (*meshSideSets_)[3][0].resize(0);
+    (*meshSideSets_)[3][1].resize(0);
+    (*meshSideSets_)[3][2].resize(nx_);
+    (*meshSideSets_)[3][3].resize(0);
+    // Dirichlet
+    (*meshSideSets_)[4].resize(4);
+    (*meshSideSets_)[4][0].resize(0);
+    (*meshSideSets_)[4][1].resize(0);
+    (*meshSideSets_)[4][2].resize(0);
+    (*meshSideSets_)[4][3].resize(ny_);
+    // Stefan-Boltzmann
+    (*meshSideSets_)[5].resize(4);
+    (*meshSideSets_)[5][0].resize(0);
+    (*meshSideSets_)[5][1].resize(0);
+    (*meshSideSets_)[5][2].resize(0);
+    (*meshSideSets_)[5][3].resize(0);
+    // Neumann
+    (*meshSideSets_)[6].resize(4);
+    (*meshSideSets_)[6][0].resize(0);
+    (*meshSideSets_)[6][1].resize(0);
+    (*meshSideSets_)[6][2].resize(0);
+    (*meshSideSets_)[6][3].resize(0);
+
+    for (int i=0; i<nx_; ++i) {
+      (*meshSideSets_)[0][0][i] = i;
+    }
+    for (int i=0; i<ny_; ++i) {
+      (*meshSideSets_)[1][1][i] = (i+1)*nx_-1;
+    }
+    for (int i=0; i<nx_; ++i) {
+      (*meshSideSets_)[3][2][i] = i + nx_*(ny_-1);
+    }
+    for (int i=0; i<ny_; ++i) {
+      (*meshSideSets_)[4][3][i] = i*nx_;
+    }
+
+  } // computeSideSets
+
+  ROL::Ptr<std::vector<std::vector<std::vector<int>>>> getSideSets(
+              const bool verbose = false,
+              std::ostream & outStream = std::cout) const { 
+    if (verbose) {
+      outStream << "Mesh_stoch_adv_diff: getSideSets called" << std::endl;
+      outStream << "Mesh_stoch_adv_diff: numSideSets = "     << meshSideSets_->size() << std::endl;
+    }
+    return meshSideSets_;
+  }
+  
+}; // MeshManager_stoch_adv_diff
diff --git a/packages/rol/example/PDE-OPT/stefan-boltzmann/obj_stoch_stefan_boltzmannK.hpp b/packages/rol/example/PDE-OPT/stefan-boltzmann/obj_stoch_stefan_boltzmannK.hpp
new file mode 100644
index 000000000000..430f60c616ff
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/stefan-boltzmann/obj_stoch_stefan_boltzmannK.hpp
@@ -0,0 +1,686 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  obj.hpp
+    \brief Provides the interface for local (cell-based) objective function computations.
+*/
+
+#ifndef PDEOPT_QOI_STOCHASTIC_STEFAN_BOLTZMANNK_HPP
+#define PDEOPT_QOI_STOCHASTIC_STEFAN_BOLTZMANNK_HPP
+
+#include "../TOOLS/qoiK.hpp"
+#include "pde_stoch_stefan_boltzmannK.hpp"
+
+template <class Real, class DeviceType>
+class QoI_StateCost : public QoI<Real, DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type> fe_;
+  scalar_view weight_;
+  Real T_;
+  Real rate_;
+  bool exp_;
+  Real xmid_;
+  Real vol_;
+
+  Real weightFunc(const std::vector<Real> & x) const {
+    return ((x[1] < xmid_) ? static_cast<Real>(0) : static_cast<Real>(1));
+  }
+
+  Real cost(Real u, int deriv = 0) const {
+    if ( exp_ ) {
+      if ( deriv == 1 ) {
+        return std::exp(rate_ * (u - T_)) / vol_;
+      }
+      if ( deriv == 2 ) {
+        return rate_ * std::exp(rate_ * (u - T_)) / vol_;
+      }
+      return (std::exp(rate_ * (u - T_)) - static_cast<Real>(1)) / (rate_ * vol_);
+    }
+    else {    
+      if ( u > T_ ) {
+        if ( deriv == 0 ) {
+          return static_cast<Real>(0.5)*(u-T_)*(u-T_)/vol_;
+        }
+        if ( deriv == 1 ) {
+          return (u-T_)/vol_;
+        }
+        if ( deriv == 2 ) {
+          return static_cast<Real>(1)/vol_;
+        }
+      }
+      return static_cast<Real>(0);
+    }
+  }
+
+public:
+  QoI_StateCost(const ROL::Ptr<fe_type> &fe,
+                ROL::ParameterList &parlist) : fe_(fe) {
+    T_    = parlist.sublist("Problem").get("Desired engine temperature",373.0);
+    exp_  = parlist.sublist("Problem").get("Use exponential engine cost model",false);
+    rate_ = parlist.sublist("Problem").get("Exponential engine cost rate",1.0);
+    
+    xmid_ = parlist.sublist("Geometry").get("Step height",0.5);
+    Real height = parlist.sublist("Geometry").get("Channel height",1.0);
+    Real width  = parlist.sublist("Geometry").get("Channel width",5.0);
+    vol_ = (height - xmid_) * width;
+
+    int c = fe_->cubPts().extent_int(0);
+    int p = fe_->cubPts().extent_int(1);
+    int d = fe_->cubPts().extent_int(2);
+    std::vector<Real> pt(d);
+    weight_ = scalar_view("weight",c,p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k) {
+          pt[k] = (fe_->cubPts())(i,j,k);
+        }
+        weight_(i,j) = weightFunc(pt);
+      }
+    }
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c = fe_->gradN().extent_int(0);
+    const int p = fe_->gradN().extent_int(2);
+    // Initialize output val
+    val = scalar_view("val", c);
+    // Evaluate state on FE basis
+    scalar_view valU_eval("valU_eval", c, p);
+    fe_->evaluateValue(valU_eval, u_coeff);
+    // Compute cost
+    scalar_view costU("costU", c, p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        costU(i,j) = cost(valU_eval(i,j),0);
+      }
+    }
+    // Compute squared L2-norm of diff
+    fe_->computeIntegral(val,costU,weight_);
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c = fe_->gradN().extent_int(0);
+    const int f = fe_->gradN().extent_int(1);
+    const int p = fe_->gradN().extent_int(2);
+    // Initialize output grad
+    grad = scalar_view("grad", c, f);
+    // Evaluate state on FE basis
+    scalar_view valU_eval("valU_eval", c, p);
+    fe_->evaluateValue(valU_eval, u_coeff);
+    // Compute cost
+    scalar_view costU("costU", c, p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        costU(i,j) = cost(valU_eval(i,j),1);
+      }
+    }
+    // Multiply cost and weight
+    scalar_view WcostU("WcostU", c, p);
+    fst::scalarMultiplyDataData(WcostU,weight_,costU);
+    // Compute gradient of squared L2-norm of diff
+    fst::integrate(grad,WcostU,fe_->NdetJ(),false);
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_StateCost::gradient_2 is zero.");
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c = fe_->gradN().extent_int(0);
+    const int f = fe_->gradN().extent_int(1);
+    const int p = fe_->gradN().extent_int(2);
+    // Initialize output grad
+    hess = scalar_view("hess", c, f);
+    // Evaluate state on FE basis
+    scalar_view valU_eval("valU_eval", c, p);
+    fe_->evaluateValue(valU_eval, u_coeff);
+    // Evaluate direction on FE basis
+    scalar_view valV_eval("valV_eval", c, p);
+    fe_->evaluateValue(valV_eval, v_coeff);
+    // Compute cost
+    scalar_view costU("costU", c, p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        costU(i,j) = cost(valU_eval(i,j),2);
+      }
+    }
+    // Multiply cost and weight
+    scalar_view WcostU("WcostU", c, p);
+    fst::scalarMultiplyDataData(WcostU,weight_,costU);
+    // Multiply weighted cost and direction
+    scalar_view WcostUV("WcostUV", c, p);
+    fst::scalarMultiplyDataData(WcostUV,WcostU,valV_eval);
+    // Compute gradient of squared L2-norm of diff
+    fst::integrate(hess,WcostUV,fe_->NdetJ(),false);
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_StateCost::HessVec_12 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_StateCost::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_StateCost::HessVec_22 is zero.");
+  }
+
+}; // QoI_L2Tracking
+
+template <class Real, class DeviceType>
+class QoI_ControlCost : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type> fe_vol_;
+  const std::vector<ROL::Ptr<fe_type>> fe_bdry_;
+  const std::vector<std::vector<int>> bdryCellLocIds_;
+  std::vector<scalar_view> weight_;
+  Real T_;
+  bool exp_;
+  Real rate1_;
+  Real rate2_;
+  Real area_;
+
+  Real weightFunc(const std::vector<Real> & x) const {
+    return static_cast<Real>(1);
+  }
+
+  Real cost(Real z, int deriv = 0) const {
+    if ( exp_ ) {
+      if ( deriv == 1 ) {
+        return (std::exp(rate1_ * (z - T_)) - std::exp(rate2_ * (T_ - z))) / area_;
+      }
+      if ( deriv == 2 ) {
+        return (rate1_ * std::exp(rate1_ * (z - T_))
+              + rate2_ * std::exp(rate2_ * (T_ - z))) / area_;
+      }
+      return (std::exp(rate1_ * (z - T_)) - static_cast<Real>(1)) / (rate1_ * area_)
+            +(std::exp(rate2_ * (T_ - z)) - static_cast<Real>(1)) / (rate2_ * area_);
+    }
+    else {
+      if ( deriv == 0 ) {
+        return static_cast<Real>(0.5)*(z-T_)*(z-T_)/area_;
+      }
+      if ( deriv == 1 ) {
+        return (z-T_)/area_;
+      }
+      if ( deriv == 2 ) {
+        return static_cast<Real>(1)/area_;
+      }
+      return static_cast<Real>(0);
+    }
+  }
+
+  scalar_view getBoundaryCoeff(const scalar_view cell_coeff, int locSideId) const {
+    std::vector<int> bdryCellLocId = bdryCellLocIds_[locSideId];
+    const int numCellsSide = bdryCellLocId.size();
+    const int f = fe_vol_->N().extent_int(1);
+    
+    scalar_view bdry_coeff("bdry_coeff", numCellsSide, f);
+    for (int i = 0; i < numCellsSide; ++i) {
+      for (int j = 0; j < f; ++j) {
+        bdry_coeff(i, j) = cell_coeff(bdryCellLocId[i], j);
+      }
+    }
+    return bdry_coeff;
+  }
+
+public:
+  QoI_ControlCost(const ROL::Ptr<fe_type> &fe_vol,
+                  const std::vector<ROL::Ptr<fe_type>> &fe_bdry,
+                  const std::vector<std::vector<int>> &bdryCellLocIds,
+                  ROL::ParameterList &parlist)
+    : fe_vol_(fe_vol), fe_bdry_(fe_bdry), bdryCellLocIds_(bdryCellLocIds) {
+    T_     = parlist.sublist("Problem").get("Desired control temperature",293.0);
+    exp_   = parlist.sublist("Problem").get("Use exponential control cost model",false);
+    rate1_ = parlist.sublist("Problem").get("Exponential upper control cost rate",1.0);
+    rate2_ = parlist.sublist("Problem").get("Exponential lower control cost rate",1.0);
+    Real w1 = parlist.sublist("Geometry").get("Channel width",5.0);
+    Real w2 = parlist.sublist("Geometry").get("Step width",1.0);
+    area_ = w1 - w2;
+    const int numLocSides = bdryCellLocIds_.size();
+    const int d = fe_vol_->gradN().extent_int(3);
+    std::vector<Real> pt(d);
+    weight_.resize(numLocSides);
+    for (int l = 0; l < numLocSides; ++l) {
+      const int numCellsSide  = bdryCellLocIds_[l].size();
+      if ( numCellsSide ) {
+        const int numCubPerSide = fe_bdry_[l]->cubPts().extent_int(1);
+        weight_[l] = scalar_view("weight", numCellsSide, numCubPerSide);
+        for (int i = 0; i < numCellsSide; ++i) {
+          for (int j = 0; j < numCubPerSide; ++j) {
+            for (int k = 0; k < d; ++k) {
+              pt[k] = (fe_bdry_[l]->cubPts())(i,j,k);
+            }
+            (weight_[l])(i,j) = weightFunc(pt);
+          }
+        }
+      }
+    }
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    if ( z_coeff != scalar_view() ) {
+      const int c = fe_vol_->gradN().extent_int(0);
+      // Initialize output val
+      val = scalar_view("val", c);
+      // Compute cost integral
+      const int numLocSides = bdryCellLocIds_.size();
+      for (int l = 0; l < numLocSides; ++l) {
+        const int numCellsSide  = bdryCellLocIds_[l].size();
+        if ( numCellsSide ) {
+          const int numCubPerSide = fe_bdry_[l]->cubPts().extent_int(1);
+          // Evaluate control on FE basis
+          scalar_view z_coeff_bdry = getBoundaryCoeff(z_coeff, l);
+          scalar_view valZ_eval("valZ_eval", numCellsSide, numCubPerSide);
+          fe_bdry_[l]->evaluateValue(valZ_eval, z_coeff_bdry);
+          // Compute cost
+          scalar_view costZ("costZ", numCellsSide, numCubPerSide);
+          for (int i = 0; i < numCellsSide; ++i) {
+            for (int j = 0; j < numCubPerSide; ++j) {
+              costZ(i,j) = cost(valZ_eval(i,j),0);
+            }
+          }
+          // Integrate cell cost
+          scalar_view intVal("intVal", numCellsSide);
+          fe_bdry_[l]->computeIntegral(intVal,costZ,weight_[l]);
+          // Add to integral value
+          for (int i = 0; i < numCellsSide; ++i) {
+            int cidx = bdryCellLocIds_[l][i];
+            val(cidx) += intVal(i);
+          }
+        }
+      }
+    }
+    else {
+      throw Exception::Zero(">>> QoI_ControlCost: Value is zero.");
+    }
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_ControlCost::gradient_1 is zero.");
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    if ( z_coeff != scalar_view() ) {
+      const int c = fe_vol_->gradN().extent_int(0);
+      const int f = fe_vol_->gradN().extent_int(1);
+      // Initialize output val
+      grad = scalar_view("grad", c, f);
+      // Compute cost integral
+      const int numLocSides = bdryCellLocIds_.size();
+      for (int l = 0; l < numLocSides; ++l) {
+        const int numCellsSide  = bdryCellLocIds_[l].size();
+        if ( numCellsSide ) {
+          const int numCubPerSide = fe_bdry_[l]->cubPts().extent_int(1);
+          // Evaluate control on FE basis
+          scalar_view z_coeff_bdry = getBoundaryCoeff(z_coeff, l);
+          scalar_view valZ_eval("valZ_eval", numCellsSide, numCubPerSide);
+          fe_bdry_[l]->evaluateValue(valZ_eval, z_coeff_bdry);
+          // Compute cost
+          scalar_view costZ("costZ", numCellsSide, numCubPerSide);
+          for (int i = 0; i < numCellsSide; ++i) {
+            for (int j = 0; j < numCubPerSide; ++j) {
+              costZ(i,j) = cost(valZ_eval(i,j),1);
+            }
+          }
+          // Multiply cost and weight
+          scalar_view WcostZ("WcostZ", numCellsSide, numCubPerSide);
+          fst::scalarMultiplyDataData(WcostZ,weight_[l],costZ);
+          // Compute gradient of squared L2-norm of diff
+          scalar_view intGrad("intGrad",numCellsSide, f);
+          fst::integrate(intGrad,WcostZ,fe_bdry_[l]->NdetJ(),false);
+          // Add to integral value
+          for (int i = 0; i < numCellsSide; ++i) {
+            int cidx = bdryCellLocIds_[l][i];
+            for (int j = 0; j < f; ++j) {
+              grad(cidx,j) += intGrad(i,j);
+            }
+          }
+        }
+      }
+    }
+    else {
+      throw Exception::Zero(">>> QoI_ControlCost: Gradient_2 is zero.");
+    }
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_ControlCost::HessVec_11 is zero.");
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_ControlCost::HessVec_12 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_ControlCost::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    if ( z_coeff != scalar_view() ) {
+      const int c = fe_vol_->gradN().extent_int(0);
+      const int f = fe_vol_->gradN().extent_int(1);
+      // Initialize output val
+      hess = scalar_view("hess", c, f);
+      // Compute cost integral
+      const int numLocSides = bdryCellLocIds_.size();
+      for (int l = 0; l < numLocSides; ++l) {
+        const int numCellsSide  = bdryCellLocIds_[l].size();
+        if ( numCellsSide ) {
+          const int numCubPerSide = fe_bdry_[l]->cubPts().extent_int(1);
+          // Evaluate control on FE basis
+          scalar_view z_coeff_bdry = getBoundaryCoeff(z_coeff, l);
+          scalar_view valZ_eval("valZ_eval", numCellsSide, numCubPerSide);
+          fe_bdry_[l]->evaluateValue(valZ_eval, z_coeff_bdry);
+          // Evaluate direction on FE basis
+          scalar_view v_coeff_bdry = getBoundaryCoeff(v_coeff, l);
+          scalar_view valV_eval("valV_eval", numCellsSide, numCubPerSide);
+          fe_bdry_[l]->evaluateValue(valV_eval, v_coeff_bdry);
+          // Compute cost
+          scalar_view costZ("costZ", numCellsSide, numCubPerSide);
+          for (int i = 0; i < numCellsSide; ++i) {
+            for (int j = 0; j < numCubPerSide; ++j) {
+              costZ(i,j) = cost(valZ_eval(i,j),2);
+            }
+          }
+          // Multiply cost and weight
+          scalar_view WcostZ("WcostZ", numCellsSide, numCubPerSide);
+          fst::scalarMultiplyDataData(WcostZ,weight_[l],costZ);
+          // Multiply weighted cost and direction
+          scalar_view WcostZV("WcostZV", numCellsSide, numCubPerSide);
+          fst::scalarMultiplyDataData(WcostZV,WcostZ,valV_eval);
+          // Compute hessian times a vector of cost
+          scalar_view intHess("intHess", numCellsSide, f);
+          fst::integrate(intHess,WcostZV,fe_bdry_[l]->NdetJ(),false);
+          // Add to integral value
+          for (int i = 0; i < numCellsSide; ++i) {
+            int cidx = bdryCellLocIds_[l][i];
+            for (int j = 0; j < f; ++j) {
+              hess(cidx,j) += intHess(i,j);
+            }
+          }
+        }
+      }
+    }
+    else {
+      throw Exception::Zero(">>> QoI_ControlCost: HessVec_22 is zero.");
+    }
+  }
+
+}; // QoI_L2Penalty
+
+template <class Real, class DeviceType>
+class QoI_AdvectionCost : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+public:
+  QoI_AdvectionCost(void) {}
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    if ( z_param != ROL::nullPtr ) {
+      const int size = z_param->size();
+      Real sum(0);
+      for (int i = 0; i < size; ++i) {
+        sum += std::pow((*z_param)[i], 2);
+      }
+      val = scalar_view();
+      return static_cast<Real>(0.5)*sum;
+    }
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_AdvectionCost::gradient_1 is zero.");
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_AdvectionCost::gradient_2 is zero.");
+  }
+
+  std::vector<Real> gradient_3(std::vector<scalar_view> & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    if ( z_param != ROL::nullPtr ) {
+      return *z_param;
+    }
+    else {
+      throw Exception::Zero(">>> QoI_AdvectionCost::gradient_3 is zero.");
+    }
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_AdvectionCost::HessVec_11 is zero.");
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_AdvectionCost::HessVec_12 is zero.");
+  }
+
+  void HessVec_13(scalar_view & hess,
+                  const ROL::Ptr<const std::vector<Real>> & v_param,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_AdvectionCost::HessVec_13 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_AdvectionCost::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_AdvectionCost::HessVec_22 is zero.");
+  }
+
+  void HessVec_23(scalar_view & hess,
+                  const ROL::Ptr<const std::vector<Real>> & v_param,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_AdvectionCost::HessVec_23 is zero.");
+  }
+
+  std::vector<Real> HessVec_31(std::vector<scalar_view> & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_AdvectionCost::HessVec_31 is zero.");
+  }
+
+  std::vector<Real> HessVec_32(std::vector<scalar_view> & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_AdvectionCost::HessVec_32 is zero.");
+  }
+
+  std::vector<Real> HessVec_33(std::vector<scalar_view> & hess,
+                  const ROL::Ptr<const std::vector<Real>> & v_param,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    if ( z_param != ROL::nullPtr ) {
+      return *v_param;
+    }
+    else {
+      throw Exception::Zero(">>> QoI_AdvectionCost::HessVec_33 is zero.");
+    }
+  }
+
+}; // QoI_Advection_Cost
+
+template <class Real>
+class StochasticStefanBoltzmannStdObjective : public ROL::StdObjective<Real> {
+private:
+  Real alpha0_;
+  Real alpha1_;
+
+public:
+  StochasticStefanBoltzmannStdObjective(ROL::ParameterList &parlist) {
+    alpha0_ = parlist.sublist("Problem").get("State Cost",1.0);
+    alpha1_ = parlist.sublist("Problem").get("Control Cost",1.0);
+  }
+
+  Real value(const std::vector<Real> &x, Real &tol) {
+    return alpha0_*x[0] + alpha1_*x[1];
+  }
+
+  void gradient(std::vector<Real> &g, const std::vector<Real> &x, Real &tol) {
+    g[0] = alpha0_;
+    g[1] = alpha1_;
+  }
+
+  void hessVec(std::vector<Real> &hv, const std::vector<Real> &v, const std::vector<Real> &x, Real &tol) {
+    const Real zero(0);
+    hv[0] = zero;
+    hv[1] = zero;
+  }
+
+}; // OBJ_SCALAR
+
+template <class Real>
+class StochasticStefanBoltzmannStdObjective3 : public ROL::StdObjective<Real> {
+private:
+  Real alpha0_;
+  Real alpha1_;
+  Real alpha2_;
+
+public:
+  StochasticStefanBoltzmannStdObjective3(ROL::ParameterList &parlist) {
+    alpha0_ = parlist.sublist("Problem").get("State Cost",1.0);
+    alpha1_ = parlist.sublist("Problem").get("Control Cost",1.0);
+    alpha2_ = parlist.sublist("Problem").get("Advection Cost",1.0);
+  }
+
+  Real value(const std::vector<Real> &x, Real &tol) {
+    return alpha0_*x[0] + alpha1_*x[1] + alpha2_*x[2];
+  }
+
+  void gradient(std::vector<Real> &g, const std::vector<Real> &x, Real &tol) {
+    g[0] = alpha0_;
+    g[1] = alpha1_;
+    g[2] = alpha2_;
+  }
+
+  void hessVec(std::vector<Real> &hv, const std::vector<Real> &v, const std::vector<Real> &x, Real &tol) {
+    const Real zero(0);
+    hv[0] = zero;
+    hv[1] = zero;
+    hv[2] = zero;
+  }
+
+}; // OBJ_SCALAR
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/stefan-boltzmann/pde_stoch_stefan_boltzmannK.hpp b/packages/rol/example/PDE-OPT/stefan-boltzmann/pde_stoch_stefan_boltzmannK.hpp
new file mode 100644
index 000000000000..d5c80f9b37ef
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/stefan-boltzmann/pde_stoch_stefan_boltzmannK.hpp
@@ -0,0 +1,1051 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  pde.hpp
+    \brief Implements the local PDE interface for the Stefan_Boltzmann control problem.
+*/
+
+#ifndef PDEOPT_PDE_STOCH_STEFAN_BOLTZMANNK_HPP
+#define PDEOPT_PDE_STOCH_STEFAN_BOLTZMANNK_HPP
+
+#include "../TOOLS/pdeK.hpp"
+#include "../TOOLS/feK.hpp"
+
+#include "Intrepid2_HGRAD_QUAD_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_QUAD_C2_FEM.hpp"
+#include "Intrepid2_DefaultCubatureFactory.hpp"
+#include "Intrepid2_FunctionSpaceTools.hpp"
+#include "Intrepid2_CellTools.hpp"
+
+#include "ROL_Ptr.hpp"
+#include "Teuchos_LAPACK.hpp"
+
+template <class Real, class DeviceType>
+class StochasticStefanBoltzmannPDE : public PDE<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+
+  // Finite element basis information
+  basis_ptr basisPtr_;
+  std::vector<basis_ptr> basisPtrs_;
+  // Cell cubature information
+  ROL::Ptr<Intrepid2::Cubature<DeviceType,Real,Real>> cellCub_, bdryCub_;
+  // Cell node information
+  scalar_view volCellNodes_;
+  std::vector<std::vector<scalar_view>> bdryCellNodes_;
+  std::vector<std::vector<std::vector<int>>> bdryCellLocIds_;
+  // Finite element definition
+  ROL::Ptr<fe_type> fe_vol_;
+  std::vector<std::vector<ROL::Ptr<fe_type>>> fe_bdry_;
+  // Local degrees of freedom on boundary, for each side of the reference cell (first index).
+  std::vector<std::vector<int>> fidx_;
+
+  const Real scale_;
+  Real xmid_;
+  Real engTemp_;
+  Real airTemp_;
+  Real H2OTemp_;
+  Real advMag_;
+  Real SBscale_;
+  Real nonLin_;
+
+public:
+
+  StochasticStefanBoltzmannPDE(ROL::ParameterList &parlist) : scale_(1) {
+    xmid_ = parlist.sublist("Geometry").get<Real>("Step height");
+    engTemp_ = parlist.sublist("Problem").get("Engine: Ambient Temperature",450.0);
+    airTemp_ = parlist.sublist("Problem").get("Air: Ambient Temperature",293.0);
+    H2OTemp_ = parlist.sublist("Problem").get("Water: Ambient Temperature",303.0);
+    advMag_  = parlist.sublist("Problem").get("Advection Magnitude",6.3);
+    bool useSB = parlist.sublist("Problem").get("Use Stefan-Boltzmann",true);
+    SBscale_ = (useSB) ? static_cast<Real>(1) : static_cast<Real>(0);
+    bool useND = parlist.sublist("Problem").get("Use Nonlinear Conductivities",true);
+    nonLin_ = (useND) ? static_cast<Real>(1) : static_cast<Real>(0);
+    // Finite element fields.
+    int basisOrder = parlist.sublist("Problem").get("Basis Order",1);
+    if (basisOrder == 1)
+      basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C1_FEM<DeviceType,Real,Real>>();
+    else if (basisOrder == 2)
+      basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C2_FEM<DeviceType,Real,Real>>();
+    basisPtrs_.clear(); basisPtrs_.push_back(basisPtr_);
+    // Quadrature rules.
+    shards::CellTopology cellType = basisPtr_->getBaseCellTopology();        // get the cell type from any basis
+    Intrepid2::DefaultCubatureFactory cubFactory;                       // create cubature factory
+    int cubDegree = parlist.sublist("Problem").get("Volume Cubature Degree",2); // set cubature degree, e.g., 2
+    cellCub_ = cubFactory.create<DeviceType,Real,Real>(cellType, cubDegree);                       // create default cubature
+
+    int d = cellType.getDimension();
+    shards::CellTopology bdryCellType = cellType.getCellTopologyData(d-1, 0);
+    int bdryCubDegree = parlist.sublist("Problem").get("Boundary Cubature Degree",2); // set cubature degree, e.g., 2
+    bdryCub_ = cubFactory.create<DeviceType,Real,Real>(bdryCellType, bdryCubDegree);
+  }
+  
+  void residual(scalar_view & res,
+                const scalar_view u_coeff,
+                const scalar_view z_coeff = scalar_view(),
+                const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // GET DIMENSIONS
+    int c = fe_vol_->gradN().extent_int(0);
+    int f = fe_vol_->gradN().extent_int(1);
+    int p = fe_vol_->gradN().extent_int(2);
+    int d = fe_vol_->gradN().extent_int(3);
+    // INITIALIZE RESIDUAL
+    res = scalar_view("res", c, f);
+    // EVALUATE STATE ON FE BASIS
+    scalar_view U_eval("U_eval", c, p);
+    fe_vol_->evaluateValue(U_eval, u_coeff);
+    scalar_view gradU_eval("gradU_eval", c, p, d);
+    fe_vol_->evaluateGradient(gradU_eval, u_coeff);
+    // COMPUTE CONSTANT PDE COEFFICIENTS
+    scalar_view V("V", c, p, d);
+    scalar_view rhs("rhs", c, p);
+    computeCoefficients(V,rhs,z_param);
+    // COMPUTE DIFFUSIVITY
+    scalar_view kappa("kappa", c, p);
+    computeDiffusivity(kappa,U_eval,0);
+    // MULTIPLY kappa * grad(U)
+    scalar_view kappa_gradU("kappa_gradU", c, p, d);
+    fst::tensorMultiplyDataData(kappa_gradU,kappa,gradU_eval);
+    // INTEGRATE (kappa * grad(U)) . grad(N)
+    fst::integrate(res,kappa_gradU,fe_vol_->gradNdetJ(),false);
+    // MULTIPLY V . grad(U)
+    scalar_view V_gradU("V_gradU", c, p);
+    fst::dotMultiplyDataData(V_gradU,V,gradU_eval);
+    // INTEGRATE (V . grad(U)) * N
+    fst::integrate(res,V_gradU,fe_vol_->NdetJ(),true);
+    // ADD RHS TO RESIDUAL
+    fst::integrate(res,rhs,fe_vol_->NdetJ(),true);
+
+    // APPLY NEUMANN CONDITIONS: Sideset 3, 6
+    // ---> Nothing to do
+    int numLocalSideIds(0);
+    // APPLY STEFAN-BOLTZMANN CONDITIONS: Sideset 1, 2, 4, 5
+    std::vector<int> sidesets = {1, 2, 4, 5};
+    for (int i = 0; i < 4; ++i) {
+      numLocalSideIds = bdryCellLocIds_[sidesets[i]].size();
+      const int numCubPerSide = bdryCub_->getNumPoints();
+      for (int j = 0; j < numLocalSideIds; ++j) {
+        int numCellsSide = bdryCellLocIds_[sidesets[i]][j].size();
+        if (numCellsSide) {
+          // Get U coefficients on Stefan-Boltzmann boundary
+          scalar_view u_coeff_bdry = getBoundaryCoeff(u_coeff, sidesets[i], j);
+          // Evaluate U on FE basis
+          scalar_view valU_eval_bdry("valU_eval_bdry", numCellsSide, numCubPerSide);
+          fe_bdry_[sidesets[i]][j]->evaluateValue(valU_eval_bdry, u_coeff_bdry);
+          // Compute Stefan-Boltzmann residual
+          scalar_view sb_valU("sb_valU", numCellsSide, numCubPerSide);
+          computeStefanBoltzmann(sb_valU,valU_eval_bdry,sidesets[i],j,0);
+          scalar_view sbRes("sbRes", numCellsSide, f);
+          fst::integrate(sbRes,sb_valU,fe_bdry_[sidesets[i]][j]->NdetJ(),false);
+          // Add Stefan-Boltzmann residual to volume residual
+          for (int k = 0; k < numCellsSide; ++k) {
+            int cidx = bdryCellLocIds_[sidesets[i]][j][k];
+            for (int l = 0; l < f; ++l) { 
+              res(cidx,l) += sbRes(k,l);
+            }
+          }
+        }
+      }
+    }
+    // APPLY ROBIN CONTROLS: Sideset 0
+    int sideset = 0;
+    numLocalSideIds = bdryCellLocIds_[sideset].size();
+    const int numCubPerSide = bdryCub_->getNumPoints();
+    for (int j = 0; j < numLocalSideIds; ++j) {
+      int numCellsSide = bdryCellLocIds_[sideset][j].size();
+      if (numCellsSide) {
+        // Get U coefficients on Robin boundary
+        scalar_view u_coeff_bdry = getBoundaryCoeff(u_coeff, sideset, j);
+        // Evaluate U on FE basis
+        scalar_view valU_eval_bdry("valU_eval_bdry", numCellsSide, numCubPerSide);
+        fe_bdry_[sideset][j]->evaluateValue(valU_eval_bdry, u_coeff_bdry);
+        // Compute Stefan-Boltzmann residual
+        scalar_view robinVal("robinVal", numCellsSide, numCubPerSide);
+        scalar_view valZ_eval_bdry;
+        if (z_coeff != scalar_view()) {
+          // Get Z coefficients on Robin boundary
+          scalar_view z_coeff_bdry = getBoundaryCoeff(z_coeff, sideset, j);
+          // Evaluate Z on FE basis
+          valZ_eval_bdry = scalar_view("valZ_eval_bdry", numCellsSide, numCubPerSide);
+          fe_bdry_[sideset][j]->evaluateValue(valZ_eval_bdry, z_coeff_bdry);
+        }
+        else {
+          valZ_eval_bdry = scalar_view();
+        }
+        computeRobin(robinVal,valU_eval_bdry,valZ_eval_bdry,sideset,j,0);
+        scalar_view robinRes("robinRes", numCellsSide, f);
+        fst::integrate(robinRes,robinVal,fe_bdry_[sideset][j]->NdetJ(),false);
+        // Add Stefan-Boltzmann residual to volume residual
+        for (int k = 0; k < numCellsSide; ++k) {
+          int cidx = bdryCellLocIds_[sideset][j][k];
+          for (int l = 0; l < f; ++l) { 
+            res(cidx,l) += robinRes(k,l);
+          }
+        }
+      }
+    }
+//    // APPLY DIRICHLET CONTROLS: Sideset 0
+//    int sideset = 0;
+//    numLocalSideIds = bdryCellLocIds_[sideset].size();
+//    for (int j = 0; j < numLocalSideIds; ++j) {
+//      int numCellsSide = bdryCellLocIds_[sideset][j].size();
+//      int numBdryDofs = fidx_[j].size();
+//      for (int k = 0; k < numCellsSide; ++k) {
+//        int cidx = bdryCellLocIds_[sideset][j][k];
+//        for (int l = 0; l < numBdryDofs; ++l) {
+//          (*res)(cidx,fidx_[j][l])
+//            = (*u_coeff)(cidx,fidx_[j][l]) - (*z_coeff)(cidx,fidx_[j][l]);
+//        }
+//      }
+//    }
+  }
+  
+  void Jacobian_1(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // GET DIMENSIONS
+    int c = fe_vol_->gradN().extent_int(0);
+    int f = fe_vol_->gradN().extent_int(1);
+    int p = fe_vol_->gradN().extent_int(2);
+    int d = fe_vol_->gradN().extent_int(3);
+    // INITILAIZE JACOBIAN
+    jac = scalar_view("jac", c, f, f);
+    // EVALUATE STATE ON FE BASIS
+    scalar_view U_eval("U_eval", c, p);
+    fe_vol_->evaluateValue(U_eval, u_coeff);
+    scalar_view gradU_eval("gradU_eval", c, p, d);
+    fe_vol_->evaluateGradient(gradU_eval, u_coeff);
+    // COMPUTE CONSTNAT PDE COEFFICIENTS
+    scalar_view V("V", c, p, d);
+    scalar_view rhs("rhs", c, p);
+    computeCoefficients(V,rhs,z_param);
+    // COMPUTE DIFFUSIVITY
+    scalar_view kappa("kappa", c, p);
+    computeDiffusivity(kappa,U_eval,0);
+    scalar_view d_kappa("d_kappa", c, p);
+    computeDiffusivity(d_kappa,U_eval,1);
+    // MULTIPLY kappa * grad(N)
+    scalar_view kappa_gradN("kappa_gradN", c, f, p, d);
+    fst::tensorMultiplyDataField(kappa_gradN,kappa,fe_vol_->gradN());
+    // INTEGRATE (kappa * grad(N)) . grad(N)
+    fst::integrate(jac,kappa_gradN,fe_vol_->gradNdetJ(),false);
+    // MULTIPLY d_kappa * grad(U)
+    scalar_view d_kappa_gradU("kappa_gradU", c, p, d);
+    fst::tensorMultiplyDataData(d_kappa_gradU,d_kappa,gradU_eval);
+    // MULTIPLY (d_kappa * grad(U)) . grad(N)
+    scalar_view d_kappa_gradU_gradN("d_kappa_gradU_gradN", c, f, p);
+    fst::dotMultiplyDataField(d_kappa_gradU_gradN,d_kappa_gradU,fe_vol_->gradNdetJ());
+    // INTEGRATE (d_kappa * grad(U)) . grad(N) * N
+    fst::integrate(jac,d_kappa_gradU_gradN,fe_vol_->N(),true);
+    // MULTIPLY V . grad(N)
+    scalar_view V_gradN("V_gradN", c, f, p);
+    fst::dotMultiplyDataField(V_gradN,V,fe_vol_->gradN());
+    // INTEGRATE (V . grad(U)) * N
+    fst::integrate(jac,fe_vol_->NdetJ(),V_gradN,true);
+
+    // APPLY NEUMANN CONDITIONS: Sideset 3, 6
+    // ---> Nothing to do
+    int numLocalSideIds(0);
+    // APPLY STEFAN-BOLTZMANN CONDITIONS: Sideset 1, 2, 4, 5
+    std::vector<int> sidesets = {1, 2, 4, 5};
+    for (int i = 0; i < 4; ++i) {
+      numLocalSideIds = bdryCellLocIds_[sidesets[i]].size();
+      const int numCubPerSide = bdryCub_->getNumPoints();
+      for (int j = 0; j < numLocalSideIds; ++j) {
+        int numCellsSide = bdryCellLocIds_[sidesets[i]][j].size();
+        if (numCellsSide) {
+          // Get U coefficients on Stefan-Boltzmann boundary
+          scalar_view u_coeff_bdry = getBoundaryCoeff(u_coeff, sidesets[i], j);
+          // Evaluate U on FE basis
+          scalar_view valU_eval_bdry("valU_eval_bdry", numCellsSide, numCubPerSide);
+          fe_bdry_[sidesets[i]][j]->evaluateValue(valU_eval_bdry, u_coeff_bdry);
+          // Compute Stefan-Boltzmann residual
+          scalar_view sb_derivU("sb_derivU", numCellsSide, numCubPerSide);
+          computeStefanBoltzmann(sb_derivU,valU_eval_bdry,sidesets[i],j,1);
+          scalar_view sb_derivU_N("sb_derivU_N", numCellsSide, f, numCubPerSide);
+          fst::scalarMultiplyDataField(sb_derivU_N,sb_derivU,fe_bdry_[sidesets[i]][j]->N());
+          scalar_view sbJac("sbJac", numCellsSide, f, f);
+          fst::integrate(sbJac,sb_derivU_N,fe_bdry_[sidesets[i]][j]->NdetJ(),false);
+          // Add Stefan-Boltzmann residual to volume residual
+          for (int k = 0; k < numCellsSide; ++k) {
+            int cidx = bdryCellLocIds_[sidesets[i]][j][k];
+            for (int l = 0; l < f; ++l) { 
+              for (int m = 0; m < f; ++m) { 
+                jac(cidx,l,m) += sbJac(k,l,m);
+              }
+            }
+          }
+        }
+      }
+    }
+    // APPLY ROBIN CONTROL: Sideset 0
+    int sideset = 0;
+    numLocalSideIds = bdryCellLocIds_[sideset].size();
+    const int numCubPerSide = bdryCub_->getNumPoints();
+    for (int j = 0; j < numLocalSideIds; ++j) {
+      int numCellsSide = bdryCellLocIds_[sideset][j].size();
+      if (numCellsSide) {
+        // Get U coefficients on Robin boundary
+        scalar_view u_coeff_bdry = getBoundaryCoeff(u_coeff, sideset, j);
+        // Evaluate U on FE basis
+        scalar_view valU_eval_bdry("valU_eval_bdry", numCellsSide, numCubPerSide);
+        fe_bdry_[sideset][j]->evaluateValue(valU_eval_bdry, u_coeff_bdry);
+        // Compute Stefan-Boltzmann residual
+        scalar_view robinVal("robinVal", numCellsSide, numCubPerSide);
+        scalar_view valZ_eval_bdry;
+        if (z_coeff != scalar_view()) {
+          // Get Z coefficients on Robin boundary
+          scalar_view z_coeff_bdry = getBoundaryCoeff(z_coeff, sideset, j);
+          // Evaluate Z on FE basis
+          valZ_eval_bdry = scalar_view("valZ_eval_bdry", numCellsSide, numCubPerSide);
+          fe_bdry_[sideset][j]->evaluateValue(valZ_eval_bdry, z_coeff_bdry);
+        }
+        else {
+          valZ_eval_bdry = scalar_view();
+        }
+        computeRobin(robinVal,valU_eval_bdry,valZ_eval_bdry,sideset,j,1,1);
+        scalar_view robinVal_N("robinVal_N", numCellsSide, f, numCubPerSide);
+        fst::scalarMultiplyDataField(robinVal_N,robinVal,fe_bdry_[sideset][j]->N());
+        scalar_view robinJac("robinJac", numCellsSide, f, f);
+        fst::integrate(robinJac,robinVal_N,fe_bdry_[sideset][j]->NdetJ(),false);
+        // Add Stefan-Boltzmann residual to volume residual
+        for (int k = 0; k < numCellsSide; ++k) {
+          int cidx = bdryCellLocIds_[sideset][j][k];
+          for (int l = 0; l < f; ++l) { 
+            for (int m = 0; m < f; ++m) { 
+              jac(cidx,l,m) += robinJac(k,l,m);
+            }
+          }
+        }
+      }
+    }
+//    // APPLY DIRICHLET CONDITIONS: Sideset 0
+//    int sideset = 0;
+//    numLocalSideIds = bdryCellLocIds_[sideset].size();
+//    for (int j = 0; j < numLocalSideIds; ++j) {
+//      int numCellsSide = bdryCellLocIds_[sideset][j].size();
+//      int numBdryDofs = fidx_[j].size();
+//      for (int k = 0; k < numCellsSide; ++k) {
+//        int cidx = bdryCellLocIds_[sideset][j][k];
+//        for (int l = 0; l < numBdryDofs; ++l) {
+//          for (int m = 0; m < f; ++m) {
+//            (*jac)(cidx,fidx_[j][l],m) = static_cast<Real>(0);
+//          }
+//          (*jac)(cidx,fidx_[j][l],fidx_[j][l]) = static_cast<Real>(1);
+//        }
+//      }
+//    }
+  }
+  
+  void Jacobian_2(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    if (z_coeff != scalar_view()) {
+      // GET DIMENSIONS
+      int c = fe_vol_->gradN().extent_int(0);
+      int f = fe_vol_->gradN().extent_int(1);
+      // INITILAIZE JACOBIAN
+      jac = scalar_view("jac", c, f, f);
+      // APPLY ROBIN CONTROL: Sideset 0
+      int sideset = 0;
+      int numLocalSideIds = bdryCellLocIds_[sideset].size();
+      const int numCubPerSide = bdryCub_->getNumPoints();
+      for (int j = 0; j < numLocalSideIds; ++j) {
+        int numCellsSide = bdryCellLocIds_[sideset][j].size();
+        if (numCellsSide) {
+          // Get U coefficients on Robin boundary
+          scalar_view u_coeff_bdry = getBoundaryCoeff(u_coeff, sideset, j);
+          // Get Z coefficients on Robin boundary
+          scalar_view z_coeff_bdry = getBoundaryCoeff(z_coeff, sideset, j);
+          // Evaluate U on FE basis
+          scalar_view valU_eval_bdry("valU_eval_bdry", numCellsSide, numCubPerSide);
+          fe_bdry_[sideset][j]->evaluateValue(valU_eval_bdry, u_coeff_bdry);
+          // Evaluate Z on FE basis
+          scalar_view valZ_eval_bdry("valZ_eval_bdry", numCellsSide, numCubPerSide);
+          fe_bdry_[sideset][j]->evaluateValue(valZ_eval_bdry, z_coeff_bdry);
+          // Compute Stefan-Boltzmann residual
+          scalar_view robinVal("robinVal", numCellsSide, numCubPerSide);
+          computeRobin(robinVal,valU_eval_bdry,valZ_eval_bdry,sideset,j,1,2);
+          scalar_view robinVal_N("robinVal_N", numCellsSide, f, numCubPerSide);
+          fst::scalarMultiplyDataField(robinVal_N,robinVal,fe_bdry_[sideset][j]->N());
+          scalar_view robinJac("robinJac", numCellsSide, f, f);
+          fst::integrate(robinJac,robinVal_N,fe_bdry_[sideset][j]->NdetJ(),false);
+          // Add Stefan-Boltzmann residual to volume residual
+          for (int k = 0; k < numCellsSide; ++k) {
+            int cidx = bdryCellLocIds_[sideset][j][k];
+            for (int l = 0; l < f; ++l) { 
+              for (int m = 0; m < f; ++m) { 
+                jac(cidx,l,m) += robinJac(k,l,m);
+              }
+            }
+          }
+        }
+      }
+    }
+    else {
+      throw Exception::Zero(">>> (StochasticStefanBoltzmannPDE::Jacobian_2): Jacobian_2 is zero.");
+    }
+//    // APPLY DIRICHLET CONTROLS: Sideset 0
+//    int sideset = 0;
+//    int numLocalSideIds = bdryCellLocIds_[sideset].size();
+//    for (int j = 0; j < numLocalSideIds; ++j) {
+//      int numCellsSide = bdryCellLocIds_[sideset][j].size();
+//      int numBdryDofs = fidx_[j].size();
+//      for (int k = 0; k < numCellsSide; ++k) {
+//        int cidx = bdryCellLocIds_[sideset][j][k];
+//        for (int l = 0; l < numBdryDofs; ++l) {
+//          (*jac)(cidx,fidx_[j][l],fidx_[j][l]) = static_cast<Real>(-1);
+//        }
+//      }
+//    }
+  }
+
+  void Jacobian_3(std::vector<scalar_view> & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    if (z_param != ROL::nullPtr) {
+      // GET DIMENSIONS
+      int c = fe_vol_->gradN().extent_int(0);
+      int f = fe_vol_->gradN().extent_int(1);
+      int p = fe_vol_->gradN().extent_int(2);
+      int d = fe_vol_->gradN().extent_int(3);
+      // INITIALIZE RESIDUAL
+      jac.resize(z_param->size());
+      jac[0] = scalar_view("jac", c, f);
+      // EVALUATE STATE ON FE BASIS
+      scalar_view U_eval("U_eval", c, p);
+      fe_vol_->evaluateValue(U_eval, u_coeff);
+      scalar_view gradU_eval("gradU_eval", c, p, d);
+      fe_vol_->evaluateGradient(gradU_eval, u_coeff);
+      ROL::Ptr<std::vector<Real>> one = ROL::makePtr<std::vector<Real>>(z_param->size(), 1); 
+      // COMPUTE CONSTANT PDE COEFFICIENTS
+      scalar_view V("V", c, p, d);
+      scalar_view rhs("rhs", c, p);
+      computeCoefficients(V,rhs,one);
+      // MULTIPLY V . grad(U)
+      scalar_view V_gradU("V_gradU", c, p);
+      fst::dotMultiplyDataData(V_gradU,V,gradU_eval);
+      // INTEGRATE (V . grad(U)) * N
+      fst::integrate(jac[0],V_gradU,fe_vol_->NdetJ(),true);
+    }
+    else{
+      throw Exception::Zero(">>> (StochasticStefanBoltzmannPDE::Jacobian_3): Jacobian_3 is zero.");
+    }
+  }
+
+  void Hessian_11(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // GET DIMENSIONS
+    int c = fe_vol_->gradN().extent_int(0);
+    int f = fe_vol_->gradN().extent_int(1);
+    int p = fe_vol_->gradN().extent_int(2);
+    int d = fe_vol_->gradN().extent_int(3);
+    // INITILAIZE JACOBIAN
+    hess = scalar_view("hess", c, f, f);
+    scalar_view l_coeff_dbc("l_coeff_dbc", c, f);
+    Kokkos::deep_copy(l_coeff_dbc, l_coeff);
+//    // APPLY DIRICHLET CONDITIONS TO LAGRANGE MULTIPLIERS: Sideset 0
+//    int sideset = 0;
+//    int numLocalSideIds = bdryCellLocIds_[sideset].size();
+//    for (int j = 0; j < numLocalSideIds; ++j) {
+//      int numCellsSide = bdryCellLocIds_[sideset][j].size();
+//      int numBdryDofs = fidx_[j].size();
+//      for (int k = 0; k < numCellsSide; ++k) {
+//        int cidx = bdryCellLocIds_[sideset][j][k];
+//        for (int l = 0; l < numBdryDofs; ++l) {
+//          (*l_coeff_dbc)(cidx,fidx_[j][l]) = static_cast<Real>(0);
+//        }
+//      }
+//    }
+    // EVALUATE STATE ON FE BASIS
+    scalar_view U_eval("U_eval", c, p);
+    fe_vol_->evaluateValue(U_eval, u_coeff);
+    scalar_view gradU_eval("gradU_eval", c, p, d);
+    fe_vol_->evaluateGradient(gradU_eval, u_coeff);
+    scalar_view gradL_eval("gradL_eval", c, p, d);
+    fe_vol_->evaluateGradient(gradL_eval, l_coeff_dbc);
+    // COMPUTE DIFFUSIVITY
+    scalar_view d1_kappa("d1_kappa", c, p);
+    computeDiffusivity(d1_kappa,U_eval,1);
+    scalar_view d2_kappa("d2_kappa", c, p);
+    computeDiffusivity(d2_kappa,U_eval,2);
+    // MULTIPLY d1_kappa * grad(L)
+    scalar_view d1_kappa_gradL("d1_kappa_gradL", c, p, d);
+    fst::tensorMultiplyDataData(d1_kappa_gradL,d1_kappa,gradL_eval);
+    // MULTIPLY (d_1kappa * grad(L)) . grad(N)det(J)
+    scalar_view d1_kappa_gradL_gradNdetJ("d1_kappa_gradL_gradNdetJ", c, f, p);
+    fst::dotMultiplyDataField(d1_kappa_gradL_gradNdetJ,d1_kappa_gradL,fe_vol_->gradNdetJ());
+    // INTEGRATE (d1_kappa * grad(L)) . grad(N) * N
+    fst::integrate(hess,fe_vol_->N(),d1_kappa_gradL_gradNdetJ,false);
+    // MULTIPLY d1_kappa * grad(L) . grad(N)
+    scalar_view d1_kappa_gradL_gradN("d1_kappa_gradL_gradN", c, f, p);
+    fst::dotMultiplyDataField(d1_kappa_gradL_gradN,d1_kappa_gradL,fe_vol_->gradN());
+    // INTEGRATE (d1_kappa * grad(L)) . grad(N) * N
+    fst::integrate(hess,d1_kappa_gradL_gradN,fe_vol_->NdetJ(),true);
+    // MULTIPLY grad(U) . grad(L)
+    scalar_view gradU_gradL("gradU_gradL", c, p);
+    fst::dotMultiplyDataData(gradU_gradL,gradU_eval,gradL_eval);
+    // MULTIPLY d2_kappa * grad(U) . grad(L)
+    scalar_view d2_kappa_gradU_gradL("d2_kappa_gradU_gradL", c, p);
+    fst::scalarMultiplyDataData(d2_kappa_gradU_gradL,d2_kappa,gradU_gradL);
+    // MULTIPLY d2_kappa * grad(U) . grad(L) * N
+    scalar_view d2_kappa_gradU_gradL_N("d2_kappa_gradU_gradL_N", c, f, p);
+    fst::scalarMultiplyDataField(d2_kappa_gradU_gradL_N,d2_kappa_gradU_gradL,fe_vol_->N());
+    // INTEGRATE (d2_kappa * grad(U) . grad(L) ) * N * N
+    fst::integrate(hess,d2_kappa_gradU_gradL_N,fe_vol_->NdetJ(),true);
+    // APPLY NEUMANN CONDITIONS: Sideset 3, 6
+    // ---> Nothing to do
+    // APPLY STEFAN-BOLTZMANN CONDITIONS: Sideset 1, 2, 4, 5
+    std::vector<int> sidesets = {1, 2, 4, 5};
+    for (int i = 0; i < 4; ++i) {
+      int numLocalSideIds = bdryCellLocIds_[sidesets[i]].size();
+      const int numCubPerSide = bdryCub_->getNumPoints();
+      for (int j = 0; j < numLocalSideIds; ++j) {
+        int numCellsSide = bdryCellLocIds_[sidesets[i]][j].size();
+        if (numCellsSide) {
+          // Get U coefficients on Stefan-Boltzmann boundary
+          scalar_view u_coeff_bdry = getBoundaryCoeff(u_coeff, sidesets[i], j);
+          scalar_view l_coeff_bdry = getBoundaryCoeff(l_coeff_dbc, sidesets[i], j);
+          // Evaluate U on FE basis
+          scalar_view valU_eval_bdry("valU_eval_bdry", numCellsSide, numCubPerSide);
+          fe_bdry_[sidesets[i]][j]->evaluateValue(valU_eval_bdry, u_coeff_bdry);
+          scalar_view valL_eval_bdry("valL_eval_bdry", numCellsSide, numCubPerSide);
+          fe_bdry_[sidesets[i]][j]->evaluateValue(valL_eval_bdry, l_coeff_bdry);
+          // Compute Stefan-Boltzmann residual
+          scalar_view sb_derivU("sb_derivU", numCellsSide, numCubPerSide);
+          computeStefanBoltzmann(sb_derivU,valU_eval_bdry,sidesets[i],j,2);
+          scalar_view sb_derivU_L("sb_derivU_L", numCellsSide, numCubPerSide);
+          fst::scalarMultiplyDataData(sb_derivU_L,sb_derivU,valL_eval_bdry);
+          scalar_view sb_derivU_L_N("sb_derivU_L_N", numCellsSide, f, numCubPerSide);
+          fst::scalarMultiplyDataField(sb_derivU_L_N,sb_derivU_L,fe_bdry_[sidesets[i]][j]->N());
+          scalar_view sbHess("sbHess", numCellsSide, f, f);
+          fst::integrate(sbHess,sb_derivU_L_N,fe_bdry_[sidesets[i]][j]->NdetJ(),false);
+          // Add Stefan-Boltzmann residual to volume residual
+          for (int k = 0; k < numCellsSide; ++k) {
+            int cidx = bdryCellLocIds_[sidesets[i]][j][k];
+            for (int l = 0; l < f; ++l) { 
+              for (int m = 0; m < f; ++m) { 
+                hess(cidx,l,m) += sbHess(k,l,m);
+              }
+            }
+          }
+        }
+      }
+    }
+    // APPLY ROBIN CONTROL: Sideset 0
+    // --> Nothing to do
+  }
+
+  void Hessian_12(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (StochasticStefanBoltzmannPDE::Hessian_12): Hessian is zero.");
+  }
+
+  void Hessian_21(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (StochasticStefanBoltzmannPDE::Hessian_21): Hessian is zero.");
+  }
+
+  void Hessian_22(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (StochasticStefanBoltzmannPDE::Hessian_22): Hessian is zero.");
+  }
+
+  void Hessian_13(std::vector<scalar_view> & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    if ( z_param != ROL::nullPtr ) {
+      // GET DIMENSIONS
+      int c = fe_vol_->gradN().extent_int(0);
+      int f = fe_vol_->gradN().extent_int(1);
+      int p = fe_vol_->gradN().extent_int(2);
+      int d = fe_vol_->gradN().extent_int(3);
+      // INITILAIZE HESSIAN
+      hess.resize(z_param->size());
+      hess[0] = scalar_view("hess", c, f);
+      // EVALUATE STATE ON FE BASIS
+      scalar_view L_eval("L_eval", c, p);
+      fe_vol_->evaluateValue(L_eval, l_coeff);
+      // COMPUTE CONSTANT PDE COEFFICIENTS
+      scalar_view V("V", c, p, d);
+      scalar_view rhs("rhs", c, p);
+      ROL::Ptr<std::vector<Real>> one = ROL::makePtr<std::vector<Real>>(z_param->size(), 1); 
+      computeCoefficients(V,rhs,one);
+      // MULTIPLY V . grad(N)
+      scalar_view V_gradN("V_gradN", c, f, p);
+      fst::dotMultiplyDataField(V_gradN,V,fe_vol_->gradNdetJ());
+      // INTEGRATE (V . grad(U)) * N
+      fst::integrate(hess[0],L_eval,V_gradN,false);
+    }
+    else {
+      throw Exception::Zero(">>> (StochasticStefanBoltzmannPDE::Hessian_13): Hessian_13 is zero.");
+    }
+  }
+
+  void Hessian_23(std::vector<scalar_view> & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (StochasticStefanBoltzmannPDE::Hessian_23): Hessian_23 is zero.");
+  }
+
+  void Hessian_31(std::vector<scalar_view> & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    if ( z_param != ROL::nullPtr ) {
+      // GET DIMENSIONS
+      int c = fe_vol_->gradN().extent_int(0);
+      int f = fe_vol_->gradN().extent_int(1);
+      int p = fe_vol_->gradN().extent_int(2);
+      int d = fe_vol_->gradN().extent_int(3);
+      // INITILAIZE HESSIAN
+      hess.resize(z_param->size());
+      hess[0] = scalar_view("hess", c, f);
+      // EVALUATE STATE ON FE BASIS
+      scalar_view L_eval("L_eval", c, p);
+      fe_vol_->evaluateValue(L_eval, l_coeff);
+      // COMPUTE CONSTANT PDE COEFFICIENTS
+      scalar_view V("V", c, p, d);
+      scalar_view rhs("rhs", c, p);
+      ROL::Ptr<std::vector<Real>> one = ROL::makePtr<std::vector<Real>>(z_param->size(), 1); 
+      computeCoefficients(V,rhs,one);
+      // MULTIPLY V . grad(N)
+      scalar_view V_gradN("V_gradN", c, f, p);
+      fst::dotMultiplyDataField(V_gradN,V,fe_vol_->gradNdetJ());
+      // INTEGRATE (V . grad(U)) * N
+      fst::integrate(hess[0],L_eval,V_gradN,false);
+    }
+    else {
+      throw Exception::Zero(">>> (StochasticStefanBoltzmannPDE::Hessian_31): Hessian_31 is zero.");
+    }
+  }
+
+  void Hessian_32(std::vector<scalar_view> & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (StochasticStefanBoltzmannPDE::Hessian_32): Hessian_32 is zero.");
+  }
+
+  void Hessian_33(std::vector<std::vector<scalar_view>> & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (StochasticStefanBoltzmannPDE::Hessian_33): Hessian_33 is zero.");
+  }
+
+  void RieszMap_1(scalar_view & riesz) override {
+    // GET DIMENSIONS
+    int c = fe_vol_->N().extent_int(0);
+    int f = fe_vol_->N().extent_int(1);
+    // INITIALIZE RIESZ
+    riesz = scalar_view("riesz", c, f, f);
+    Kokkos::deep_copy(riesz, fe_vol_->stiffMat());
+    rst::add(riesz,fe_vol_->massMat());
+  }
+
+  void RieszMap_2(scalar_view & riesz) override {
+    throw Exception::NotImplemented(">>> (StochasticStefanBoltzmannPDE::RieszMap2): Not implemented.");
+    // GET DIMENSIONS
+    int c = fe_vol_->N().extent_int(0);
+    int f = fe_vol_->N().extent_int(1);
+    // INITIALIZE RIESZ
+    riesz = scalar_view("riesz", c, f, f);
+    Kokkos::deep_copy(riesz,fe_vol_->massMat());
+  }
+ 
+  std::vector<basis_ptr> getFields(void) override {
+    return basisPtrs_;
+  }
+
+  void setCellNodes(const scalar_view &volCellNodes,
+                    const std::vector<std::vector<scalar_view>> &bdryCellNodes, 
+                    const std::vector<std::vector<std::vector<int>>> &bdryCellLocIds ) override {
+    volCellNodes_   = volCellNodes;
+    bdryCellNodes_  = bdryCellNodes;
+    bdryCellLocIds_ = bdryCellLocIds;
+    // Finite element definition
+    fe_vol_ = ROL::makePtr<fe_type>(volCellNodes_,basisPtr_,cellCub_);
+    // Set local boundary DOFs
+    fidx_ = fe_vol_->getBoundaryDofs();
+    // Construct boundary FEs
+    const int numSidesets = bdryCellNodes.size();
+    fe_bdry_.resize(numSidesets);
+    for(int i = 0; i < numSidesets; ++i) {
+      int numLocSides = bdryCellNodes[i].size();
+      fe_bdry_[i].resize(numLocSides);
+      for (int j = 0; j < numLocSides; ++j) {
+        if (bdryCellNodes_[i][j] != scalar_view()) {
+          fe_bdry_[i][j] = ROL::makePtr<fe_type>(bdryCellNodes_[i][j],basisPtr_,bdryCub_,j);
+        }
+      }
+    }
+  }
+
+  const ROL::Ptr<fe_type> getVolFE(void) const {
+    return fe_vol_;
+  }
+
+  const std::vector<ROL::Ptr<fe_type>> getBdryFE(const int sideset) const {
+    return fe_bdry_[sideset];
+  }
+
+  const std::vector<std::vector<int>> getBdryCellLocIds(const int sideset) const {
+    return bdryCellLocIds_[sideset];
+  }
+ 
+private:
+    
+  /***************************************************************************/
+  /************** EVALUATE PDE COEFFICIENTS AT DOF COORDINATES ***************/
+  /***************************************************************************/
+  Real evaluateDiffusivity(Real u, const std::vector<Real> & x, int deriv = 0) const {
+    const std::vector<Real> param = PDE<Real,DeviceType>::getParameter();
+    const int size_w = 11, size_a = 9;
+    if ( x[1] < xmid_ ) {
+      // Water Thermal Conductivity: 0.5818 at 280K and 0.6797 at 370K
+      std::vector<Real> param_w(size_w,0);
+      for (int i = 0; i < size_w; ++i) {
+        Real pi = (static_cast<int>(param.size()) > i ? param[i] : static_cast<Real>(0.5));
+        param_w[i] = 0.01*pi;
+      }
+      std::vector<Real> c(3,0);
+      getWaterCoeff(c,param_w);
+      Real val = c[0] + nonLin_*(c[1] * u + c[2] * u * u);
+      const Real min = 0.1;
+      if ( deriv == 1 ) {
+        return (val < min ? static_cast<Real>(0) : nonLin_*(c[1] + static_cast<Real>(2)*c[2]*u));
+      }
+      if ( deriv == 2 ) {
+        return (val < min ? static_cast<Real>(0) : nonLin_*(static_cast<Real>(2)*c[2]));
+      }
+      return (val < min ? min : c[0] + nonLin_*(c[1] * u + c[2] * u * u));
+    }
+    else {
+      // Aluminum Thermal Conductivity: 236 at 273K and 240 at 400K
+      std::vector<Real> param_a(size_a,0);
+      for (int i = 0; i < size_a; ++i) {
+        Real pi = (static_cast<int>(param.size()) > size_w+i ? param[size_w+i] : static_cast<Real>(0.5));
+        param_a[i] = pi;
+      }
+      std::vector<Real> c(5,0);
+      getAluminumCoeff(c,param_a);
+      Real u2 = u*u, u3 = u2*u, u4 = u3*u;
+      Real val = c[0] + nonLin_*(c[1]*u + c[2]*u2 + c[3]*u3 + c[4]*u4);
+      const Real min = 100.0;
+      if ( deriv == 1 ) {
+        return (val < min ? static_cast<Real>(0) : 
+               nonLin_*(c[1] + static_cast<Real>(2)*c[2]*u
+                             + static_cast<Real>(3)*c[3]*u2
+                             + static_cast<Real>(4)*c[4]*u3));
+      }
+      if ( deriv == 2 ) {
+        return (val < min ? static_cast<Real>(0) : 
+               nonLin_*(static_cast<Real>(2)*c[2] +  static_cast<Real>(6)*c[3]*u
+                                                  + static_cast<Real>(12)*c[4]*u2));
+      }
+      return (val < min ? min : c[0] + nonLin_*(c[1]*u + c[2]*u2 + c[3]*u3 + c[4]*u4));
+    }
+  }
+
+  void evaluateVelocity(std::vector<Real> &adv, const std::vector<Real> &x, const std::vector<Real> &z_param) const {
+    if ( x[1] < xmid_ ) {
+      const std::vector<Real> param = PDE<Real,DeviceType>::getParameter();
+      Real p20 = (param.size() > 20 ? param[20] : static_cast<Real>(0.5));
+      const Real min = static_cast<Real>(0.1)*xmid_;
+      const Real max = static_cast<Real>(0.9)*xmid_;
+      const Real x1  = static_cast<Real>(0.5)*((max-min)*p20 + (max+min));
+      const Real mag = ((x[1] <  x1) ? x[1]/x1 : (xmid_-x[1])/(xmid_-x1));
+      adv[0] = -z_param[0]*mag;
+      adv[1] = static_cast<Real>(0);
+    }
+    else {
+      adv[0] = static_cast<Real>(0);
+      adv[1] = static_cast<Real>(0);
+    }
+  }
+
+  Real evaluateRHS(const std::vector<Real> &x) const {
+    return static_cast<Real>(0);
+  }
+
+  Real evaluateStefanBoltzmann(Real u, const std::vector<Real> &x,
+                               int sideset, int locSideId,
+                               int deriv = 0) const {
+    const std::vector<Real> param = PDE<Real,DeviceType>::getParameter();
+    // sig is the Stefan-Boltzmann constant
+    // c1 is sig times the emissivity [0.5,1.5]
+    // c2 is the ambient temperature away from the aluminum
+    // c3 is the thermal convectivity of air (5), oil (40), and water (440)
+    Real c1(0), c2(0), c3(0), sig(5.67e-8);
+    if ( sideset == 2 ) {
+      Real p21 = (param.size()>21 ? param[21] : static_cast<Real>(0.5));
+      Real p22 = (param.size()>22 ? param[22] : static_cast<Real>(0.5));
+      Real p23 = (param.size()>23 ? param[23] : static_cast<Real>(0.5));
+      c1 = SBscale_ * sig * (static_cast<Real>(0.09) + static_cast<Real>(5.e-3) * p21);
+      c2 = airTemp_             + static_cast<Real>(0.02*airTemp_) * p22;
+      c3 = static_cast<Real>(5) + static_cast<Real>(0.5)           * p23;
+    }
+    else if ( sideset == 4 || sideset == 5 ) {
+      Real p24 = (param.size()>24 ? param[24] : static_cast<Real>(0.5));
+      Real p25 = (param.size()>25 ? param[25] : static_cast<Real>(0.5));
+      Real p26 = (param.size()>26 ? param[26] : static_cast<Real>(0.5));
+      c1 = SBscale_ * sig * (static_cast<Real>(0.09) + static_cast<Real>(5.e-3) * p24);
+      c2 = engTemp_ + static_cast<Real>(0.2)*engTemp_ * p25;
+      c3 = static_cast<Real>(40) + static_cast<Real>(2) * p26;
+    }
+    else if ( sideset == 1 ) {
+      Real p27 = (param.size()>27 ? param[27] : static_cast<Real>(0.5));
+      Real p28 = (param.size()>28 ? param[28] : static_cast<Real>(0.5));
+      Real p29 = (param.size()>29 ? param[29] : static_cast<Real>(0.5));
+      c1 = SBscale_ * sig * (static_cast<Real>(0.09) + static_cast<Real>(5.e-3) * p27);
+      c2 = H2OTemp_ + static_cast<Real>(0.05)*H2OTemp_ * (p28 + static_cast<Real>(1));
+      c3 = static_cast<Real>(440) + static_cast<Real>(20) * p29;
+    }
+    if ( deriv == 1 ) {
+      return c1 * static_cast<Real>(4) * std::pow(u,3) + c3;
+    }
+    if ( deriv == 2 ) {
+      return c1 * static_cast<Real>(4) * static_cast<Real>(3) * std::pow(u,2);
+    }
+    return c1 * (std::pow(u,4) - std::pow(c2,4)) + c3 * (u - c2);
+  }
+
+  Real evaluateRobin(Real u, Real z, const std::vector<Real> &x,
+                     int sideset, int locSideId,
+                     int deriv = 0, int component = 1) const {
+    const std::vector<Real> param = PDE<Real,DeviceType>::getParameter();
+    Real p30 = (param.size()>30 ? param[30] : static_cast<Real>(0.5));
+    // c is the thermal convectivity of water (440)
+    Real c = static_cast<Real>(440) + static_cast<Real>(20) * p30;
+    if ( deriv == 1 ) {
+      return (component==1) ? c : -c;
+    }
+    if ( deriv > 1 ) {
+      return static_cast<Real>(0);
+    }
+    return c * (u - z);
+  }
+
+  Real evaluateRobin(Real u, std::vector<Real> &x,
+                     int sideset, int locSideId,
+                     int deriv = 0, int component = 1) const {
+    const std::vector<Real> param = PDE<Real,DeviceType>::getParameter();
+    Real p31 = (param.size()>31 ? param[31] : static_cast<Real>(0.5));
+    // c is the thermal convectivity of water (440)
+    Real c = static_cast<Real>(440) + static_cast<Real>(20) * p31;
+    if ( deriv == 1 ) {
+      return (component==1) ? c : static_cast<Real>(0);
+    }
+    if ( deriv > 1 ) {
+      return static_cast<Real>(0);
+    }
+    return c * (u - static_cast<Real>(293));
+  }
+ 
+  /***************************************************************************/
+  /************** COMPUTE PDE COEFFICIENTS AT DOFS ***************************/
+  /***************************************************************************/
+  void computeDiffusivity(scalar_view &kappa, const scalar_view u, int deriv = 0 ) const {
+    // GET DIMENSIONS
+    int c = fe_vol_->gradN().extent_int(0);
+    int p = fe_vol_->gradN().extent_int(2);
+    int d = fe_vol_->gradN().extent_int(3);
+    std::vector<Real> pt(d);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for ( int k = 0; k < d; ++k)
+          pt[k] = (fe_vol_->cubPts())(i,j,k);
+        // Compute diffusivity
+        kappa(i,j) = scale_*evaluateDiffusivity(u(i,j),pt,deriv);
+      }
+    }
+  }
+
+  void computeCoefficients(scalar_view &V, scalar_view &rhs,
+       const ROL::Ptr<const std::vector<Real>> &z_param = ROL::nullPtr) const {
+    // GET DIMENSIONS
+    int c = fe_vol_->gradN().extent_int(0);
+    int p = fe_vol_->gradN().extent_int(2);
+    int d = fe_vol_->gradN().extent_int(3);
+    std::vector<Real> pt(d), adv(d);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for ( int k = 0; k < d; ++k)
+          pt[k] = (fe_vol_->cubPts())(i,j,k);
+        // Compute advection velocity field V
+        if (z_param != ROL::nullPtr) { 
+          evaluateVelocity(adv,pt,*z_param);
+        }
+        else {
+          std::vector<Real> param = {advMag_};
+          evaluateVelocity(adv,pt,param);
+        }
+        for (int k = 0; k < d; ++k) {
+          V(i,j,k) = scale_*adv[k];
+        }
+        // Compute forcing term f
+        rhs(i,j) = -scale_*evaluateRHS(pt);
+      }
+    }
+  }
+
+  void computeStefanBoltzmann(scalar_view &sb,
+                              const scalar_view u,
+                              int sideset,
+                              int locSideId,
+                              int deriv = 0) const {
+    const int c = u.extent_int(0);
+    const int p = u.extent_int(1);
+    const int d = fe_vol_->gradN().extent_int(3);
+    std::vector<Real> pt(d);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k) {
+          pt[k] = (fe_bdry_[sideset][locSideId]->cubPts())(i,j,k);
+        }
+        sb(i,j) = evaluateStefanBoltzmann(u(i,j),pt,sideset,locSideId,deriv);
+      }
+    }
+  }
+
+  void computeRobin(scalar_view &robin,
+                    const scalar_view u,
+                    const scalar_view z,
+                    int sideset,
+                    int locSideId,
+                    int deriv = 0,
+                    int component = 1) const {
+    const int c = u.extent_int(0);
+    const int p = u.extent_int(1);
+    const int d = fe_vol_->gradN().extent_int(3);
+    std::vector<Real> pt(d);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k) {
+          pt[k] = (fe_bdry_[sideset][locSideId]->cubPts())(i,j,k);
+        }
+        if (z != scalar_view()) {
+          robin(i,j) = evaluateRobin(u(i,j),z(i,j),pt,sideset,locSideId,deriv,component);
+        }
+        else {
+          robin(i,j) = evaluateRobin(u(i,j),pt,sideset,locSideId,deriv,component);
+        }
+      }
+    }
+  }
+
+  /***************************************************************************/
+  /************** EXTRACT COEFFICIENTS ON BOUNDARY ***************************/
+  /***************************************************************************/
+  scalar_view getBoundaryCoeff(const scalar_view cell_coeff, int sideSet, int cell) const {
+    std::vector<int> bdryCellLocId = bdryCellLocIds_[sideSet][cell];
+    const int numCellsSide = bdryCellLocId.size();
+    const int f = basisPtr_->getCardinality();
+    
+    scalar_view bdry_coeff("bdry_coeff", numCellsSide, f);
+    for (int i = 0; i < numCellsSide; ++i) {
+      for (int j = 0; j < f; ++j) {
+        bdry_coeff(i, j) = cell_coeff(bdryCellLocId[i], j);
+      }
+    }
+    return bdry_coeff;
+  }
+
+  /***************************************************************************/
+  /************** COMPUTE LEAST SQUARES COEFFICIENTS *************************/
+  /***************************************************************************/
+  void getAluminumCoeff(std::vector<Real> &c, const std::vector<Real> &param) const {
+    const std::vector<Real> Ta = {273, 300, 350, 400, 500, 600, 700, 800, 900};
+    const std::vector<Real> Ka = {236, 237, 240, 240, 237, 232, 226, 220, 213};
+    Teuchos::LAPACK<int,Real> lapack;
+    const char trans = 'N';
+    const int m = Ta.size();
+    const int n = 5;
+    const int nrhs = 1;
+    const int lda = m;
+    const int ldb = m;
+    std::vector<Real> A(m*n,1);
+    std::vector<Real> b(m,1);
+    for (int i = 0; i < m; ++i) {
+      b[i] = Ka[i] + param[i];
+      for (int j = 0; j < n; ++j) {
+        A[j*m + i] = std::pow(Ta[i],j);
+      }
+    }
+    const int lwork = n + m;
+    std::vector<Real> work(lwork,0);
+    int info;
+    lapack.GELS(trans,m,n,nrhs,&A[0],lda,&b[0],ldb,&work[0],lwork,&info);
+    c.clear(); c.resize(n,0);
+    for (int i = 0; i < n; ++i) {
+      c[i] = b[i];
+    }
+  }
+
+  void getWaterCoeff(std::vector<Real> &c, const std::vector<Real> &param) const {
+    const std::vector<Real> Tw = {270, 280, 290, 300, 310, 320, 330, 340, 350, 370, 400};
+    const std::vector<Real> Kw = {0.5551, 0.5818, 0.5918, 0.6084, 0.6233, 0.6367, 0.6485, 0.6587, 0.6673, 0.6797, 0.6864};
+    Teuchos::LAPACK<int,Real> lapack;
+    const char trans = 'N';
+    const int m = Tw.size();
+    const int n = 3;
+    const int nrhs = 1;
+    const int lda = m;
+    const int ldb = m;
+    std::vector<Real> A(m*n,1);
+    std::vector<Real> b(m,1);
+    for (int i = 0; i < m; ++i) {
+      b[i] = Kw[i] + param[i];
+      for (int j = 0; j < n; ++j) {
+        A[j*m + i] = std::pow(Tw[i],j);
+      }
+    }
+    const int lwork = n + m;
+    std::vector<Real> work(lwork,0);
+    int info;
+    lapack.GELS(trans,m,n,nrhs,&A[0],lda,&b[0],ldb,&work[0],lwork,&info);
+    c.clear(); c.resize(n,0);
+    for (int i = 0; i < n; ++i) {
+      c[i] = b[i];
+    }
+  }
+
+}; // PDE_stefan_boltzmann
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/stokes/CMakeLists.txt b/packages/rol/example/PDE-OPT/stokes/CMakeLists.txt
index d0164db7df71..8c2806c3b4be 100644
--- a/packages/rol/example/PDE-OPT/stokes/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/stokes/CMakeLists.txt
@@ -9,13 +9,13 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_01
     SOURCES example_01.cpp
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     StokesDataCopy
     SOURCE_FILES
       input.xml plotresults.m
@@ -23,5 +23,33 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
   )
 
+ENDIF()
+
+IF(${PROJECT_NAME}_ENABLE_Intrepid2 AND
+   ${PROJECT_NAME}_ENABLE_Ifpack2  AND
+   ${PROJECT_NAME}_ENABLE_MueLu    AND
+   ${PROJECT_NAME}_ENABLE_Amesos2  AND
+   ${PROJECT_NAME}_ENABLE_Tpetra )
+
+  # Need ROL_TpetraMultiVector.hpp
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
+
+  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+    example_01_Kokkos
+    SOURCES example_01K.cpp
+    ARGS PrintItAll
+    NUM_MPI_PROCS 4
+    NUM_TOTAL_CORES_USED 4
+    PASS_REGULAR_EXPRESSION "TEST PASSED"
+    ADD_DIR_TO_NAME
+  )
+
+  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+    StokesDataCopyK
+    SOURCE_FILES
+      input.xml plotresults.m
+    SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}"
+    DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
+  )
 
 ENDIF()
diff --git a/packages/rol/example/PDE-OPT/stokes/example_01.cpp b/packages/rol/example/PDE-OPT/stokes/example_01.cpp
index 0360e37107f3..9f128f05ec1c 100644
--- a/packages/rol/example/PDE-OPT/stokes/example_01.cpp
+++ b/packages/rol/example/PDE-OPT/stokes/example_01.cpp
@@ -13,8 +13,7 @@
 
 #include "Teuchos_Comm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -39,7 +38,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int>> comm
     = Tpetra::getDefaultComm();
   const int myRank = comm->getRank();
diff --git a/packages/rol/example/PDE-OPT/stokes/example_01K.cpp b/packages/rol/example/PDE-OPT/stokes/example_01K.cpp
new file mode 100644
index 000000000000..e9d5f6c52f46
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/stokes/example_01K.cpp
@@ -0,0 +1,113 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  example_01.cpp
+    \brief Shows how to solve the Navier-Stokes control problem.
+*/
+
+#include "Teuchos_Comm.hpp"
+#include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
+
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Version.hpp"
+
+#include <iostream>
+#include <algorithm>
+//#include <fenv.h>
+
+#include "../TOOLS/linearpdeconstraintK.hpp"
+#include "../TOOLS/pdevectorK.hpp"
+#include "pde_stokesK.hpp"
+#include "mesh_stokesK.hpp"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+
+int main(int argc, char *argv[]) {
+  //feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
+
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+
+  /*** Initialize communicator. ***/
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  Kokkos::ScopeGuard kokkosScope (argc, argv);
+  auto comm = Tpetra::getDefaultComm();
+  const int myRank = comm->getRank();
+  if ((iprint > 0) && (myRank == 0))
+    outStream = ROL::makePtrFromRef(std::cout);
+  else
+    outStream = ROL::makePtrFromRef(bhs);
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+
+    /*** Read in XML input ***/
+    std::string filename = "input.xml";
+    auto parlist = ROL::getParametersFromXmlFile(filename);
+
+    /*** Initialize main data structure. ***/
+    auto meshMgr = ROL::makePtr<MeshManager_Stokes<RealT,DeviceT>>(*parlist);
+    // Initialize PDE describing Navier-Stokes equations.
+    auto pde = ROL::makePtr<PDE_Stokes<RealT,DeviceT>>(*parlist);
+    auto con = ROL::makePtr<Linear_PDE_Constraint<RealT,DeviceT>>(pde,meshMgr,comm,*parlist,*outStream);
+    // Cast the constraint and get the assembler.
+    auto assembler = con->getAssembler();
+    con->setSolveParameters(*parlist);
+    con->outputTpetraData();
+
+    // Create state vector and set to zeroes
+    auto u_ptr = assembler->createStateVector();    u_ptr->randomize();
+    auto r_ptr = assembler->createResidualVector(); r_ptr->randomize();
+    auto z_ptr = assembler->createControlVector();  z_ptr->putScalar(0);
+    auto up = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(u_ptr,pde,assembler);
+    auto rp = ROL::makePtr<PDE_DualSimVector<RealT,DeviceT>>(r_ptr,pde,assembler);
+    auto zp = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(z_ptr,pde,assembler);
+
+    // Run derivative checks
+    bool checkDeriv = parlist->sublist("Problem").get("Check derivatives",false);
+    if ( checkDeriv ) {
+      auto pp  = up->clone(); pp->randomize(-1.0,1.0);
+      auto dup = up->clone(); dup->randomize(-1.0,1.0);
+
+      *outStream << std::endl << "Check Jacobian_1 of Constraint" << std::endl;
+      con->checkApplyJacobian_1(*up,*zp,*dup,*rp,true,*outStream);
+      *outStream << std::endl << "Check Adjoint Jacobian_1 of Constraint" << std::endl;
+      con->checkAdjointConsistencyJacobian_1(*pp,*dup,*up,*zp,true,*outStream);
+      *outStream << std::endl << "Check Constraint Solve" << std::endl;
+      con->checkSolve(*up,*zp,*rp,true,*outStream);
+      *outStream << std::endl << "Check Inverse Jacobian_1 of Constraint" << std::endl;
+      con->checkInverseJacobian_1(*rp,*dup,*up,*zp,true,*outStream);
+      *outStream << std::endl << "Check Inverse Adjoint Jacobian_1 of Constraint" << std::endl;
+      con->checkInverseAdjointJacobian_1(*rp,*pp,*up,*zp,true,*outStream);
+    }
+
+    RealT tol(1.e-8);
+    up->zero();
+    con->solve(*rp,*up,*zp,tol);
+    con->outputTpetraData();
+    con->outputTpetraVector(u_ptr,"state.txt");
+    assembler->printMeshData(*outStream);
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/stokes/mesh_stokesK.hpp b/packages/rol/example/PDE-OPT/stokes/mesh_stokesK.hpp
new file mode 100644
index 000000000000..f522ba1da419
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/stokes/mesh_stokesK.hpp
@@ -0,0 +1,96 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+#include "../TOOLS/meshmanagerK.hpp"
+
+template<class Real, class DeviceType>
+class MeshManager_Stokes : public MeshManager_Rectangle<Real,DeviceType> {
+private:
+  int nx_;
+  int ny_;
+
+  ROL::Ptr<std::vector<std::vector<std::vector<int>>>>  meshSideSets_;
+
+public:
+
+  MeshManager_Stokes(ROL::ParameterList &parlist) : MeshManager_Rectangle<Real,DeviceType>(parlist) {
+    nx_ = parlist.sublist("Geometry").get("NX", 3);
+    ny_ = parlist.sublist("Geometry").get("NY", 1);
+    computeSideSets();
+  }
+
+  void computeSideSets() override {
+
+    int numSideSets = 5;
+    meshSideSets_ = ROL::makePtr<std::vector<std::vector<std::vector<int>>>>(numSideSets);
+
+    // Bottom
+    (*meshSideSets_)[0].resize(4);
+    (*meshSideSets_)[0][0].resize(nx_);
+    (*meshSideSets_)[0][1].resize(0);
+    (*meshSideSets_)[0][2].resize(0);
+    (*meshSideSets_)[0][3].resize(0);
+    // Right
+    (*meshSideSets_)[1].resize(4);
+    (*meshSideSets_)[1][0].resize(0);
+    (*meshSideSets_)[1][1].resize(ny_);
+    (*meshSideSets_)[1][2].resize(0);
+    (*meshSideSets_)[1][3].resize(0);
+    // Left
+    (*meshSideSets_)[2].resize(4);
+    (*meshSideSets_)[2][0].resize(0);
+    (*meshSideSets_)[2][1].resize(0);
+    (*meshSideSets_)[2][2].resize(0);
+    (*meshSideSets_)[2][3].resize(ny_);
+    // Top
+    (*meshSideSets_)[3].resize(4);
+    (*meshSideSets_)[3][0].resize(0);
+    (*meshSideSets_)[3][1].resize(0);
+    (*meshSideSets_)[3][2].resize(nx_);
+    (*meshSideSets_)[3][3].resize(0);
+    // Pressure Pinning
+    (*meshSideSets_)[4].resize(4);
+    //(*meshSideSets_)[4][0].resize(1);
+    //(*meshSideSets_)[4][1].resize(1);
+    //(*meshSideSets_)[4][2].resize(1);
+    //(*meshSideSets_)[4][3].resize(1);
+    (*meshSideSets_)[4][0].resize(1);
+    (*meshSideSets_)[4][1].resize(0);
+    (*meshSideSets_)[4][2].resize(0);
+    (*meshSideSets_)[4][3].resize(0);
+    
+    // Bottom
+    for (int i=0; i<nx_; ++i)
+      (*meshSideSets_)[0][0][i] = i;
+    // Right
+    for (int i=0; i<ny_; ++i)
+      (*meshSideSets_)[1][1][i] = (i+1)*nx_-1;
+    // Left
+    for (int i=0; i<ny_; ++i)
+      (*meshSideSets_)[2][3][i] = i*nx_;
+    // Top
+    for (int i=0; i<nx_; ++i)
+      (*meshSideSets_)[3][2][i] = i + nx_*(ny_-1);
+    // Pressure Pinning
+    //int offset = nx_ * (ny_/2) + (nx_-1)/2;
+    //(*meshSideSets_)[4][0][0] = offset + nx_ + 1;
+    //(*meshSideSets_)[4][1][0] = offset + nx_;
+    //(*meshSideSets_)[4][2][0] = offset;
+    //(*meshSideSets_)[4][3][0] = offset + 1;
+    (*meshSideSets_)[4][0][0] = 0;
+
+  } // computeSideSets
+
+  ROL::Ptr<std::vector<std::vector<std::vector<int>>>> getSideSets(
+              const bool verbose = false,
+              std::ostream & outStream = std::cout) const override { 
+    return meshSideSets_;
+  }
+
+}; // MeshManager_Stokes
diff --git a/packages/rol/example/PDE-OPT/stokes/pde_stokesK.hpp b/packages/rol/example/PDE-OPT/stokes/pde_stokesK.hpp
new file mode 100644
index 000000000000..350158b6f45a
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/stokes/pde_stokesK.hpp
@@ -0,0 +1,589 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  pde_stokes.hpp
+    \brief Implements the local PDE interface for the Stokes control problem.
+*/
+
+#ifndef PDE_STOKESK_HPP
+#define PDE_STOKESK_HPP
+
+#include "../TOOLS/pdeK.hpp"
+#include "../TOOLS/feK.hpp"
+#include "../TOOLS/fieldhelperK.hpp"
+
+#include "Intrepid2_HGRAD_QUAD_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_QUAD_C2_FEM.hpp"
+#include "Intrepid2_DefaultCubatureFactory.hpp"
+#include "Intrepid2_FunctionSpaceTools.hpp"
+#include "Intrepid2_CellTools.hpp"
+
+#include "ROL_Ptr.hpp"
+
+template <class Real, class DeviceType>
+class PDE_Stokes : public PDE<Real, DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+private:
+  // Finite element basis information
+  basis_ptr basisPtrVel_, basisPtrPrs_;
+  std::vector<basis_ptr> basisPtrs_;
+  // Cell cubature information
+  ROL::Ptr<Intrepid2::Cubature<DeviceType,Real,Real>> cellCub_, bdryCub_;
+  // Cell node information
+  scalar_view volCellNodes_;
+  std::vector<std::vector<scalar_view>> bdryCellNodes_;
+  std::vector<std::vector<std::vector<int>>> bdryCellLocIds_;
+  // Finite element definition
+  ROL::Ptr<fe_type> feVel_, fePrs_;
+  std::vector<std::vector<ROL::Ptr<fe_type>>> feVelBdry_, fePrsBdry_;
+  // Local degrees of freedom on boundary, for each side of the reference cell (first index).
+  std::vector<std::vector<int>> fvidx_, fpidx_;
+  // Coordinates of degrees freedom on boundary cells.
+  // Indexing:  [sideset number][local side id](cell number, value at dof)
+  std::vector<std::vector<scalar_view>> bdryCellVDofValues_;
+  // Field pattern, offsets, etc.
+  std::vector<std::vector<int>> fieldPattern_; // local Field/DOF pattern; set from DOF manager 
+  int numFields_;                              // number of fields (equations in the PDE)
+  int numDofs_;                                // total number of degrees of freedom for all (local) fields
+  std::vector<int> offset_;                    // for each field, a counting offset
+  std::vector<int> numFieldDofs_;              // for each field, number of degrees of freedom
+  
+  // Problem parameters.
+  Real Re_;
+  bool pinPressure_, dirType_;
+
+  ROL::Ptr<FieldHelper<Real,DeviceType>> fieldHelper_;
+
+  Real velocityDirichletFunc(const std::vector<Real> & coords, int sideset, int locSideId, int dir) const {
+    Real val(0);
+    if (dirType_ == 0) {
+      if ((sideset==3) && (dir==0)) {
+        val = static_cast<Real>(1);
+      }
+    }
+    else if (dirType_ == 1) {
+      Real one(1);
+      if ((sideset==1) && (dir==0)) {
+        val = coords[1]*(one-coords[1]);
+      }
+      else if ((sideset==2) && (dir==0)) {
+        val = coords[1]*(one-coords[1]);
+      }
+    }
+    return val;
+  }
+
+  void computeDirichlet(void) {
+    // Compute Dirichlet values at DOFs.
+    int d  = basisPtrVel_->getBaseCellTopology().getDimension();
+    int fv = basisPtrVel_->getCardinality();
+    int numSidesets = bdryCellLocIds_.size();
+    bdryCellVDofValues_.resize(numSidesets);
+    for (int i=0; i<numSidesets; ++i) {
+      int numLocSides = bdryCellLocIds_[i].size();
+      bdryCellVDofValues_[i].resize(numLocSides);
+      for (int j=0; j<numLocSides; ++j) {
+        int c = bdryCellLocIds_[i][j].size();
+	std::stringstream name;
+	name << "bdryCellDofValues" << i << j;
+        bdryCellVDofValues_[i][j] = scalar_view(name.str(), c, fv, d);
+        scalar_view Vcoords;
+        Vcoords = scalar_view("Vcoords", c, fv, d);
+        if (c > 0) {
+          feVel_->computeDofCoords(Vcoords, bdryCellNodes_[i][j]);
+        }
+        for (int k=0; k<c; ++k) {
+          for (int l=0; l<fv; ++l) {
+            std::vector<Real> dofpoint(d);
+            //std::cout << "Sideset " << i << " LocalSide " << j << "  Cell " << k << "  Field " << l << "  Coord ";
+            for (int m=0; m<d; ++m) {
+              dofpoint[m] = Vcoords(k, l, m);
+              //std::cout << dofpoint[m] << "  ";
+            }
+
+            for (int m=0; m<d; ++m) {
+              (bdryCellVDofValues_[i][j])(k, l, m) = velocityDirichletFunc(dofpoint, i, j, m);
+              //std::cout << "  " << m << "-Value " << DirichletFunc(dofpoint, i, j, m);
+            }
+            //std::cout << std::endl;
+          }
+        }
+      }
+    }
+  }
+
+public:
+  PDE_Stokes(ROL::ParameterList &parlist) {
+    // Finite element fields -- NOT DIMENSION INDEPENDENT!
+    basisPtrVel_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C2_FEM<DeviceType,Real,Real>>();
+    basisPtrPrs_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C1_FEM<DeviceType,Real,Real>>();
+    // Volume quadrature rules.
+    shards::CellTopology cellType = basisPtrVel_->getBaseCellTopology();     // get the cell type from any basis
+    Intrepid2::DefaultCubatureFactory cubFactory;                            // create cubature factory
+    int cubDegree = parlist.sublist("Problem").get("Cubature Degree", 4);    // set cubature degree, e.g., 4
+    cellCub_ = cubFactory.create<DeviceType,Real,Real>(cellType, cubDegree); // create default cubature
+    // Boundary quadrature rules.
+    int d = cellType.getDimension();
+    shards::CellTopology bdryCellType = cellType.getCellTopologyData(d-1, 0);
+    int bdryCubDegree = parlist.sublist("Problem").get("Boundary Cubature Degree",4); // set cubature degree, e.g., 4
+    bdryCub_ = cubFactory.create<DeviceType,Real,Real>(bdryCellType, bdryCubDegree);
+    // Fill finite element basis container
+    basisPtrs_.clear();
+    basisPtrs_.resize(d,basisPtrVel_);  // Velocity
+    basisPtrs_.push_back(basisPtrPrs_); // Pressure
+
+    // Reynold's Number
+    Re_ = parlist.sublist("Problem").get("Reynolds Number",100.0);
+
+    // Pin pressure
+    pinPressure_ = parlist.sublist("Problem").get("Pin Pressure",true);
+
+    // Dirichlet Type
+    dirType_ = parlist.sublist("Problem").get("Dirichlet Type",0);
+
+    numDofs_ = 0;
+    numFields_ = basisPtrs_.size();
+    offset_.resize(numFields_);
+    numFieldDofs_.resize(numFields_);
+    for (int i=0; i<numFields_; ++i) {
+      if (i==0)
+        offset_[i] = 0;
+      else
+        offset_[i] = offset_[i-1] + basisPtrs_[i-1]->getCardinality();
+      numFieldDofs_[i] = basisPtrs_[i]->getCardinality();
+      numDofs_ += numFieldDofs_[i];
+    }
+  }
+
+  void residual(scalar_view & res,
+                const scalar_view u_coeff,
+                const scalar_view z_coeff = scalar_view(),
+                const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    int c  = u_coeff.extent_int(0);
+    int p  = cellCub_->getNumPoints();
+    int fv = basisPtrVel_->getCardinality();
+    int fp = basisPtrPrs_->getCardinality();
+    int d  = cellCub_->getDimension();
+ 
+    // Initialize residuals.
+    std::vector<scalar_view> R(d+1);
+    for (int i = 0; i < d; ++i) {
+      std::stringstream name;
+      name << "res_vel" << i;
+      R[i] = scalar_view(name.str(), c, fv);
+    }
+    R[d] = scalar_view("res_pres", c, fp);
+
+    // Split u_coeff into components.
+    std::vector<scalar_view> U;
+    fieldHelper_->splitFieldCoeff(U, u_coeff);
+
+    // Evaluate/interpolate finite element fields on cells.
+    scalar_view valPres_eval("valPres_eval", c, p);
+    fePrs_->evaluateValue(valPres_eval, U[d]);
+    // Evaluate/interpolate gradient of finite element fields on cells.
+    std::vector<scalar_view> gradVel_vec(d);
+    for (int i = 0; i < d; ++i) {
+      std::stringstream name;
+      name << "gradVel_vec" << i;
+      gradVel_vec[i] = scalar_view(name.str(), c, p, d);
+      feVel_->evaluateGradient(gradVel_vec[i], U[i]);
+    }
+
+    // Assemble the velocity vector and its divergence.
+    scalar_view divVel_eval("divVel_eval", c, p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        divVel_eval(i,j) = static_cast<Real>(0);
+        for (int k = 0; k < d; ++k) {
+          divVel_eval(i,j) += (gradVel_vec[k])(i,j,k);
+        }
+      }
+    }
+
+    // Scale pressure and velocity gradient
+    const Real zero(0), one(1);
+    for (int i = 0; i < d; ++i) {
+      // Multiply velocity gradients with viscosity.
+      rst::scale(gradVel_vec[i], one/Re_);
+    }
+    // Negative pressure
+    rst::scale(valPres_eval, -one);
+
+    /**************************************************************************/
+    /*** EVALUATE WEAK FORM OF RESIDUAL ***************************************/
+    /**************************************************************************/
+    // Velocity equation.
+    for (int i = 0; i < d; ++i) {
+      fst::integrate(R[i], gradVel_vec[i], feVel_->gradNdetJ(), false);
+      fst::integrate(R[i], valPres_eval, feVel_->DNDdetJ(i), true);
+    }
+    // Pressure equation.
+    fst::integrate(R[d], divVel_eval, fePrs_->NdetJ(), false);
+    rst::scale(R[d], -one);
+
+    /**************************************************************************/
+    /*** APPLY BOUNDARY CONDITIONS ********************************************/
+    /**************************************************************************/
+    // --> No slip boundaries: i=0,1,3
+    // -->     Lid boundaries: i=2
+    // -->       Pressure pin: i=4
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        // Velocity Boundary Conditions
+        if (i!=4) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numVBdryDofs = fvidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numVBdryDofs; ++l) {
+                for (int m = 0; m < d; ++m) {
+                  R[m](cidx,fvidx_[j][l]) = U[m](cidx,fvidx_[j][l]) - (bdryCellVDofValues_[i][j])(k,fvidx_[j][l],m);
+                }
+              }
+            }
+          }
+        }
+        // Pressure pinning
+        if (i==4 && pinPressure_) {
+          Real val = (dirType_ == 1) ? one/Re_ : zero;
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int l = 0, cidx = bdryCellLocIds_[i][j][k];
+              R[d](cidx,fpidx_[j][l]) = U[d](cidx,fpidx_[j][l]) - val;
+            }
+          }
+        }
+      }
+    }
+
+    // Combine the residuals.
+    fieldHelper_->combineFieldCoeff(res, R);
+  }
+
+  void Jacobian_1(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    int c  = u_coeff.extent_int(0);
+    int p  = cellCub_->getNumPoints();
+    int fv = basisPtrVel_->getCardinality();
+    int fp = basisPtrPrs_->getCardinality();
+    int d  = cellCub_->getDimension();
+ 
+    // Initialize jacobians.
+    std::vector<std::vector<scalar_view>> J(d+1);
+    for (int i = 0; i < d+1; ++i) J[i].resize(d+1);
+    for (int i = 0; i < d; ++i) {
+      for (int j = 0; j < d; ++j) {
+        std::stringstream name;
+        name << "jac" << i << j;
+        J[i][j] = scalar_view(name.str(), c, fv, fv);
+      }
+      std::stringstream name1, name2;
+      name1 << "jac" << d << i;
+      name2 << "jac" << i << d;
+      J[d][i] = scalar_view(name1.str(), c, fp, fv);
+      J[i][d] = scalar_view(name2.str(), c, fv, fp);
+    }
+    std::stringstream name;
+    name << "jac" << d << d;
+    J[d][d] = scalar_view(name.str(), c, fp, fp);
+
+    // Multiply velocity gradients with viscosity.
+    const Real zero(0), one(1);
+    scalar_view nuGradPhi_eval("nuGradPhi_eval", c, fv, p, d);
+    Kokkos::deep_copy(nuGradPhi_eval,feVel_->gradN());
+    rst::scale(nuGradPhi_eval, one/Re_);
+    // Negative pressure basis.
+    scalar_view negPrsPhi("negPrsPhi", c, fp, p);
+    Kokkos::deep_copy(negPrsPhi,fePrs_->N());
+    rst::scale(negPrsPhi, -one);
+
+    /*** Evaluate weak form of the Jacobian. ***/
+    for (int i = 0; i < d; ++i) {
+      // Velocity components
+      fst::integrate(J[i][i], nuGradPhi_eval, feVel_->gradNdetJ(), true);
+      // Pressure components
+      fst::integrate(J[i][d], feVel_->DNDdetJ(i), negPrsPhi, false);
+      fst::integrate(J[d][i], fePrs_->NdetJ(), feVel_->DND(i), false);
+      rst::scale(J[d][i], -one);
+    }
+
+    // APPLY BOUNDARY CONDITIONS
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      // DIRICHLET CONDITIONS
+      for (int i = 0; i < numSideSets; ++i) {
+        // Velocity Boundary Conditions
+        if (i!=4) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numVBdryDofs = fvidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numVBdryDofs; ++l) {
+                for (int m = 0; m < fv; ++m) {
+                  for (int n = 0; n < d; ++n) {
+                    for (int q = 0; q < d; ++q) {
+                      (J[n][q])(cidx,fvidx_[j][l],m) = zero;
+                    }
+                    (J[n][n])(cidx,fvidx_[j][l],fvidx_[j][l]) = one;
+                  }
+                }
+                for (int m = 0; m < d; ++m) {
+                  for (int n = 0; n < fp; ++n) {
+                    (J[m][d])(cidx,fvidx_[j][l],n) = zero;
+                  }
+                }
+              }
+            }
+          }
+        }
+        // Pressure pinning
+        if (i==4 && pinPressure_) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int l = 0, cidx = bdryCellLocIds_[i][j][k];
+              for (int m = 0; m < fv; ++m) {
+                for (int n = 0; n < d; ++n) {
+                  (J[d][n])(cidx,fpidx_[j][l],m) = zero;
+                }
+              }
+              for (int m = 0; m < fp; ++m) {
+                (J[d][d])(cidx,fpidx_[j][l],m) = zero;
+              }
+              (J[d][d])(cidx,fpidx_[j][l],fpidx_[j][l]) = one;
+            }
+          }
+        }
+      }
+    }
+    // Combine the jacobians.
+    fieldHelper_->combineFieldCoeff(jac, J);
+  }
+
+
+  void Jacobian_2(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    int c  = u_coeff.extent_int(0);
+    int fv = basisPtrVel_->getCardinality();
+    int fp = basisPtrPrs_->getCardinality();
+    int d  = cellCub_->getDimension();
+ 
+    // Initialize jacobians.
+    std::vector<std::vector<scalar_view>> J(d+1);
+    for (int i = 0; i < d+1; ++i) J[i].resize(d+1);
+    for (int i = 0; i < d; ++i) {
+      for (int j = 0; j < d; ++j) {
+        std::stringstream name;
+	name << "jac" << i << j;
+        J[i][j] = scalar_view(name.str(), c, fv, fv);
+      }
+      std::stringstream name1, name2;
+      name1 << "jac" << d << i;
+      name2 << "jac" << i << d;
+      J[d][i] = scalar_view(name1.str(), c, fp, fv);
+      J[i][d] = scalar_view(name2.str(), c, fv, fp);
+    }
+    std::stringstream name;
+    name << "jac" << d << d;
+    J[d][d] = scalar_view(name.str(), c, fp, fp);
+
+    // Combine the jacobians.
+    fieldHelper_->combineFieldCoeff(jac, J);
+  }
+
+  void Hessian_11(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Stokes::Hessian_11): Hessian is zero.");
+  }
+
+  void Hessian_12(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Stokes::Hessian_12): Hessian is zero.");
+  }
+
+  void Hessian_21(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Stokes::Hessian_21): Hessian is zero.");
+  }
+
+  void Hessian_22(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Stokes::Hessian_22): Hessian is zero.");
+  }
+
+  void RieszMap_1(scalar_view & riesz) override {
+    // Retrieve dimensions.
+    int c  = feVel_->N().extent_int(0);
+    int fv = basisPtrVel_->getCardinality();
+    int fp = basisPtrPrs_->getCardinality();
+    int d  = cellCub_->getDimension();
+ 
+    // Initialize jacobians.
+    std::vector<std::vector<scalar_view>> J(d+1);
+    for (int i = 0; i < d+1; ++i) J[i].resize(d+1);
+    for (int i = 0; i < d; ++i) {
+      for (int j = 0; j < d; ++j) {
+        std::stringstream name;
+	name << "jac" << i << j;
+        J[i][j] = scalar_view(name.str(), c, fv, fv);
+      }
+      std::stringstream name1, name2;
+      name1 << "jac" << d << i;
+      name2 << "jac" << i << d;
+      J[d][i] = scalar_view(name1.str(), c, fp, fv);
+      J[i][d] = scalar_view(name2.str(), c, fv, fp);
+    }
+    std::stringstream name;
+    name << "jac" << d << d;
+    J[d][d] = scalar_view(name.str(), c, fp, fp);
+
+    for (int i = 0; i < d; ++i) {
+      Kokkos::deep_copy(J[i][i],feVel_->stiffMat());
+      rst::add(J[i][i],feVel_->massMat());
+    }
+    Kokkos::deep_copy(J[d][d],fePrs_->massMat());
+
+    // Combine the jacobians.
+    fieldHelper_->combineFieldCoeff(riesz, J);
+  }
+
+  void RieszMap_2(scalar_view & riesz) override {
+    // Retrieve dimensions.
+    int c  = feVel_->N().extent_int(0);
+    int fv = basisPtrVel_->getCardinality();
+    int fp = basisPtrPrs_->getCardinality();
+    int d  = cellCub_->getDimension();
+ 
+    // Initialize jacobians.
+    std::vector<std::vector<scalar_view>> J(d+1);
+    for (int i = 0; i < d+1; ++i) J[i].resize(d+1);
+    for (int i = 0; i < d; ++i) {
+      for (int j = 0; j < d; ++j) {
+        std::stringstream name;
+	name << "jac" << i << j;
+        J[i][j] = scalar_view(name.str(), c, fv, fv);
+      }
+      std::stringstream name1, name2;
+      name1 << "jac" << d << i;
+      name2 << "jac" << i << d;
+      J[d][i] = scalar_view(name1.str(), c, fp, fv);
+      J[i][d] = scalar_view(name2.str(), c, fv, fp);
+    }
+    std::stringstream name;
+    name << "jac" << d << d;
+    J[d][d] = scalar_view(name.str(), c, fp, fp);
+
+    for (int i = 0; i < d; ++i)
+      Kokkos::deep_copy(J[i][i],feVel_->massMat());
+    Kokkos::deep_copy(J[d][d],fePrs_->massMat());
+
+    // Combine the jacobians.
+    fieldHelper_->combineFieldCoeff(riesz, J);
+  }
+
+  std::vector<basis_ptr> getFields() override {
+    return basisPtrs_;
+  }
+
+  void setCellNodes(const scalar_view &volCellNodes,
+                    const std::vector<std::vector<scalar_view >> &bdryCellNodes,
+                    const std::vector<std::vector<std::vector<int>>> &bdryCellLocIds) override {
+    volCellNodes_ = volCellNodes;
+    bdryCellNodes_ = bdryCellNodes;
+    bdryCellLocIds_ = bdryCellLocIds;
+    // Finite element definition.
+    feVel_ = ROL::makePtr<fe_type>(volCellNodes_,basisPtrVel_,cellCub_);
+    fePrs_ = ROL::makePtr<fe_type>(volCellNodes_,basisPtrPrs_,cellCub_);
+    // Get boundary degrees of freedom.
+    fvidx_ = feVel_->getBoundaryDofs();
+    fpidx_ = fePrs_->getBoundaryDofs();
+    // Construct boundary FEs
+    const int numSideSets = bdryCellNodes.size();
+    feVelBdry_.resize(numSideSets);
+    fePrsBdry_.resize(numSideSets);
+    for (int i = 0; i < numSideSets; ++i) {
+      int numLocSides = bdryCellNodes[i].size();
+      feVelBdry_[i].resize(numLocSides);
+      fePrsBdry_[i].resize(numLocSides);
+      for (int j = 0; j < numLocSides; ++j) {
+        if (bdryCellNodes[i][j] != scalar_view()) {
+          feVelBdry_[i][j] = ROL::makePtr<fe_type>(bdryCellNodes[i][j],basisPtrVel_,bdryCub_,j);
+          fePrsBdry_[i][j] = ROL::makePtr<fe_type>(bdryCellNodes[i][j],basisPtrPrs_,bdryCub_,j);
+        }
+      }
+    }
+    computeDirichlet();
+  }
+
+  void setFieldPattern(const std::vector<std::vector<int>> & fieldPattern) override {
+    fieldPattern_ = fieldPattern;
+    fieldHelper_ = ROL::makePtr<FieldHelper<Real,DeviceType>>(numFields_, numDofs_, numFieldDofs_, fieldPattern_);
+  }
+
+  const ROL::Ptr<fe_type> getVelocityFE(void) const {
+    return feVel_;
+  }
+
+  const ROL::Ptr<fe_type> getPressureFE(void) const {
+    return fePrs_;
+  }
+
+  const std::vector<std::vector<ROL::Ptr<fe_type>>> getVelocityBdryFE(void) const {
+    return feVelBdry_;
+  }
+
+  const std::vector<std::vector<fe_type>> getPressureBdryFE(void) const {
+    return fePrsBdry_;
+  }
+
+  const std::vector<std::vector<std::vector<int>>> getBdryCellLocIds(void) const {
+    return bdryCellLocIds_;
+  }
+
+  const ROL::Ptr<FieldHelper<Real,DeviceType>> getFieldHelper(void) const {
+    return fieldHelper_;
+  }
+
+}; // PDE_Stokes
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/thermal-fluids/CMakeLists.txt b/packages/rol/example/PDE-OPT/thermal-fluids/CMakeLists.txt
index ace359100890..b187554ca779 100644
--- a/packages/rol/example/PDE-OPT/thermal-fluids/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/thermal-fluids/CMakeLists.txt
@@ -11,49 +11,49 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_01
     SOURCES example_01.cpp
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_02
     SOURCES example_02.cpp
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_03
     SOURCES example_03.cpp
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_04
     SOURCES example_04.cpp
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_05
     SOURCES example_05.cpp
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_06
     SOURCES example_06.cpp
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_07
     SOURCES example_07.cpp
     ADD_DIR_TO_NAME
   )
 
-#  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+#  ROL_ADD_EXECUTABLE_AND_TEST(
 #    example_02
 #    SOURCES example_02.cpp
 #    ARGS PrintItAll
@@ -63,7 +63,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
 #    ADD_DIR_TO_NAME
 #  )
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     ThermalFluidsDataCopy
     SOURCE_FILES
       input.xml input_ex03.xml input_ex04.xml input_ex05.xml input_ex06.xml input_ex07.xml tf-cube-3x3x3.txt tf-cube-15x15x15.txt plotresults.m plotresults_ex03.m plotresults_ex04.m process3D.sh
@@ -71,5 +71,68 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
   )
 
+ENDIF()
+
+IF(${PROJECT_NAME}_ENABLE_Intrepid2 AND
+   ${PROJECT_NAME}_ENABLE_Ifpack2   AND
+   ${PROJECT_NAME}_ENABLE_MueLu     AND
+   ${PROJECT_NAME}_ENABLE_Amesos2   AND
+   ${PROJECT_NAME}_ENABLE_Tpetra )
+
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/teuchos/src/sol)
+
+  # Need ROL_TpetraMultiVector.hpp
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
+
+  ROL_ADD_EXECUTABLE(
+    example_01_Kokkos
+    SOURCES example_01K.cpp
+    ADD_DIR_TO_NAME
+  )
+
+  ROL_ADD_EXECUTABLE(
+    example_02_Kokkos
+    SOURCES example_02K.cpp
+    ADD_DIR_TO_NAME
+  )
+
+  ROL_ADD_EXECUTABLE(
+    example_03_Kokkos
+    SOURCES example_03K.cpp
+    ADD_DIR_TO_NAME
+  )
+
+  ROL_ADD_EXECUTABLE(
+    example_04_Kokkos
+    SOURCES example_04K.cpp
+    ADD_DIR_TO_NAME
+  )
+
+  ROL_ADD_EXECUTABLE(
+    example_05_Kokkos
+    SOURCES example_05K.cpp
+    ADD_DIR_TO_NAME
+  )
+
+  ROL_ADD_EXECUTABLE(
+    example_06_Kokkos
+    SOURCES example_06K.cpp
+    ADD_DIR_TO_NAME
+  )
+
+  ROL_ADD_EXECUTABLE(
+    example_07_Kokkos
+    SOURCES example_07K.cpp
+    ADD_DIR_TO_NAME
+  )
+
+  ROL_COPY_FILES_TO_BINARY_DIR(
+    ThermalFluidsDataCopyK
+    SOURCE_FILES
+      input.xml input_ex03.xml input_ex04.xml input_ex05.xml input_ex06.xml input_ex07.xml tf-cube-3x3x3.txt tf-cube-15x15x15.txt plotresults.m plotresults_ex03.m plotresults_ex04.m process3D.sh
+    SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}"
+    DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
+  )
 
 ENDIF()
diff --git a/packages/rol/example/PDE-OPT/thermal-fluids/example_01.cpp b/packages/rol/example/PDE-OPT/thermal-fluids/example_01.cpp
index ecef3154e943..686edd23f3ff 100644
--- a/packages/rol/example/PDE-OPT/thermal-fluids/example_01.cpp
+++ b/packages/rol/example/PDE-OPT/thermal-fluids/example_01.cpp
@@ -13,8 +13,7 @@
 
 #include "Teuchos_Comm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -46,7 +45,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int> > comm
     = Tpetra::getDefaultComm();
   const int myRank = comm->getRank();
diff --git a/packages/rol/example/PDE-OPT/thermal-fluids/example_01K.cpp b/packages/rol/example/PDE-OPT/thermal-fluids/example_01K.cpp
new file mode 100644
index 000000000000..d7e238f1e5aa
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/thermal-fluids/example_01K.cpp
@@ -0,0 +1,200 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  example_01.cpp
+    \brief Shows how to solve the Navier-Stokes control problem.
+*/
+
+#include "Teuchos_Comm.hpp"
+#include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
+
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Version.hpp"
+
+#include <iostream>
+#include <algorithm>
+//#include <fenv.h>
+
+#include "ROL_TpetraMultiVector.hpp"
+#include "ROL_Reduced_Objective_SimOpt.hpp"
+#include "ROL_Solver.hpp"
+
+#include "../TOOLS/meshmanagerK.hpp"
+#include "../TOOLS/pdeconstraintK.hpp"
+#include "../TOOLS/pdeobjectiveK.hpp"
+#include "../TOOLS/pdevectorK.hpp"
+#include "pde_thermal-fluidsK.hpp"
+#include "obj_thermal-fluidsK.hpp"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+
+int main(int argc, char *argv[]) {
+  //feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
+
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+
+  /*** Initialize communicator. ***/
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  Kokkos::ScopeGuard kokkosScope (argc, argv);
+  auto comm = Tpetra::getDefaultComm();
+  const int myRank = comm->getRank();
+  if ((iprint > 0) && (myRank == 0))
+    outStream = ROL::makePtrFromRef(std::cout);
+  else
+    outStream = ROL::makePtrFromRef(bhs);
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+
+    /*** Read in XML input ***/
+    std::string filename = "input.xml";
+    auto parlist = ROL::getParametersFromXmlFile(filename);
+
+    /*** Initialize main data structure. ***/
+    auto meshMgr = ROL::makePtr<MeshManager_BackwardFacingStepChannel<RealT,DeviceT>>(*parlist);
+    // Initialize PDE describing Navier-Stokes equations.
+    auto pde = ROL::makePtr<PDE_ThermalFluids<RealT,DeviceT>>(*parlist);
+    auto con = ROL::makePtr<PDE_Constraint<RealT,DeviceT>>(pde,meshMgr,comm,*parlist,*outStream);
+    // Cast the constraint and get the assembler.
+    auto assembler = con->getAssembler();
+    con->setSolveParameters(*parlist);
+    con->outputTpetraData();
+
+    // Create state vector and set to zeroes
+    auto u_ptr  = assembler->createStateVector();     u_ptr->randomize();
+    auto p_ptr  = assembler->createStateVector();     p_ptr->randomize();
+    auto du_ptr = assembler->createStateVector();     du_ptr->randomize();
+    auto r_ptr  = assembler->createResidualVector();  r_ptr->randomize();
+    auto z_ptr  = assembler->createControlVector();   z_ptr->randomize();
+    auto dz_ptr = assembler->createControlVector();   dz_ptr->randomize();
+    auto up  = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(u_ptr,pde,assembler);
+    auto pp  = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(p_ptr,pde,assembler);
+    auto dup = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(du_ptr,pde,assembler);
+    auto rp  = ROL::makePtr<PDE_DualSimVector<RealT,DeviceT>>(r_ptr,pde,assembler);
+    auto zp  = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(z_ptr,pde,assembler);
+    auto dzp = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(dz_ptr,pde,assembler);
+    // Create ROL SimOpt vectors
+    ROL::Vector_SimOpt<RealT> x(up,zp);
+    ROL::Vector_SimOpt<RealT> d(dup,dzp);
+
+    // Initialize objective function.
+    std::vector<ROL::Ptr<QoI<RealT,DeviceT>>> qoi_vec(2,ROL::nullPtr);
+    qoi_vec[0] = ROL::makePtr<QoI_State_ThermalFluids<RealT,DeviceT>>(*parlist,
+                                                                 pde->getVelocityFE(),
+                                                                 pde->getPressureFE(),
+                                                                 pde->getThermalFE(),
+                                                                 pde->getFieldHelper());
+    qoi_vec[1] = ROL::makePtr<QoI_L2Penalty_ThermalFluids<RealT,DeviceT>>(pde->getVelocityFE(),
+                                                                     pde->getPressureFE(),
+                                                                     pde->getThermalFE(),
+                                                                     pde->getThermalBdryFE(),
+                                                                     pde->getBdryCellLocIds(),
+                                                                     pde->getFieldHelper());
+    auto std_obj = ROL::makePtr<StdObjective_ThermalFluids<RealT>>(*parlist);
+    auto obj = ROL::makePtr<PDE_Objective<RealT,DeviceT>>(qoi_vec,std_obj,assembler);
+    auto stateStore = ROL::makePtr<ROL::VectorController<RealT>>();
+    auto robj = ROL::makePtr<ROL::Reduced_Objective_SimOpt<RealT>>(obj, con, stateStore, up, zp, pp, true, false);
+
+    // Run derivative checks
+    bool checkDeriv = parlist->sublist("Problem").get("Check derivatives",false);
+    if ( checkDeriv ) {
+      *outStream << "Check Gradient of Full Objective Function" << std::endl;
+      obj->checkGradient(x,d,true,*outStream);
+      *outStream << std::endl << "Check Hessian of Full Objective Function" << std::endl;
+      obj->checkHessVec(x,d,true,*outStream);
+      *outStream << std::endl << "Check Jacobian of Constraint" << std::endl;
+      con->checkApplyJacobian(x,d,*up,true,*outStream);
+      *outStream << std::endl << "Check Jacobian_1 of Constraint" << std::endl;
+      con->checkApplyJacobian_1(*up,*zp,*dup,*rp,true,*outStream);
+      *outStream << std::endl << "Check Jacobian_2 of Constraint" << std::endl;
+      con->checkApplyJacobian_2(*up,*zp,*dzp,*rp,true,*outStream);
+      *outStream << std::endl << "Check Hessian of Constraint" << std::endl;
+      con->checkApplyAdjointHessian(x,*dup,d,x,true,*outStream);
+      *outStream << std::endl << "Check Hessian_11 of Constraint" << std::endl;
+      con->checkApplyAdjointHessian_11(*up,*zp,*pp,*dup,*rp,true,*outStream);
+      *outStream << std::endl << "Check Hessian_12 of Constraint" << std::endl;
+      con->checkApplyAdjointHessian_12(*up,*zp,*pp,*dup,*dzp,true,*outStream);
+      *outStream << std::endl << "Check Hessian_21 of Constraint" << std::endl;
+      con->checkApplyAdjointHessian_21(*up,*zp,*pp,*dzp,*rp,true,*outStream);
+      *outStream << std::endl << "Check Hessian_22 of Constraint" << std::endl;
+      con->checkApplyAdjointHessian_22(*up,*zp,*pp,*dzp,*dzp,true,*outStream);
+
+      *outStream << std::endl << "Check Adjoint Jacobian of Constraint" << std::endl;
+      con->checkAdjointConsistencyJacobian(*dup,d,x,true,*outStream);
+      *outStream << std::endl << "Check Adjoint Jacobian_1 of Constraint" << std::endl;
+      con->checkAdjointConsistencyJacobian_1(*pp,*dup,*up,*zp,true,*outStream);
+      *outStream << std::endl << "Check Adjoint Jacobian_2 of Constraint" << std::endl;
+      con->checkAdjointConsistencyJacobian_2(*pp,*dzp,*up,*zp,true,*outStream);
+
+      *outStream << std::endl << "Check Constraint Solve" << std::endl;
+      con->checkSolve(*up,*zp,*rp,true,*outStream);
+      *outStream << std::endl << "Check Inverse Jacobian_1 of Constraint" << std::endl;
+      con->checkInverseJacobian_1(*rp,*dup,*up,*zp,true,*outStream);
+      *outStream << std::endl << "Check Inverse Adjoint Jacobian_1 of Constraint" << std::endl;
+      con->checkInverseAdjointJacobian_1(*rp,*pp,*up,*zp,true,*outStream);
+
+      *outStream << std::endl << "Check Gradient of Reduced Objective Function" << std::endl;
+      robj->checkGradient(*zp,*dzp,true,*outStream);
+      *outStream << std::endl << "Check Hessian of Reduced Objective Function" << std::endl;
+      robj->checkHessVec(*zp,*dzp,true,*outStream);
+    }
+    up->zero();
+    zp->zero();
+
+    RealT tol(1.e-8);
+    con->solve(*rp,*up,*zp,tol);
+    con->outputTpetraVector(u_ptr,"state_uncontrolled.txt");
+
+    ROL::Ptr<ROL::Problem<RealT>> problem;
+    bool useFullSpace = parlist->sublist("Problem").get("Full space",false);
+    if ( useFullSpace ) {
+      problem = ROL::makePtr<ROL::Problem<RealT>>(obj, ROL::makePtrFromRef(x));
+      problem->addConstraint("PDE", con, rp);
+    }
+    else {
+      problem = ROL::makePtr<ROL::Problem<RealT>>(robj, zp);
+    }
+    problem->finalize(false,true,*outStream);
+    ROL::Solver<RealT> solver(problem, *parlist);
+    solver.solve(*outStream);
+
+    // Output.
+    assembler->printMeshData(*outStream);
+    Teuchos::Array<RealT> res(1,0);
+    con->solve(*rp,*up,*zp,tol);
+    con->outputTpetraVector(u_ptr,"state.txt");
+    con->outputTpetraVector(z_ptr,"control.txt");
+    con->value(*rp,*up,*zp,tol);
+    r_ptr->norm2(res.view(0,1));
+    *outStream << "Residual Norm: " << res[0] << std::endl;
+    errorFlag += (res[0] > 1.e-6 ? 1 : 0);
+    //con->outputTpetraData();
+
+    auto obj0 = ROL::makePtr<IntegralObjective<RealT,DeviceT>>(qoi_vec[0],assembler);
+    RealT val = obj0->value(*up,*zp,tol);
+    *outStream << "Vorticity Value: " << val << std::endl;
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/thermal-fluids/example_02.cpp b/packages/rol/example/PDE-OPT/thermal-fluids/example_02.cpp
index b03efba9d55d..7579d4975b59 100644
--- a/packages/rol/example/PDE-OPT/thermal-fluids/example_02.cpp
+++ b/packages/rol/example/PDE-OPT/thermal-fluids/example_02.cpp
@@ -13,8 +13,7 @@
 
 #include "Teuchos_Comm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -110,7 +109,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int> > comm
     = Tpetra::getDefaultComm();
   ROL::Ptr<const Teuchos::Comm<int> > serial_comm
diff --git a/packages/rol/example/PDE-OPT/thermal-fluids/example_02K.cpp b/packages/rol/example/PDE-OPT/thermal-fluids/example_02K.cpp
new file mode 100644
index 000000000000..b3fb5cf457e9
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/thermal-fluids/example_02K.cpp
@@ -0,0 +1,339 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  example_01.cpp
+    \brief Shows how to solve the Navier-Stokes control problem.
+*/
+
+#include "Teuchos_Comm.hpp"
+#include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
+
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Version.hpp"
+
+#include <iostream>
+#include <algorithm>
+//#include <fenv.h>
+
+#include "ROL_TpetraMultiVector.hpp"
+#include "ROL_Reduced_Objective_SimOpt.hpp"
+#include "ROL_MonteCarloGenerator.hpp"
+#include "ROL_StochasticProblem.hpp"
+#include "ROL_Solver.hpp"
+#include "ROL_DistributionFactory.hpp"
+#include "ROL_TpetraTeuchosBatchManager.hpp"
+
+#include "../TOOLS/meshmanagerK.hpp"
+#include "../TOOLS/pdeconstraintK.hpp"
+#include "../TOOLS/pdeobjectiveK.hpp"
+#include "../TOOLS/pdevectorK.hpp"
+#include "pde_thermal-fluidsK.hpp"
+#include "obj_thermal-fluidsK.hpp"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+
+template<class Real>
+void print(ROL::Objective<Real> &obj,
+           const ROL::Vector<Real> &z,
+           ROL::SampleGenerator<Real> &sampler,
+           const int ngsamp,
+           const ROL::Ptr<const Teuchos::Comm<int> > &comm,
+           const std::string &filename) {
+  Real tol(1e-8);
+  // Build objective function distribution
+  int nsamp = sampler.numMySamples();
+  std::vector<Real> myvalues(nsamp), myzerovec(nsamp, 0);
+  std::vector<double> gvalues(ngsamp), gzerovec(ngsamp, 0);
+  std::vector<Real> sample = sampler.getMyPoint(0);
+  int sdim = sample.size();
+  std::vector<std::vector<Real>> mysamples(sdim, myzerovec);
+  std::vector<std::vector<double>> gsamples(sdim, gzerovec);
+  for (int i = 0; i < nsamp; ++i) {
+    sample = sampler.getMyPoint(i);
+    obj.setParameter(sample);
+    myvalues[i] = static_cast<double>(obj.value(z,tol));
+    for (int j = 0; j < sdim; ++j)
+      mysamples[j][i] = static_cast<double>(sample[j]);
+  }
+
+  // Send data to root processor
+#ifdef HAVE_MPI
+  auto mpicomm = ROL::dynamicPtrCast<const Teuchos::MpiComm<int>>(comm);
+  int nproc = Teuchos::size<int>(*mpicomm);
+  std::vector<int> sampleCounts(nproc, 0), sampleDispls(nproc, 0);
+  MPI_Gather(&nsamp,1,MPI_INT,&sampleCounts[0],1,MPI_INT,0,*(mpicomm->getRawMpiComm())());
+  for (int i = 1; i < nproc; ++i)
+    sampleDispls[i] = sampleDispls[i-1] + sampleCounts[i-1];
+  MPI_Gatherv(&myvalues[0],nsamp,MPI_DOUBLE,&gvalues[0],&sampleCounts[0],&sampleDispls[0],MPI_DOUBLE,0,*(mpicomm->getRawMpiComm())());
+  for (int j = 0; j < sdim; ++j)
+    MPI_Gatherv(&mysamples[j][0],nsamp,MPI_DOUBLE,&gsamples[j][0],&sampleCounts[0],&sampleDispls[0],MPI_DOUBLE,0,*(mpicomm->getRawMpiComm())());
+#else
+  gvalues.assign(myvalues.begin(),myvalues.end());
+  for (int j = 0; j < sdim; ++j)
+    gsamples[j].assign(mysamples[j].begin(),mysamples[j].end());
+#endif
+
+  // Print
+  int rank  = Teuchos::rank<int>(*comm);
+  if ( rank==0 ) {
+    std::ofstream file;
+    file.open(filename);
+    file << std::scientific << std::setprecision(15);
+    for (int i = 0; i < ngsamp; ++i) {
+      for (int j = 0; j < sdim; ++j)
+        file << std::setw(25) << std::left << gsamples[j][i];
+      file << std::setw(25) << std::left << gvalues[i] << std::endl;
+    }
+    file.close();
+  }
+}
+
+int main(int argc, char *argv[]) {
+  //feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
+
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+
+  /*** Initialize communicator. ***/
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  Kokkos::ScopeGuard kokkosScope (argc, argv);
+  auto comm = Tpetra::getDefaultComm();
+  auto serial_comm = ROL::makePtr<Teuchos::SerialComm<int>>();
+  const int myRank = comm->getRank();
+  if ((iprint > 0) && (myRank == 0))
+    outStream = ROL::makePtrFromRef(std::cout);
+  else
+    outStream = ROL::makePtrFromRef(bhs);
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+
+    /*** Read in XML input ***/
+    std::string filename = "input.xml";
+    auto parlist = ROL::getParametersFromXmlFile(filename);
+
+    /*** Initialize main data structure. ***/
+    auto meshMgr = ROL::makePtr<MeshManager_BackwardFacingStepChannel<RealT,DeviceT>>(*parlist);
+    // Initialize PDE describing Navier-Stokes equations.
+    auto pde = ROL::makePtr<PDE_ThermalFluids<RealT,DeviceT>>(*parlist);
+    auto con = ROL::makePtr<PDE_Constraint<RealT,DeviceT>>(pde,meshMgr,serial_comm,*parlist,*outStream);
+    // Cast the constraint and get the assembler.
+    auto assembler = con->getAssembler();
+    con->setSolveParameters(*parlist);
+    con->outputTpetraData();
+
+    // Create state vector and set to zeroes
+    auto u_ptr  = assembler->createStateVector();     u_ptr->randomize();
+    auto p_ptr  = assembler->createStateVector();     p_ptr->randomize();
+    auto du_ptr = assembler->createStateVector();     du_ptr->randomize();
+    auto yu_ptr = assembler->createStateVector();     yu_ptr->randomize();
+    auto yp_ptr = assembler->createStateVector();     yp_ptr->randomize();
+    auto r_ptr  = assembler->createResidualVector();  r_ptr->randomize();
+    auto z_ptr  = assembler->createControlVector();   z_ptr->randomize();
+    auto dz_ptr = assembler->createControlVector();   dz_ptr->randomize();
+    auto up  = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(u_ptr,pde,assembler);
+    auto pp  = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(p_ptr,pde,assembler);
+    auto dup = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(du_ptr,pde,assembler);
+    auto yup = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(yu_ptr,pde,assembler);
+    auto ypp = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(yp_ptr,pde,assembler);
+    auto rp  = ROL::makePtr<PDE_DualSimVector<RealT,DeviceT>>(r_ptr,pde,assembler);
+    auto zp  = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(z_ptr,pde,assembler);
+    auto dzp = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(dz_ptr,pde,assembler);
+    // Create ROL SimOpt vectors
+    ROL::Vector_SimOpt<RealT> x(up,zp);
+    ROL::Vector_SimOpt<RealT> d(dup,dzp);
+
+    // Initialize objective function.
+    std::vector<ROL::Ptr<QoI<RealT,DeviceT>>> qoi_vec(2,ROL::nullPtr);
+    qoi_vec[0] = ROL::makePtr<QoI_State_ThermalFluids<RealT,DeviceT>>(*parlist,
+                                                                 pde->getVelocityFE(),
+                                                                 pde->getPressureFE(),
+                                                                 pde->getThermalFE(),
+                                                                 pde->getFieldHelper());
+    qoi_vec[1] = ROL::makePtr<QoI_L2Penalty_ThermalFluids<RealT,DeviceT>>(pde->getVelocityFE(),
+                                                                     pde->getPressureFE(),
+                                                                     pde->getThermalFE(),
+                                                                     pde->getThermalBdryFE(),
+                                                                     pde->getBdryCellLocIds(),
+                                                                     pde->getFieldHelper());
+    auto std_obj = ROL::makePtr<StdObjective_ThermalFluids<RealT>>(*parlist);
+    auto obj = ROL::makePtr<PDE_Objective<RealT,DeviceT>>(qoi_vec,std_obj,assembler);
+    auto stateStore = ROL::makePtr<ROL::VectorController<RealT>>();
+    auto robj = ROL::makePtr<ROL::Reduced_Objective_SimOpt<RealT>>(obj, con, stateStore, up, zp, pp, true, false);
+
+    /*************************************************************************/
+    /***************** BUILD SAMPLER *****************************************/
+    /*************************************************************************/
+    int stochDim = 6;
+    int nsamp = parlist->sublist("Problem").get("Number of samples",100);
+    std::vector<ROL::Ptr<ROL::Distribution<RealT>>> distVec(stochDim);
+    // Reynolds Number
+    Teuchos::ParameterList ReList;
+    ReList.sublist("Distribution").set("Name","Uniform");
+    ReList.sublist("Distribution").sublist("Uniform").set("Lower Bound",0.0);
+    ReList.sublist("Distribution").sublist("Uniform").set("Upper Bound",1.0);
+    distVec[0] = ROL::DistributionFactory<RealT>(ReList);
+    // Prandtl Number
+    Teuchos::ParameterList PrList;
+    PrList.sublist("Distribution").set("Name","Uniform");
+    PrList.sublist("Distribution").sublist("Uniform").set("Lower Bound",0.0);
+    PrList.sublist("Distribution").sublist("Uniform").set("Upper Bound",1.0);
+    distVec[1] = ROL::DistributionFactory<RealT>(PrList);
+    // Grashof Number
+    Teuchos::ParameterList GrList;
+    GrList.sublist("Distribution").set("Name","Beta");
+    GrList.sublist("Distribution").sublist("Beta").set("Shape 1",5.0);
+    GrList.sublist("Distribution").sublist("Beta").set("Shape 2",2.0);
+    distVec[2] = ROL::DistributionFactory<RealT>(GrList);
+    // Thickness Number
+    Teuchos::ParameterList ThList;
+    ThList.sublist("Distribution").set("Name","Beta");
+    ThList.sublist("Distribution").sublist("Beta").set("Shape 1",5.0);
+    ThList.sublist("Distribution").sublist("Beta").set("Shape 2",3.0);
+    distVec[3] = ROL::DistributionFactory<RealT>(ThList);
+    // Inflow/Outflow
+    Teuchos::ParameterList BdList;
+    BdList.sublist("Distribution").set("Name","Uniform");
+    BdList.sublist("Distribution").sublist("Uniform").set("Lower Bound",-1.0);
+    BdList.sublist("Distribution").sublist("Uniform").set("Upper Bound", 1.0);
+    distVec[4] = ROL::DistributionFactory<RealT>(BdList);
+    distVec[5] = ROL::DistributionFactory<RealT>(BdList);
+    // Sampler
+    auto bman = ROL::makePtr<ROL::TpetraTeuchosBatchManager<RealT>>(comm);
+    auto sampler = ROL::makePtr<ROL::MonteCarloGenerator<RealT>>(nsamp,distVec,bman);
+
+    /*************************************************************************/
+    /***************** BUILD STOCHASTIC PROBLEM ******************************/
+    /*************************************************************************/
+    auto problem = ROL::makePtr<ROL::StochasticProblem<RealT>>(robj, zp);
+    parlist->sublist("SOL").sublist("Objective").sublist("Risk Measure").set("Name","CVaR");
+    parlist->sublist("SOL").set("Initial Statistic", 1.0);
+    problem->makeObjectiveStochastic(*parlist, sampler);
+
+    // Run derivative checks
+    bool checkDeriv = parlist->sublist("Problem").get("Check derivatives",false);
+    if ( checkDeriv ) {
+      *outStream << "Check Gradient of Full Objective Function" << std::endl;
+      obj->checkGradient(x,d,true,*outStream);
+      *outStream << std::endl << "Check Hessian of Full Objective Function" << std::endl;
+      obj->checkHessVec(x,d,true,*outStream);
+      *outStream << std::endl << "Check Jacobian of Constraint" << std::endl;
+      con->checkApplyJacobian(x,d,*up,true,*outStream);
+      *outStream << std::endl << "Check Jacobian_1 of Constraint" << std::endl;
+      con->checkApplyJacobian_1(*up,*zp,*dup,*rp,true,*outStream);
+      *outStream << std::endl << "Check Jacobian_2 of Constraint" << std::endl;
+      con->checkApplyJacobian_2(*up,*zp,*dzp,*rp,true,*outStream);
+      *outStream << std::endl << "Check Hessian of Constraint" << std::endl;
+      con->checkApplyAdjointHessian(x,*dup,d,x,true,*outStream);
+      *outStream << std::endl << "Check Hessian_11 of Constraint" << std::endl;
+      con->checkApplyAdjointHessian_11(*up,*zp,*pp,*dup,*rp,true,*outStream);
+      *outStream << std::endl << "Check Hessian_12 of Constraint" << std::endl;
+      con->checkApplyAdjointHessian_12(*up,*zp,*pp,*dup,*dzp,true,*outStream);
+      *outStream << std::endl << "Check Hessian_21 of Constraint" << std::endl;
+      con->checkApplyAdjointHessian_21(*up,*zp,*pp,*dzp,*rp,true,*outStream);
+      *outStream << std::endl << "Check Hessian_22 of Constraint" << std::endl;
+      con->checkApplyAdjointHessian_22(*up,*zp,*pp,*dzp,*dzp,true,*outStream);
+
+      *outStream << std::endl << "Check Adjoint Jacobian of Constraint" << std::endl;
+      con->checkAdjointConsistencyJacobian(*dup,d,x,true,*outStream);
+      *outStream << std::endl << "Check Adjoint Jacobian_1 of Constraint" << std::endl;
+      con->checkAdjointConsistencyJacobian_1(*pp,*dup,*up,*zp,true,*outStream);
+      *outStream << std::endl << "Check Adjoint Jacobian_2 of Constraint" << std::endl;
+      con->checkAdjointConsistencyJacobian_2(*pp,*dzp,*up,*zp,true,*outStream);
+
+      *outStream << std::endl << "Check Constraint Solve" << std::endl;
+      con->checkSolve(*up,*zp,*rp,true,*outStream);
+      *outStream << std::endl << "Check Inverse Jacobian_1 of Constraint" << std::endl;
+      con->checkInverseJacobian_1(*rp,*dup,*up,*zp,true,*outStream);
+      *outStream << std::endl << "Check Inverse Adjoint Jacobian_1 of Constraint" << std::endl;
+      con->checkInverseAdjointJacobian_1(*rp,*pp,*up,*zp,true,*outStream);
+
+      *outStream << std::endl << "Check Gradient of Reduced Objective Function" << std::endl;
+      robj->checkGradient(*zp,*dzp,true,*outStream);
+      *outStream << std::endl << "Check Hessian of Reduced Objective Function" << std::endl;
+      robj->checkHessVec(*zp,*dzp,true,*outStream);
+    }
+
+    up->zero(); zp->zero();
+    parlist->sublist("Step").set("Type","Trust Region");
+    ROL::Solver<RealT> solver(problem,*parlist);
+    std::clock_t timer = std::clock();
+    solver.solve(*outStream);
+    *outStream << "Optimization time: "
+               << static_cast<RealT>(std::clock()-timer)/static_cast<RealT>(CLOCKS_PER_SEC)
+               << " seconds." << std::endl << std::endl;
+    
+    /*************************************************************************/
+    /***************** OUTPUT RESULTS ****************************************/
+    /*************************************************************************/
+    std::clock_t timer_print = std::clock();
+    assembler->printMeshData(*outStream);
+    // Output control to file
+    con->outputTpetraVector(z_ptr,"control.txt");
+    // Output expected state and samples to file
+    *outStream << std::endl << "Print Expected Value of State" << std::endl;
+    up->zero(); pp->zero(); dup->zero(); dzp->zero(); yup->zero();
+    RealT tol(1.e-8);
+    ROL::Ptr<ROL::BatchManager<RealT> > bman_Eu
+      = ROL::makePtr<ROL::TpetraTeuchosBatchManager<RealT>>(comm);
+    std::vector<RealT> sample(stochDim);
+    std::stringstream name_samp;
+    name_samp << "samples_" << bman->batchID() << ".txt";
+    std::ofstream file_samp;
+    file_samp.open(name_samp.str());
+    file_samp << std::scientific << std::setprecision(15);
+    for (int i = 0; i < sampler->numMySamples(); ++i) {
+      *outStream << "Sample i = " << i << std::endl;
+      sample = sampler->getMyPoint(i);
+      con->setParameter(sample);
+      con->solve(*rp,*dup,*zp,tol);
+      up->axpy(sampler->getMyWeight(i),*dup);
+      con->solve(*rp,*dup,*dzp,tol);
+      yup->axpy(sampler->getMyWeight(i),*dup);
+      for (int j = 0; j < stochDim; ++j)
+        file_samp << std::setw(25) << std::left << sample[j];
+      file_samp << std::endl;
+    }
+    file_samp.close();
+    bman_Eu->sumAll(*up,*pp);
+    bman_Eu->sumAll(*yup,*ypp);
+    con->outputTpetraVector(p_ptr,"mean_state.txt");
+    con->outputTpetraVector(yp_ptr,"mean_uncontrolled_state.txt");
+    // Build full objective function distribution
+    *outStream << std::endl << "Print Objective CDF" << std::endl;
+    int nsamp_dist = parlist->sublist("Problem").get("Number of output samples",100);
+    auto sampler_dist = ROL::makePtr<ROL::MonteCarloGenerator<RealT>>(nsamp_dist,distVec,bman);
+    print<RealT>(*robj,*zp,*sampler_dist,nsamp_dist,comm,"obj_samples.txt");
+    // Build vorticity objective function distribution
+    auto obj0 = ROL::makePtr<IntegralObjective<RealT,DeviceT>>(qoi_vec[0],assembler);
+    auto stateStore0 = ROL::makePtr<ROL::VectorController<RealT>>();
+    auto robj0 = ROL::makePtr<ROL::Reduced_Objective_SimOpt<RealT>>(obj0, con, stateStore0, up, zp, pp, true, false);
+    print<RealT>(*robj0,*zp,*sampler_dist,nsamp_dist,comm,"vort_samples.txt");
+
+    *outStream << "Output time: "
+               << static_cast<RealT>(std::clock()-timer_print)/static_cast<RealT>(CLOCKS_PER_SEC)
+               << " seconds." << std::endl << std::endl;
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/thermal-fluids/example_03.cpp b/packages/rol/example/PDE-OPT/thermal-fluids/example_03.cpp
index dd0e8d4405b6..1751d97dbe4f 100644
--- a/packages/rol/example/PDE-OPT/thermal-fluids/example_03.cpp
+++ b/packages/rol/example/PDE-OPT/thermal-fluids/example_03.cpp
@@ -13,8 +13,7 @@
 
 #include "Teuchos_Comm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -45,7 +44,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int> > comm
     = Tpetra::getDefaultComm();
   const int myRank = comm->getRank();
diff --git a/packages/rol/example/PDE-OPT/thermal-fluids/example_03K.cpp b/packages/rol/example/PDE-OPT/thermal-fluids/example_03K.cpp
new file mode 100644
index 000000000000..0e71cfdba907
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/thermal-fluids/example_03K.cpp
@@ -0,0 +1,211 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  example_01.cpp
+    \brief Shows how to solve the Navier-Stokes control problem.
+*/
+
+#include "Teuchos_Comm.hpp"
+#include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
+
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Version.hpp"
+
+#include <iostream>
+#include <algorithm>
+//#include <fenv.h>
+
+#include "ROL_TpetraMultiVector.hpp"
+#include "ROL_Reduced_Objective_SimOpt.hpp"
+#include "ROL_Solver.hpp"
+
+#include "../TOOLS/pdeconstraintK.hpp"
+#include "../TOOLS/pdeobjectiveK.hpp"
+#include "../TOOLS/pdevectorK.hpp"
+#include "pde_thermal-fluids_ex03K.hpp"
+#include "obj_thermal-fluids_ex03K.hpp"
+#include "mesh_thermal-fluidsK.hpp"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+
+int main(int argc, char *argv[]) {
+  //feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
+
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+
+  /*** Initialize communicator. ***/
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  Kokkos::ScopeGuard kokkosScope (argc, argv);
+  auto comm = Tpetra::getDefaultComm();
+  const int myRank = comm->getRank();
+  if ((iprint > 0) && (myRank == 0))
+    outStream = ROL::makePtrFromRef(std::cout);
+  else
+    outStream = ROL::makePtrFromRef(bhs);
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+
+    /*** Read in XML input ***/
+    std::string filename = "input_ex03.xml";
+    auto parlist = ROL::getParametersFromXmlFile(filename);
+
+    /*** Initialize main data structure. ***/
+    auto meshMgr = ROL::makePtr<MeshManager_ThermalFluids<RealT,DeviceT>>(*parlist);
+    // Initialize PDE describing Navier-Stokes equations.
+    auto pde = ROL::makePtr<PDE_ThermalFluids_ex03<RealT,DeviceT>>(*parlist);
+    auto con = ROL::makePtr<PDE_Constraint<RealT,DeviceT>>(pde,meshMgr,comm,*parlist,*outStream);
+    // Cast the constraint and get the assembler.
+    auto assembler = con->getAssembler();
+    con->setSolveParameters(*parlist);
+    con->outputTpetraData();
+
+    // Create state vector and set to zeroes
+    auto u_ptr  = assembler->createStateVector();     u_ptr->randomize();
+    auto p_ptr  = assembler->createStateVector();     p_ptr->randomize();
+    auto du_ptr = assembler->createStateVector();     du_ptr->randomize();
+    auto r_ptr  = assembler->createResidualVector();  r_ptr->randomize();
+    auto z_ptr  = assembler->createControlVector();   z_ptr->randomize();
+    auto dz_ptr = assembler->createControlVector();   dz_ptr->randomize();
+    auto up  = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(u_ptr,pde,assembler);
+    auto pp  = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(p_ptr,pde,assembler);
+    auto dup = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(du_ptr,pde,assembler);
+    auto rp  = ROL::makePtr<PDE_DualSimVector<RealT,DeviceT>>(r_ptr,pde,assembler);
+    auto zp  = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(z_ptr,pde,assembler);
+    auto dzp = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(dz_ptr,pde,assembler);
+    // Create ROL SimOpt vectors
+    ROL::Vector_SimOpt<RealT> x(up,zp);
+    ROL::Vector_SimOpt<RealT> d(dup,dzp);
+
+    // Initialize objective function.
+    std::vector<ROL::Ptr<QoI<RealT,DeviceT>>> qoi_vec(2,ROL::nullPtr);
+    qoi_vec[0] = ROL::makePtr<QoI_State_ThermalFluids<RealT,DeviceT>>(*parlist,
+                                                                 pde->getVelocityFE(),
+                                                                 pde->getPressureFE(),
+                                                                 pde->getThermalFE(),
+                                                                 pde->getFieldHelper());
+    qoi_vec[1] = ROL::makePtr<QoI_L2Penalty_ThermalFluids<RealT,DeviceT>>(pde->getVelocityFE(),
+                                                                     pde->getPressureFE(),
+                                                                     pde->getThermalFE(),
+                                                                     pde->getThermalBdryFE(),
+                                                                     pde->getBdryCellLocIds(),
+                                                                     pde->getFieldHelper());
+    auto std_obj = ROL::makePtr<StdObjective_ThermalFluids<RealT>>(*parlist);
+    auto obj = ROL::makePtr<PDE_Objective<RealT,DeviceT>>(qoi_vec,std_obj,assembler);
+    auto stateStore = ROL::makePtr<ROL::VectorController<RealT>>();
+    auto robj = ROL::makePtr<ROL::Reduced_Objective_SimOpt<RealT>>(obj, con, stateStore, up, zp, pp, true, false);
+
+    //up->zero();
+    //zp->zero();
+    //z_ptr->putScalar(1.e0);
+    //dz_ptr->putScalar(0);
+
+    // Run derivative checks
+    bool checkDeriv = parlist->sublist("Problem").get("Check derivatives",false);
+    if ( checkDeriv ) {
+      *outStream << "Check Gradient of Full Objective Function" << std::endl;
+      obj->checkGradient(x,d,true,*outStream);
+      *outStream << std::endl << "Check Hessian of Full Objective Function" << std::endl;
+      obj->checkHessVec(x,d,true,*outStream);
+      *outStream << std::endl << "Check Jacobian of Constraint" << std::endl;
+      con->checkApplyJacobian(x,d,*up,true,*outStream);
+      *outStream << std::endl << "Check Jacobian_1 of Constraint" << std::endl;
+      con->checkApplyJacobian_1(*up,*zp,*dup,*rp,true,*outStream);
+      *outStream << std::endl << "Check Jacobian_2 of Constraint" << std::endl;
+      con->checkApplyJacobian_2(*up,*zp,*dzp,*rp,true,*outStream);
+      *outStream << std::endl << "Check Hessian of Constraint" << std::endl;
+      con->checkApplyAdjointHessian(x,*dup,d,x,true,*outStream);
+      *outStream << std::endl << "Check Hessian_11 of Constraint" << std::endl;
+      con->checkApplyAdjointHessian_11(*up,*zp,*pp,*dup,*rp,true,*outStream);
+      *outStream << std::endl << "Check Hessian_12 of Constraint" << std::endl;
+      con->checkApplyAdjointHessian_12(*up,*zp,*pp,*dup,*dzp,true,*outStream);
+      *outStream << std::endl << "Check Hessian_21 of Constraint" << std::endl;
+      con->checkApplyAdjointHessian_21(*up,*zp,*pp,*dzp,*rp,true,*outStream);
+      *outStream << std::endl << "Check Hessian_22 of Constraint" << std::endl;
+      con->checkApplyAdjointHessian_22(*up,*zp,*pp,*dzp,*dzp,true,*outStream);
+
+      *outStream << std::endl << "Check Adjoint Jacobian of Constraint" << std::endl;
+      con->checkAdjointConsistencyJacobian(*dup,d,x,true,*outStream);
+      *outStream << std::endl << "Check Adjoint Jacobian_1 of Constraint" << std::endl;
+      con->checkAdjointConsistencyJacobian_1(*pp,*dup,*up,*zp,true,*outStream);
+      *outStream << std::endl << "Check Adjoint Jacobian_2 of Constraint" << std::endl;
+      con->checkAdjointConsistencyJacobian_2(*pp,*dzp,*up,*zp,true,*outStream);
+
+      *outStream << std::endl << "Check Constraint Solve" << std::endl;
+      con->checkSolve(*up,*zp,*rp,true,*outStream);
+      *outStream << std::endl << "Check Inverse Jacobian_1 of Constraint" << std::endl;
+      con->checkInverseJacobian_1(*rp,*dup,*up,*zp,true,*outStream);
+      *outStream << std::endl << "Check Inverse Adjoint Jacobian_1 of Constraint" << std::endl;
+      con->checkInverseAdjointJacobian_1(*rp,*pp,*up,*zp,true,*outStream);
+
+      *outStream << std::endl << "Check Gradient of Reduced Objective Function" << std::endl;
+      robj->checkGradient(*zp,*dzp,true,*outStream);
+      *outStream << std::endl << "Check Hessian of Reduced Objective Function" << std::endl;
+      robj->checkHessVec(*zp,*dzp,true,*outStream);
+    }
+    up->zero();
+    zp->zero();
+
+    RealT tol(1.e-8);
+    bool initSolve = parlist->sublist("Problem").get("Solve state for full space",true);
+    if (initSolve) {
+      con->solve(*rp,*up,*zp,tol);
+      con->outputTpetraVector(u_ptr,"state_uncontrolled.txt");
+    }
+
+    bool useFullSpace = parlist->sublist("Problem").get("Full space",false);
+    ROL::Ptr<ROL::Problem<RealT>> problem;
+    if ( useFullSpace ) {
+      problem = ROL::makePtr<ROL::Problem<RealT>>(obj, ROL::makePtrFromRef(x));
+      problem->addConstraint("PDE", con, rp);
+    }
+    else {
+      problem = ROL::makePtr<ROL::Problem<RealT>>(robj, zp);
+    }
+    problem->finalize(false,true,*outStream);
+    ROL::Solver<RealT> solver(problem, *parlist);
+    solver.solve(*outStream);
+
+    // Output.
+    assembler->printMeshData(*outStream);
+    Teuchos::Array<RealT> res(1,0);
+    //con->solve(*rp,*up,*zp,tol);
+    con->outputTpetraVector(u_ptr,"state.txt");
+    con->outputTpetraVector(z_ptr,"control.txt");
+    con->value(*rp,*up,*zp,tol);
+    r_ptr->norm2(res.view(0,1));
+    *outStream << "Residual Norm: " << res[0] << std::endl;
+    errorFlag += (res[0] > 1.e-6 ? 1 : 0);
+    con->outputTpetraData();
+
+    auto obj0 = ROL::makePtr<IntegralObjective<RealT,DeviceT>>(qoi_vec[0],assembler);
+    RealT val = obj0->value(*up,*zp,tol);
+    *outStream << "Vorticity Value: " << val << std::endl;
+
+    // Get a summary from the time monitor.
+    Teuchos::TimeMonitor::summarize();
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/thermal-fluids/example_04.cpp b/packages/rol/example/PDE-OPT/thermal-fluids/example_04.cpp
index b20fcaf57321..33b21d75822e 100644
--- a/packages/rol/example/PDE-OPT/thermal-fluids/example_04.cpp
+++ b/packages/rol/example/PDE-OPT/thermal-fluids/example_04.cpp
@@ -13,8 +13,7 @@
 
 #include "Teuchos_Comm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -110,7 +109,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int> > comm
     = Tpetra::getDefaultComm();
   ROL::Ptr<const Teuchos::Comm<int> > serial_comm
diff --git a/packages/rol/example/PDE-OPT/thermal-fluids/example_04K.cpp b/packages/rol/example/PDE-OPT/thermal-fluids/example_04K.cpp
new file mode 100644
index 000000000000..acad35527a4d
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/thermal-fluids/example_04K.cpp
@@ -0,0 +1,321 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  example_01.cpp
+    \brief Shows how to solve the Navier-Stokes control problem.
+*/
+
+#include "Teuchos_Comm.hpp"
+#include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
+
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Version.hpp"
+
+#include <iostream>
+#include <algorithm>
+//#include <fenv.h>
+
+#include "ROL_TpetraMultiVector.hpp"
+#include "ROL_Reduced_Objective_SimOpt.hpp"
+#include "ROL_TpetraTeuchosBatchManager.hpp"
+#include "ROL_MonteCarloGenerator.hpp"
+#include "ROL_StochasticProblem.hpp"
+#include "ROL_Solver.hpp"
+
+#include "../TOOLS/pdeconstraintK.hpp"
+#include "../TOOLS/pdeobjectiveK.hpp"
+#include "../TOOLS/pdevectorK.hpp"
+#include "pde_thermal-fluids_ex03K.hpp"
+#include "obj_thermal-fluids_ex03K.hpp"
+#include "mesh_thermal-fluidsK.hpp"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+
+template<class Real>
+void print(ROL::Objective<Real> &obj,
+           const ROL::Vector<Real> &z,
+           ROL::SampleGenerator<Real> &sampler,
+           int ngsamp,
+           const ROL::Ptr<const Teuchos::Comm<int>> &comm,
+           std::string filename) {
+  Real tol(1e-8);
+  // Build objective function distribution
+  int nsamp = sampler.numMySamples();
+  std::vector<Real> myvalues(nsamp), myzerovec(nsamp, 0);
+  std::vector<double> gvalues(ngsamp), gzerovec(ngsamp, 0);
+  std::vector<Real> sample = sampler.getMyPoint(0);
+  int sdim = sample.size();
+  std::vector<std::vector<Real> > mysamples(sdim, myzerovec);
+  std::vector<std::vector<double> > gsamples(sdim, gzerovec);
+  for (int i = 0; i < nsamp; ++i) {
+    sample = sampler.getMyPoint(i);
+    obj.setParameter(sample);
+    myvalues[i] = static_cast<double>(obj.value(z,tol));
+    for (int j = 0; j < sdim; ++j) {
+      mysamples[j][i] = static_cast<double>(sample[j]);
+    }
+  }
+
+  // Send data to root processor
+#ifdef HAVE_MPI
+  auto mpicomm = ROL::dynamicPtrCast<const Teuchos::MpiComm<int>>(comm);
+  int nproc = Teuchos::size<int>(*mpicomm);
+  std::vector<int> sampleCounts(nproc, 0), sampleDispls(nproc, 0);
+  MPI_Gather(&nsamp,1,MPI_INT,&sampleCounts[0],1,MPI_INT,0,*(mpicomm->getRawMpiComm())());
+  for (int i = 1; i < nproc; ++i)
+    sampleDispls[i] = sampleDispls[i-1] + sampleCounts[i-1];
+  MPI_Gatherv(&myvalues[0],nsamp,MPI_DOUBLE,&gvalues[0],&sampleCounts[0],&sampleDispls[0],MPI_DOUBLE,0,*(mpicomm->getRawMpiComm())());
+  for (int j = 0; j < sdim; ++j)
+    MPI_Gatherv(&mysamples[j][0],nsamp,MPI_DOUBLE,&gsamples[j][0],&sampleCounts[0],&sampleDispls[0],MPI_DOUBLE,0,*(mpicomm->getRawMpiComm())());
+#else
+  gvalues.assign(myvalues.begin(),myvalues.end());
+  for (int j = 0; j < sdim; ++j)
+    gsamples[j].assign(mysamples[j].begin(),mysamples[j].end());
+#endif
+
+  // Print
+  int rank = Teuchos::rank<int>(*comm);
+  if ( rank==0 ) {
+    std::ofstream file;
+    file.open(filename);
+    file << std::scientific << std::setprecision(15);
+    for (int i = 0; i < ngsamp; ++i) {
+      for (int j = 0; j < sdim; ++j)
+        file << std::setw(25) << std::left << gsamples[j][i];
+      file << std::setw(25) << std::left << gvalues[i] << std::endl;
+    }
+    file.close();
+  }
+}
+
+int main(int argc, char *argv[]) {
+  //feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
+
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+
+  /*** Initialize communicator. ***/
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  Kokkos::ScopeGuard kokkosScope (argc, argv);
+  auto comm = Tpetra::getDefaultComm();
+  auto serial_comm = ROL::makePtr<Teuchos::SerialComm<int>>();
+  const int myRank = comm->getRank();
+  if ((iprint > 0) && (myRank == 0))
+    outStream = ROL::makePtrFromRef(std::cout);
+  else
+    outStream = ROL::makePtrFromRef(bhs);
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+
+    /*** Read in XML input ***/
+    std::string filename = "input_ex04.xml";
+    auto parlist = ROL::getParametersFromXmlFile(filename);
+    parlist->sublist("SimOpt").sublist("Solve").set("Output Iteration History",myRank==0);
+
+    /*** Initialize main data structure. ***/
+    auto meshMgr = ROL::makePtr<MeshManager_ThermalFluids<RealT,DeviceT>>(*parlist);
+    // Initialize PDE describing Navier-Stokes equations.
+    auto pde = ROL::makePtr<PDE_ThermalFluids_ex03<RealT,DeviceT>>(*parlist);
+    auto con = ROL::makePtr<PDE_Constraint<RealT,DeviceT>>(pde,meshMgr,serial_comm,*parlist,*outStream);
+    // Cast the constraint and get the assembler.
+    auto assembler = con->getAssembler();
+    con->setSolveParameters(*parlist);
+    con->outputTpetraData();
+
+    // Create state vector and set to zeroes
+    auto u_ptr  = assembler->createStateVector();     u_ptr->randomize();
+    auto p_ptr  = assembler->createStateVector();     p_ptr->randomize();
+    auto du_ptr = assembler->createStateVector();     du_ptr->randomize();
+    auto dy_ptr = assembler->createStateVector();     dy_ptr->randomize();
+    auto yu_ptr = assembler->createStateVector();     yu_ptr->randomize();
+    auto yp_ptr = assembler->createStateVector();     yp_ptr->randomize();
+    auto r_ptr  = assembler->createResidualVector();  r_ptr->randomize();
+    auto z_ptr  = assembler->createControlVector();   z_ptr->randomize();
+    auto dz_ptr = assembler->createControlVector();   dz_ptr->randomize();
+    auto up  = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(u_ptr,pde,assembler);
+    auto pp  = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(p_ptr,pde,assembler);
+    auto dup = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(du_ptr,pde,assembler);
+    auto dyp = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(dy_ptr,pde,assembler);
+    auto yup = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(yu_ptr,pde,assembler);
+    auto ypp = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(yp_ptr,pde,assembler);
+    auto rp  = ROL::makePtr<PDE_DualSimVector<RealT,DeviceT>>(r_ptr,pde,assembler);
+    auto zp  = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(z_ptr,pde,assembler);
+    auto dzp = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(dz_ptr,pde,assembler);
+    // Create ROL SimOpt vectors
+    ROL::Vector_SimOpt<RealT> x(up,zp);
+    ROL::Vector_SimOpt<RealT> d(dup,dzp);
+
+    // Initialize objective function.
+    std::vector<ROL::Ptr<QoI<RealT,DeviceT>>> qoi_vec(2,ROL::nullPtr);
+    qoi_vec[0] = ROL::makePtr<QoI_State_ThermalFluids<RealT,DeviceT>>(*parlist,
+                                                                 pde->getVelocityFE(),
+                                                                 pde->getPressureFE(),
+                                                                 pde->getThermalFE(),
+                                                                 pde->getFieldHelper());
+    qoi_vec[1] = ROL::makePtr<QoI_L2Penalty_ThermalFluids<RealT,DeviceT>>(pde->getVelocityFE(),
+                                                                     pde->getPressureFE(),
+                                                                     pde->getThermalFE(),
+                                                                     pde->getThermalBdryFE(),
+                                                                     pde->getBdryCellLocIds(),
+                                                                     pde->getFieldHelper());
+    auto std_obj = ROL::makePtr<StdObjective_ThermalFluids<RealT>>(*parlist);
+    auto obj = ROL::makePtr<PDE_Objective<RealT,DeviceT>>(qoi_vec,std_obj,assembler);
+    auto stateStore = ROL::makePtr<ROL::VectorController<RealT>>();
+    auto robj = ROL::makePtr<ROL::Reduced_Objective_SimOpt<RealT>>(obj, con, stateStore, up, zp, pp, true, false);
+
+    /*************************************************************************/
+    /***************** BUILD SAMPLER *****************************************/
+    /*************************************************************************/
+    int Nbottom = parlist->sublist("Problem").get("Bottom KL Truncation Order",5);
+    int Nleft   = parlist->sublist("Problem").get("Left KL Truncation Order",5);
+    int Nright  = parlist->sublist("Problem").get("Right KL Truncation Order",5);
+    int stochDim = Nbottom + Nleft + Nright + 3;
+    int nsamp = parlist->sublist("Problem").get("Number of samples",100);
+    // Build vector of distributions
+    std::vector<ROL::Ptr<ROL::Distribution<RealT>>> distVec(stochDim);
+    Teuchos::ParameterList UList;
+    UList.sublist("Distribution").set("Name","Uniform");
+    UList.sublist("Distribution").sublist("Uniform").set("Lower Bound",-1.0);
+    UList.sublist("Distribution").sublist("Uniform").set("Upper Bound", 1.0);
+    for (int i = 0; i < stochDim; ++i)
+      distVec[i] = ROL::DistributionFactory<RealT>(UList);
+    // Sampler
+    auto bman = ROL::makePtr<ROL::TpetraTeuchosBatchManager<RealT>>(comm);
+    //  = ROL::makePtr<PDE_OptVector_BatchManager<RealT>>(comm);
+    auto sampler = ROL::makePtr<ROL::MonteCarloGenerator<RealT>>(nsamp,distVec,bman);
+
+    /*************************************************************************/
+    /***************** BUILD STOCHASTIC PROBLEM ******************************/
+    /*************************************************************************/
+    auto opt = ROL::makePtr<ROL::StochasticProblem<RealT>>(robj,zp);
+    parlist->sublist("SOL").set("Initial Solution",1.0);
+    opt->makeObjectiveStochastic(*parlist,sampler);
+
+    // Run derivative checks
+    bool checkDeriv = parlist->sublist("Problem").get("Check derivatives",false);
+    if ( checkDeriv ) {
+      *outStream << "Check Gradient of Full Objective Function" << std::endl;
+      obj->checkGradient(x,d,true,*outStream);
+      *outStream << std::endl << "Check Hessian of Full Objective Function" << std::endl;
+      obj->checkHessVec(x,d,true,*outStream);
+      *outStream << std::endl << "Check Jacobian of Constraint" << std::endl;
+      con->checkApplyJacobian(x,d,*up,true,*outStream);
+      *outStream << std::endl << "Check Jacobian_1 of Constraint" << std::endl;
+      con->checkApplyJacobian_1(*up,*zp,*dup,*rp,true,*outStream);
+      *outStream << std::endl << "Check Jacobian_2 of Constraint" << std::endl;
+      con->checkApplyJacobian_2(*up,*zp,*dzp,*rp,true,*outStream);
+      *outStream << std::endl << "Check Hessian of Constraint" << std::endl;
+      con->checkApplyAdjointHessian(x,*dup,d,x,true,*outStream);
+      *outStream << std::endl << "Check Hessian_11 of Constraint" << std::endl;
+      con->checkApplyAdjointHessian_11(*up,*zp,*pp,*dup,*rp,true,*outStream);
+      *outStream << std::endl << "Check Hessian_12 of Constraint" << std::endl;
+      con->checkApplyAdjointHessian_12(*up,*zp,*pp,*dup,*dzp,true,*outStream);
+      *outStream << std::endl << "Check Hessian_21 of Constraint" << std::endl;
+      con->checkApplyAdjointHessian_21(*up,*zp,*pp,*dzp,*rp,true,*outStream);
+      *outStream << std::endl << "Check Hessian_22 of Constraint" << std::endl;
+      con->checkApplyAdjointHessian_22(*up,*zp,*pp,*dzp,*dzp,true,*outStream);
+
+      *outStream << std::endl << "Check Adjoint Jacobian of Constraint" << std::endl;
+      con->checkAdjointConsistencyJacobian(*dup,d,x,true,*outStream);
+      *outStream << std::endl << "Check Adjoint Jacobian_1 of Constraint" << std::endl;
+      con->checkAdjointConsistencyJacobian_1(*pp,*dup,*up,*zp,true,*outStream);
+      *outStream << std::endl << "Check Adjoint Jacobian_2 of Constraint" << std::endl;
+      con->checkAdjointConsistencyJacobian_2(*pp,*dzp,*up,*zp,true,*outStream);
+
+      *outStream << std::endl << "Check Constraint Solve" << std::endl;
+      con->checkSolve(*up,*zp,*rp,true,*outStream);
+      *outStream << std::endl << "Check Inverse Jacobian_1 of Constraint" << std::endl;
+      con->checkInverseJacobian_1(*rp,*dup,*up,*zp,true,*outStream);
+      *outStream << std::endl << "Check Inverse Adjoint Jacobian_1 of Constraint" << std::endl;
+      con->checkInverseAdjointJacobian_1(*rp,*pp,*up,*zp,true,*outStream);
+
+      *outStream << std::endl << "Check Gradient of Reduced Objective Function" << std::endl;
+      robj->checkGradient(*zp,*dzp,true,*outStream);
+      *outStream << std::endl << "Check Hessian of Reduced Objective Function" << std::endl;
+      robj->checkHessVec(*zp,*dzp,true,*outStream);
+    }
+
+    up->zero();
+    zp->zero();
+    parlist->sublist("Step").set("Type","Trust Region");
+    ROL::Solver<RealT> solver(opt,*parlist);
+    std::clock_t timer = std::clock();
+    solver.solve(*outStream);
+    *outStream << "Optimization time: "
+               << static_cast<RealT>(std::clock()-timer)/static_cast<RealT>(CLOCKS_PER_SEC)
+               << " seconds." << std::endl << std::endl;
+    
+    /*************************************************************************/
+    /***************** OUTPUT RESULTS ****************************************/
+    /*************************************************************************/
+    std::clock_t timer_print = std::clock();
+    assembler->printMeshData(*outStream);
+    // Output control to file
+    con->outputTpetraVector(z_ptr,"control.txt");
+    // Output expected state and samples to file
+    *outStream << std::endl << "Print Expected Value of State" << std::endl;
+    up->zero(); pp->zero(); dup->zero(); dzp->zero(); yup->zero(); dyp->zero();
+    RealT tol(1.e-8);
+    auto bman_Eu = ROL::makePtr<ROL::TpetraTeuchosBatchManager<RealT>>(comm);
+    std::vector<RealT> sample(stochDim);
+    std::stringstream name_samp;
+    name_samp << "samples_" << bman->batchID() << ".txt";
+    std::ofstream file_samp;
+    file_samp.open(name_samp.str());
+    file_samp << std::scientific << std::setprecision(15);
+    for (int i = 0; i < sampler->numMySamples(); ++i) {
+      *outStream << "Sample i = " << i << std::endl;
+      sample = sampler->getMyPoint(i);
+      con->setParameter(sample);
+      con->solve(*rp,*dup,*zp,tol);
+      up->axpy(sampler->getMyWeight(i),*dup);
+      con->solve(*rp,*dyp,*dzp,tol);
+      yup->axpy(sampler->getMyWeight(i),*dyp);
+      for (int j = 0; j < stochDim; ++j)
+        file_samp << std::setw(25) << std::left << sample[j];
+      file_samp << std::endl;
+    }
+    file_samp.close();
+    bman_Eu->sumAll(*up,*pp);
+    bman_Eu->sumAll(*yup,*ypp);
+    con->outputTpetraVector(p_ptr,"mean_state.txt");
+    con->outputTpetraVector(yp_ptr,"mean_uncontrolled_state.txt");
+    // Build full objective function distribution
+    *outStream << std::endl << "Print Objective CDF" << std::endl;
+    int nsamp_dist = parlist->sublist("Problem").get("Number of output samples",100);
+    auto sampler_dist = ROL::makePtr<ROL::MonteCarloGenerator<RealT>>(nsamp_dist,distVec,bman);
+    print<RealT>(*robj,*zp,*sampler_dist,nsamp_dist,comm,"obj_samples.txt");
+    // Build vorticity objective function distribution
+    auto obj0 = ROL::makePtr<IntegralObjective<RealT,DeviceT>>(qoi_vec[0],assembler);
+    auto stateStore0 = ROL::makePtr<ROL::VectorController<RealT>>();
+    auto robj0 = ROL::makePtr<ROL::Reduced_Objective_SimOpt<RealT>>(obj0, con, stateStore0, up, zp, pp, true, false);
+    print<RealT>(*robj0,*zp,*sampler_dist,nsamp_dist,comm,"vort_samples.txt");
+
+    *outStream << "Output time: "
+               << static_cast<RealT>(std::clock()-timer_print)/static_cast<RealT>(CLOCKS_PER_SEC)
+               << " seconds." << std::endl << std::endl;
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/thermal-fluids/example_05.cpp b/packages/rol/example/PDE-OPT/thermal-fluids/example_05.cpp
index d6469aef7ad0..3bcd88470aa6 100644
--- a/packages/rol/example/PDE-OPT/thermal-fluids/example_05.cpp
+++ b/packages/rol/example/PDE-OPT/thermal-fluids/example_05.cpp
@@ -13,8 +13,7 @@
 
 #include "Teuchos_Comm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -115,7 +114,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int> > comm
     = Tpetra::getDefaultComm();
   ROL::Ptr<const Teuchos::Comm<int> > serial_comm
diff --git a/packages/rol/example/PDE-OPT/thermal-fluids/example_05K.cpp b/packages/rol/example/PDE-OPT/thermal-fluids/example_05K.cpp
new file mode 100644
index 000000000000..981bc5a8ee3b
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/thermal-fluids/example_05K.cpp
@@ -0,0 +1,324 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  example_01.cpp
+    \brief Shows how to solve the Navier-Stokes control problem.
+*/
+
+#include "Teuchos_Comm.hpp"
+#include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
+
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Version.hpp"
+
+#include <iostream>
+#include <algorithm>
+//#include <fenv.h>
+
+#include "ROL_TpetraMultiVector.hpp"
+#include "ROL_Solver.hpp"
+#include "ROL_Reduced_Objective_SimOpt.hpp"
+#include "ROL_MonteCarloGenerator.hpp"
+#include "ROL_SimulatedConstraint.hpp"
+#include "ROL_SimulatedObjectiveCVaR.hpp"
+#include "ROL_SimulatedObjective.hpp"
+#include "ROL_TpetraTeuchosBatchManager.hpp"
+
+#include "../TOOLS/pdeconstraintK.hpp"
+#include "../TOOLS/pdeobjectiveK.hpp"
+#include "../TOOLS/pdevectorK.hpp"
+#include "pde_thermal-fluids_ex03K.hpp"
+#include "obj_thermal-fluids_ex03K.hpp"
+#include "mesh_thermal-fluidsK.hpp"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+
+template<class Real>
+void print(ROL::Objective<Real> &obj,
+           const ROL::Vector<Real> &z,
+           ROL::SampleGenerator<Real> &sampler,
+           int ngsamp,
+           const ROL::Ptr<const Teuchos::Comm<int>> &comm,
+           std::string filename) {
+  Real tol(1e-8);
+  // Build objective function distribution
+  int nsamp = sampler.numMySamples();
+  std::vector<Real> myvalues(nsamp), myzerovec(nsamp, 0);
+  std::vector<double> gvalues(ngsamp), gzerovec(ngsamp, 0);
+  std::vector<Real> sample = sampler.getMyPoint(0);
+  int sdim = sample.size();
+  std::vector<std::vector<Real> > mysamples(sdim, myzerovec);
+  std::vector<std::vector<double> > gsamples(sdim, gzerovec);
+  for (int i = 0; i < nsamp; ++i) {
+    sample = sampler.getMyPoint(i);
+    obj.setParameter(sample);
+    myvalues[i] = static_cast<double>(obj.value(z,tol));
+    for (int j = 0; j < sdim; ++j)
+      mysamples[j][i] = static_cast<double>(sample[j]);
+  }
+
+  // Send data to root processor
+#ifdef HAVE_MPI
+  auto mpicomm = ROL::dynamicPtrCast<const Teuchos::MpiComm<int>>(comm);
+  int nproc = Teuchos::size<int>(*mpicomm);
+  std::vector<int> sampleCounts(nproc, 0), sampleDispls(nproc, 0);
+  MPI_Gather(&nsamp,1,MPI_INT,&sampleCounts[0],1,MPI_INT,0,*(mpicomm->getRawMpiComm())());
+  for (int i = 1; i < nproc; ++i)
+    sampleDispls[i] = sampleDispls[i-1] + sampleCounts[i-1];
+  MPI_Gatherv(&myvalues[0],nsamp,MPI_DOUBLE,&gvalues[0],&sampleCounts[0],&sampleDispls[0],MPI_DOUBLE,0,*(mpicomm->getRawMpiComm())());
+  for (int j = 0; j < sdim; ++j)
+    MPI_Gatherv(&mysamples[j][0],nsamp,MPI_DOUBLE,&gsamples[j][0],&sampleCounts[0],&sampleDispls[0],MPI_DOUBLE,0,*(mpicomm->getRawMpiComm())());
+#else
+  gvalues.assign(myvalues.begin(),myvalues.end());
+  for (int j = 0; j < sdim; ++j)
+    gsamples[j].assign(mysamples[j].begin(),mysamples[j].end());
+#endif
+
+  // Print
+  int rank  = Teuchos::rank<int>(*comm);
+  if ( rank==0 ) {
+    std::ofstream file;
+    file.open(filename);
+    file << std::scientific << std::setprecision(15);
+    for (int i = 0; i < ngsamp; ++i) {
+      for (int j = 0; j < sdim; ++j)
+        file << std::setw(25) << std::left << gsamples[j][i];
+      file << std::setw(25) << std::left << gvalues[i] << std::endl;
+    }
+    file.close();
+  }
+}
+
+int main(int argc, char *argv[]) {
+  //feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
+
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+
+  /*** Initialize communicator. ***/
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  Kokkos::ScopeGuard kokkosScope (argc, argv);
+  auto comm = Tpetra::getDefaultComm();
+  auto serial_comm = ROL::makePtr<Teuchos::SerialComm<int>>();
+  const int myRank = comm->getRank();
+  if ((iprint > 0) && (myRank == 0))
+    outStream = ROL::makePtrFromRef(std::cout);
+  else
+    outStream = ROL::makePtrFromRef(bhs);
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+
+    /*** Read in XML input ***/
+    std::string filename = "input_ex05.xml";
+    auto parlist = ROL::getParametersFromXmlFile(filename);
+    parlist->sublist("SimOpt").sublist("Solve").set("Output Iteration History",myRank==0);
+
+    /*** Initialize main data structure. ***/
+    auto meshMgr = ROL::makePtr<MeshManager_ThermalFluids<RealT,DeviceT>>(*parlist);
+    // Initialize PDE describing Navier-Stokes equations.
+    auto pde = ROL::makePtr<PDE_ThermalFluids_ex03<RealT,DeviceT>>(*parlist);
+    auto con = ROL::makePtr<PDE_Constraint<RealT,DeviceT>>(pde,meshMgr,serial_comm,*parlist,*outStream);
+    // Cast the constraint and get the assembler.
+    auto assembler = con->getAssembler();
+    con->setSolveParameters(*parlist);
+    con->outputTpetraData();
+
+    // Create state vector and set to zeroes
+    auto u_ptr  = assembler->createStateVector();     u_ptr->randomize();
+    auto p_ptr  = assembler->createStateVector();     p_ptr->randomize();
+    auto y_ptr  = assembler->createStateVector();     y_ptr->randomize();
+    auto r_ptr  = assembler->createResidualVector();  r_ptr->randomize();
+    auto z_ptr  = assembler->createControlVector();   z_ptr->putScalar(1.234); //z_ptr->randomize();
+    auto s_ptr  = assembler->createControlVector();   s_ptr->putScalar(2.345); //s_ptr->randomize();
+    auto t_ptr  = assembler->createControlVector();   t_ptr->putScalar(3.456); //t_ptr->randomize();
+    auto up  = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(u_ptr,pde,assembler);
+    auto pp  = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(p_ptr,pde,assembler);
+    auto yp  = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(y_ptr,pde,assembler);
+    auto rp  = ROL::makePtr<PDE_DualSimVector<RealT,DeviceT>>(r_ptr,pde,assembler);
+    auto zp  = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(z_ptr,pde,assembler);
+    auto sp  = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(s_ptr,pde,assembler);
+    auto tp  = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(t_ptr,pde,assembler);
+
+    // Initialize objective function.
+    std::vector<ROL::Ptr<QoI<RealT,DeviceT> > > qoi_vec(2,ROL::nullPtr);
+    qoi_vec[0] = ROL::makePtr<QoI_State_ThermalFluids<RealT,DeviceT>>(*parlist,
+                                                                 pde->getVelocityFE(),
+                                                                 pde->getPressureFE(),
+                                                                 pde->getThermalFE(),
+                                                                 pde->getFieldHelper());
+    qoi_vec[1] = ROL::makePtr<QoI_L2Penalty_ThermalFluids<RealT,DeviceT>>(pde->getVelocityFE(),
+                                                                     pde->getPressureFE(),
+                                                                     pde->getThermalFE(),
+                                                                     pde->getThermalBdryFE(),
+                                                                     pde->getBdryCellLocIds(),
+                                                                     pde->getFieldHelper());
+    auto std_obj = ROL::makePtr<StdObjective_ThermalFluids<RealT>>(*parlist);
+    auto obj = ROL::makePtr<PDE_Objective<RealT,DeviceT>>(qoi_vec,std_obj,assembler);
+    auto stateStore = ROL::makePtr<ROL::VectorController<RealT>>();
+    auto robj = ROL::makePtr<ROL::Reduced_Objective_SimOpt<RealT>>(obj, con, stateStore, up, zp, pp, true, false);
+
+    /*************************************************************************/
+    /***************** BUILD SAMPLER *****************************************/
+    /*************************************************************************/
+    int Nbottom = parlist->sublist("Problem").get("Bottom KL Truncation Order",5);
+    int Nleft   = parlist->sublist("Problem").get("Left KL Truncation Order",5);
+    int Nright  = parlist->sublist("Problem").get("Right KL Truncation Order",5);
+    int stochDim = Nbottom + Nleft + Nright + 3;
+    int nsamp = parlist->sublist("Problem").get("Number of samples",100);
+    // Build vector of distributions
+    std::vector<ROL::Ptr<ROL::Distribution<RealT> > > distVec(stochDim);
+    Teuchos::ParameterList UList;
+    UList.sublist("Distribution").set("Name","Uniform");
+    UList.sublist("Distribution").sublist("Uniform").set("Lower Bound",-1.0);
+    UList.sublist("Distribution").sublist("Uniform").set("Upper Bound", 1.0);
+    for (int i = 0; i < stochDim; ++i)
+      distVec[i] = ROL::DistributionFactory<RealT>(UList);
+    // Sampler
+    auto bman = ROL::makePtr<ROL::TpetraTeuchosBatchManager<RealT>>(comm);
+    auto sampler = ROL::makePtr<ROL::MonteCarloGenerator<RealT>>(nsamp,distVec,bman);
+
+    /*************************************************************************/
+    /***************** BUILD STOCHASTIC PROBLEM ******************************/
+    /*************************************************************************/
+    bool useW    = parlist->sublist("SOL").sublist("Simulated").get("Use Constraint Weights", true);
+    bool useCVaR = parlist->sublist("SOL").sublist("Simulated").get("Use CVaR", false);
+    auto simcon  = ROL::makePtr<ROL::SimulatedConstraint<RealT>>(sampler, con, useW);
+    ROL::Ptr<ROL::Objective<RealT>> simobj;
+    if (useCVaR) {
+      ROL::ParameterList list = parlist->sublist("SOL").sublist("Simulated");
+      auto pf = ROL::makePtr<ROL::PlusFunction<RealT>>(list);
+      RealT alpha = parlist->sublist("SOL").sublist("Simulated").get("CVaR Confidence Level", 0.9);
+      simobj = ROL::makePtr<ROL::SimulatedObjectiveCVaR<RealT>>(sampler, obj, pf, alpha);
+    }
+    else {
+      simobj = ROL::makePtr<ROL::SimulatedObjective<RealT>>(sampler, obj);
+    }
+    std::vector<ROL::Ptr<ROL::Vector<RealT>>> vuvec, vpvec, vyvec;
+    for (int i = 0; i < sampler->numMySamples(); ++i) {
+      auto vu_ptr  = assembler->createStateVector(); vu_ptr->putScalar(4.567); //vu_ptr->randomize();
+      auto vp_ptr  = assembler->createStateVector(); vp_ptr->putScalar(5.678); //vp_ptr->randomize();
+      auto vy_ptr  = assembler->createStateVector(); vy_ptr->putScalar(6.789); //vy_ptr->randomize();
+      auto vup  = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(vu_ptr,pde,assembler);
+      auto vpp  = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(vp_ptr,pde,assembler);
+      auto vyp  = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(vy_ptr,pde,assembler);
+      vuvec.push_back(vup);
+      vpvec.push_back(vpp);
+      vyvec.push_back(vyp);
+    }
+    auto vu = ROL::makePtr<ROL::SimulatedVector<RealT>>(vuvec,bman);
+    auto vp = ROL::makePtr<ROL::SimulatedVector<RealT>>(vpvec,bman);
+    auto vy = ROL::makePtr<ROL::SimulatedVector<RealT>>(vyvec,bman);
+    ROL::Ptr<ROL::Vector<RealT>> rz, rs, rt;
+    if (useCVaR) {
+      Teuchos::RCP<Teuchos::ParameterList> cvarlist = Teuchos::rcp( new Teuchos::ParameterList() );
+      cvarlist->sublist("SOL").sublist("Risk Measure").set("Name","CVaR");
+      rz = ROL::makePtr<ROL::RiskVector<RealT>>(cvarlist, zp);
+      rs = ROL::makePtr<ROL::RiskVector<RealT>>(cvarlist, sp);
+      rt = ROL::makePtr<ROL::RiskVector<RealT>>(cvarlist, tp);
+    }
+    else {
+      rz = zp;
+      rs = sp;
+      rt = tp;
+    }
+    auto x = ROL::makePtr<ROL::Vector_SimOpt<RealT>>(vu,rz);
+    auto p = ROL::makePtr<ROL::Vector_SimOpt<RealT>>(vp,rs);
+    auto y = ROL::makePtr<ROL::Vector_SimOpt<RealT>>(vy,rt);
+    x->checkVector(*p,*y,true,*outStream);
+
+    bool derivCheck = parlist->sublist("Problem").get("Check derivatives",false);
+    if (derivCheck) {
+      *outStream << std::endl << "TESTING SimulatedConstraint" << std::endl;
+      simcon->checkApplyJacobian(*x, *p, *vu, true, *outStream);
+      simcon->checkAdjointConsistencyJacobian(*vu, *p, *x, true, *outStream);
+      simcon->checkApplyAdjointHessian(*x, *vu, *p, *x, true, *outStream);
+      *outStream << std::endl << "TESTING SimulatedObjective" << std::endl;
+      RealT tol = 1e-8;
+      simobj->value(*x, tol);
+      simobj->checkGradient(*x, *p, true, *outStream);
+      simobj->checkHessVec(*x, *p, true, *outStream);
+    }
+
+    zp->zero();
+    auto vusim = ROL::dynamicPtrCast<ROL::SimulatedVector<RealT>>(vu);
+    for (int i = 0; i < sampler->numMySamples(); ++i) {
+      RealT tol = 1e-8;
+      std::vector<RealT> param = sampler->getMyPoint(i);
+      con->setParameter(param);
+      vusim->get(i)->zero();
+      con->update(*(vusim->get(i)),*zp);
+      con->solve(*rp,*(vusim->get(i)),*zp,tol);
+    }
+
+    bool zeroInit = parlist->sublist("Problem").get("Zero initial guess",true);
+    if (zeroInit) {
+      x->zero();
+      vp->zero();
+    }
+
+    /*************************************************************************/
+    /***************** SOLVE PROBLEM *****************************************/
+    /*************************************************************************/
+    auto opt = ROL::makePtr<ROL::Problem<RealT>>(simobj, x);
+    opt->addConstraint("PDE", simcon, vp);
+    opt->finalize(false,true,*outStream);
+    ROL::Solver<RealT> solver(opt,*parlist);
+    std::clock_t timer = std::clock();
+    solver.solve(*outStream);
+    *outStream << "Optimization time: "
+               << static_cast<RealT>(std::clock()-timer)/static_cast<RealT>(CLOCKS_PER_SEC)
+               << " seconds." << std::endl << std::endl;
+    
+    /*************************************************************************/
+    /***************** OUTPUT RESULTS ****************************************/
+    /*************************************************************************/
+    std::clock_t timer_print = std::clock();
+    assembler->printMeshData(*outStream);
+    // Output control to file
+    con->outputTpetraVector(z_ptr,"control.txt");
+    // Output expected state and samples to file
+    *outStream << std::endl << "Print Expected Value of State" << std::endl;
+    up->zero(); pp->zero();
+    for (int i = 0; i < sampler->numMySamples(); ++i)
+      up->axpy(sampler->getMyWeight(i),*(vusim->get(i)));
+    bman->sumAll(*up,*pp);
+    con->outputTpetraVector(p_ptr,"mean_state.txt");
+    // Build full objective function distribution
+    *outStream << std::endl << "Print Objective CDF" << std::endl;
+    int nsamp_dist = parlist->sublist("Problem").get("Number of output samples",100);
+    auto sampler_dist = ROL::makePtr<ROL::MonteCarloGenerator<RealT>>(nsamp_dist,distVec,bman);
+    print<RealT>(*robj,*zp,*sampler_dist,nsamp_dist,comm,"obj_samples.txt");
+    // Build vorticity objective function distribution
+    auto obj0 = ROL::makePtr<IntegralObjective<RealT,DeviceT>>(qoi_vec[0],assembler);
+    auto stateStore0 = ROL::makePtr<ROL::VectorController<RealT>>();
+    auto robj0 = ROL::makePtr<ROL::Reduced_Objective_SimOpt<RealT>>(obj0, con, stateStore0, up, zp, pp, true, false);
+    print<RealT>(*robj0,*zp,*sampler_dist,nsamp_dist,comm,"vort_samples.txt");
+
+    *outStream << "Output time: "
+               << static_cast<RealT>(std::clock()-timer_print)/static_cast<RealT>(CLOCKS_PER_SEC)
+               << " seconds." << std::endl << std::endl;
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/thermal-fluids/example_06.cpp b/packages/rol/example/PDE-OPT/thermal-fluids/example_06.cpp
index 9ded3b6eebcf..decd51517363 100644
--- a/packages/rol/example/PDE-OPT/thermal-fluids/example_06.cpp
+++ b/packages/rol/example/PDE-OPT/thermal-fluids/example_06.cpp
@@ -13,8 +13,7 @@
 
 #include "Teuchos_Comm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -123,7 +122,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int> > comm
     = Tpetra::getDefaultComm();
   ROL::Ptr<const Teuchos::Comm<int> > serial_comm
diff --git a/packages/rol/example/PDE-OPT/thermal-fluids/example_06K.cpp b/packages/rol/example/PDE-OPT/thermal-fluids/example_06K.cpp
new file mode 100644
index 000000000000..0ee26ad5328b
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/thermal-fluids/example_06K.cpp
@@ -0,0 +1,309 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  example_01.cpp
+    \brief Shows how to solve the Navier-Stokes control problem.
+*/
+
+#include "Teuchos_Comm.hpp"
+#include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
+
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Version.hpp"
+
+#include <iostream>
+#include <algorithm>
+//#include <fenv.h>
+
+#include "ROL_TpetraMultiVector.hpp"
+#include "ROL_Reduced_Objective_SimOpt.hpp"
+#include "ROL_TpetraTeuchosBatchManager.hpp"
+#include "ROL_MonteCarloGenerator.hpp"
+#include "ROL_StochasticProblem.hpp"
+#include "ROL_Solver.hpp"
+
+#include "../TOOLS/pdeconstraintK.hpp"
+#include "../TOOLS/pdeobjectiveK.hpp"
+#include "../TOOLS/pdevectorK.hpp"
+#include "pde_thermal-fluids_ex03K.hpp"
+#include "obj_thermal-fluids_ex03K.hpp"
+#include "mesh_thermal-fluidsK.hpp"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+
+template<class Real>
+void setUpAndSolve(const ROL::Ptr<ROL::Problem<Real>> &opt,
+                   ROL::ParameterList &parlist,
+                   std::ostream &outStream) {
+  parlist.sublist("Step").set("Type","Trust Region");
+  ROL::Solver<RealT> solver(opt,parlist);
+  Teuchos::Time timer("Optimization Time", true);
+  solver.solve(outStream);
+  timer.stop();
+  outStream << "Total optimization time = " << timer.totalElapsedTime() << " seconds." << std::endl;
+}
+
+template<class Real>
+void print(ROL::Objective<Real> &obj,
+           const ROL::Vector<Real> &z,
+           ROL::SampleGenerator<Real> &sampler,
+           int ngsamp,
+           const ROL::Ptr<const Teuchos::Comm<int>> &comm,
+           std::string filename) {
+  Real tol(1e-8);
+  // Build objective function distribution
+  int nsamp = sampler.numMySamples();
+  std::vector<Real> myvalues(nsamp), myzerovec(nsamp, 0);
+  std::vector<double> gvalues(ngsamp), gzerovec(ngsamp, 0);
+  std::vector<Real> sample = sampler.getMyPoint(0);
+  int sdim = sample.size();
+  std::vector<std::vector<Real> > mysamples(sdim, myzerovec);
+  std::vector<std::vector<double> > gsamples(sdim, gzerovec);
+  for (int i = 0; i < nsamp; ++i) {
+    sample = sampler.getMyPoint(i);
+    obj.setParameter(sample);
+    myvalues[i] = static_cast<double>(obj.value(z,tol));
+    for (int j = 0; j < sdim; ++j)
+      mysamples[j][i] = static_cast<double>(sample[j]);
+  }
+
+  // Send data to root processor
+#ifdef HAVE_MPI
+  auto mpicomm = ROL::dynamicPtrCast<const Teuchos::MpiComm<int>>(comm);
+  int nproc = Teuchos::size<int>(*mpicomm);
+  std::vector<int> sampleCounts(nproc, 0), sampleDispls(nproc, 0);
+  MPI_Gather(&nsamp,1,MPI_INT,&sampleCounts[0],1,MPI_INT,0,*(mpicomm->getRawMpiComm())());
+  for (int i = 1; i < nproc; ++i)
+    sampleDispls[i] = sampleDispls[i-1] + sampleCounts[i-1];
+  MPI_Gatherv(&myvalues[0],nsamp,MPI_DOUBLE,&gvalues[0],&sampleCounts[0],&sampleDispls[0],MPI_DOUBLE,0,*(mpicomm->getRawMpiComm())());
+  for (int j = 0; j < sdim; ++j)
+    MPI_Gatherv(&mysamples[j][0],nsamp,MPI_DOUBLE,&gsamples[j][0],&sampleCounts[0],&sampleDispls[0],MPI_DOUBLE,0,*(mpicomm->getRawMpiComm())());
+#else
+  gvalues.assign(myvalues.begin(),myvalues.end());
+  for (int j = 0; j < sdim; ++j)
+    gsamples[j].assign(mysamples[j].begin(),mysamples[j].end());
+#endif
+
+  // Print
+  int rank = Teuchos::rank<int>(*comm);
+  if ( rank==0 ) {
+    std::ofstream file;
+    file.open(filename);
+    file << std::scientific << std::setprecision(15);
+    for (int i = 0; i < ngsamp; ++i) {
+      for (int j = 0; j < sdim; ++j)
+        file << std::setw(25) << std::left << gsamples[j][i];
+      file << std::setw(25) << std::left << gvalues[i] << std::endl;
+    }
+    file.close();
+  }
+}
+
+int main(int argc, char *argv[]) {
+  //feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
+
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+
+  /*** Initialize communicator. ***/
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  Kokkos::ScopeGuard kokkosScope (argc, argv);
+  auto comm = Tpetra::getDefaultComm();
+  auto serial_comm = ROL::makePtr<Teuchos::SerialComm<int>>();
+  const int myRank = comm->getRank();
+  if ((iprint > 0) && (myRank == 0))
+    outStream = ROL::makePtrFromRef(std::cout);
+  else
+    outStream = ROL::makePtrFromRef(bhs);
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+
+    /*** Read in XML input ***/
+    std::string filename = "input_ex06.xml";
+    auto parlist = ROL::getParametersFromXmlFile(filename);
+    parlist->sublist("SimOpt").sublist("Solve").set("Output Iteration History",myRank==0);
+
+    /*** Initialize main data structure. ***/
+    auto meshMgr = ROL::makePtr<MeshManager_ThermalFluids<RealT,DeviceT>>(*parlist);
+    // Initialize PDE describing Navier-Stokes equations.
+    auto pde = ROL::makePtr<PDE_ThermalFluids_ex03<RealT,DeviceT>>(*parlist);
+    auto con = ROL::makePtr<PDE_Constraint<RealT,DeviceT>>(pde,meshMgr,serial_comm,*parlist,*outStream);
+    // Cast the constraint and get the assembler.
+    auto assembler = con->getAssembler();
+    con->setSolveParameters(*parlist);
+    con->outputTpetraData();
+
+    // Create state vector and set to zeroes
+    auto u_ptr  = assembler->createStateVector();     u_ptr->randomize();
+    auto p_ptr  = assembler->createStateVector();     p_ptr->randomize();
+    auto du_ptr = assembler->createStateVector();     du_ptr->randomize();
+    auto dy_ptr = assembler->createStateVector();     dy_ptr->randomize();
+    auto yu_ptr = assembler->createStateVector();     yu_ptr->randomize();
+    auto yp_ptr = assembler->createStateVector();     yp_ptr->randomize();
+    auto r_ptr  = assembler->createResidualVector();  r_ptr->randomize();
+    auto z_ptr  = assembler->createControlVector();   z_ptr->randomize();
+    auto dz_ptr = assembler->createControlVector();   dz_ptr->randomize();
+    auto up  = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(u_ptr,pde,assembler);
+    auto pp  = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(p_ptr,pde,assembler);
+    auto dup = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(du_ptr,pde,assembler);
+    auto dyp = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(dy_ptr,pde,assembler);
+    auto yup = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(yu_ptr,pde,assembler);
+    auto ypp = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(yp_ptr,pde,assembler);
+    auto rp  = ROL::makePtr<PDE_DualSimVector<RealT,DeviceT>>(r_ptr,pde,assembler);
+    auto zp  = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(z_ptr,pde,assembler);
+    auto dzp = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(dz_ptr,pde,assembler);
+    // Create ROL SimOpt vectors
+    ROL::Vector_SimOpt<RealT> x(up,zp);
+    ROL::Vector_SimOpt<RealT> d(dup,dzp);
+
+    // Initialize objective function.
+    std::vector<ROL::Ptr<QoI<RealT,DeviceT> > > qoi_vec(2,ROL::nullPtr);
+    qoi_vec[0] = ROL::makePtr<QoI_State_ThermalFluids<RealT,DeviceT>>(*parlist,
+                                                                 pde->getVelocityFE(),
+                                                                 pde->getPressureFE(),
+                                                                 pde->getThermalFE(),
+                                                                 pde->getFieldHelper());
+    qoi_vec[1] = ROL::makePtr<QoI_L2Penalty_ThermalFluids<RealT,DeviceT>>(pde->getVelocityFE(),
+                                                                     pde->getPressureFE(),
+                                                                     pde->getThermalFE(),
+                                                                     pde->getThermalBdryFE(),
+                                                                     pde->getBdryCellLocIds(),
+                                                                     pde->getFieldHelper());
+    auto std_obj = ROL::makePtr<StdObjective_ThermalFluids<RealT>>(*parlist);
+    auto obj = ROL::makePtr<PDE_Objective<RealT,DeviceT>>(qoi_vec,std_obj,assembler);
+    auto objState = ROL::makePtr<PDE_Objective<RealT,DeviceT>>(qoi_vec[0],assembler);
+    auto objCtrl = ROL::makePtr<PDE_Objective<RealT,DeviceT>>(qoi_vec[1],assembler);
+    auto stateStore = ROL::makePtr<ROL::VectorController<RealT>>();
+    auto robj = ROL::makePtr<ROL::Reduced_Objective_SimOpt<RealT>>(obj, con, stateStore, up, zp, pp, true, false);
+
+    /*************************************************************************/
+    /***************** BUILD SAMPLER *****************************************/
+    /*************************************************************************/
+    int Nbottom = parlist->sublist("Problem").get("Bottom KL Truncation Order",5);
+    int Nleft   = parlist->sublist("Problem").get("Left KL Truncation Order",5);
+    int Nright  = parlist->sublist("Problem").get("Right KL Truncation Order",5);
+    int stochDim = Nbottom + Nleft + Nright + 3;
+    int nsamp = parlist->sublist("Problem").get("Number of samples",100);
+    int nsamp_dist = parlist->sublist("Problem").get("Number of output samples",100);
+    // Build vector of distributions
+    std::vector<ROL::Ptr<ROL::Distribution<RealT>>> distVec(stochDim);
+    Teuchos::ParameterList UList;
+    UList.sublist("Distribution").set("Name","Uniform");
+    UList.sublist("Distribution").sublist("Uniform").set("Lower Bound",-1.0);
+    UList.sublist("Distribution").sublist("Uniform").set("Upper Bound", 1.0);
+    for (int i = 0; i < stochDim; ++i)
+      distVec[i] = ROL::DistributionFactory<RealT>(UList);
+    // Sampler
+    auto bman = ROL::makePtr<ROL::TpetraTeuchosBatchManager<RealT>>(comm);
+    //  = ROL::makePtr<PDE_OptVector_BatchManager<RealT>>(comm);
+    auto sampler = ROL::makePtr<ROL::MonteCarloGenerator<RealT>>(nsamp,distVec,bman);
+    auto sampler_dist = ROL::makePtr<ROL::MonteCarloGenerator<RealT>>(nsamp_dist,distVec,bman);
+
+    /*************************************************************************/
+    /***************** BUILD STOCHASTIC PROBLEM ******************************/
+    /*************************************************************************/
+    ROL::Ptr<ROL::StochasticProblem<RealT>> opt;
+    std::vector<RealT> ctrl, stat;
+
+    std::vector<RealT> lambda = ROL::getArrayFromStringParameter<RealT>(parlist->sublist("Problem"),"Convex Combination Parameters");
+    std::sort(lambda.begin(),lambda.end());
+    int N = lambda.size();
+    RealT alpha = parlist->sublist("Problem").get("CVaR Confidence Level",0.9);
+
+    /*************************************************************************/
+    /***************** SOLVE RISK NEUTRAL ************************************/
+    /*************************************************************************/
+    zp->zero(); up->zero();
+    RealT tol(1e-8), zero(0), one(1);
+    bool lambdaZero = (lambda[0] == zero);
+    if ( lambdaZero ) {
+      lambda.erase(lambda.begin()); --N;
+      // Solve.
+      parlist->sublist("SOL").set("Type","Risk Neutral");
+      opt = ROL::makePtr<ROL::StochasticProblem<RealT>>(robj,zp);
+      parlist->sublist("SOL").set("Initial Statistic",one);
+      opt->makeObjectiveStochastic(*parlist,sampler);
+      setUpAndSolve<RealT>(opt,*parlist,*outStream);
+      // Output.
+      ctrl.push_back(objCtrl->value(*up,*zp,tol));
+      stat.push_back(opt->getSolutionStatistic());
+      std::string CtrlRN = "control_RN.txt";
+      con->outputTpetraVector(z_ptr,CtrlRN);
+      std::string ObjRN = "obj_samples_RN.txt";
+      print<RealT>(*robj,*zp,*sampler_dist,nsamp_dist,comm,ObjRN);
+    }
+
+    /*************************************************************************/
+    /***************** SOLVE MEAN PLUS CVAR **********************************/
+    /*************************************************************************/
+    parlist->sublist("SOL").set("Type","Risk Averse");
+    parlist->sublist("SOL").sublist("Risk Measure").set("Name","CVaR");
+    parlist->sublist("SOL").sublist("Risk Measure").sublist("CVaR").set("Confidence Level",alpha);
+    parlist->sublist("SOL").sublist("Risk Measure").sublist("CVaR").set("Smoothing Parameter",1e-4);
+    parlist->sublist("SOL").sublist("Risk Measure").sublist("CVaR").sublist("Distribution").set("Name","Parabolic");
+    parlist->sublist("SOL").sublist("Risk Measure").sublist("CVaR").sublist("Distribution").sublist("Parabolic").set("Lower Bound",0.0);
+    parlist->sublist("SOL").sublist("Risk Measure").sublist("CVaR").sublist("Distribution").sublist("Parabolic").set("Upper Bound",1.0);
+    for (int i = 0; i < N; ++i) {
+      parlist->sublist("SOL").sublist("Risk Measure").sublist("CVaR").set("Convex Combination Parameter",one-lambda[i]);
+      // Solve.
+      opt = ROL::makePtr<ROL::StochasticProblem<RealT>>(robj,zp);
+      parlist->sublist("SOL").set("Initial Statistic",stat[i]);
+      opt->makeObjectiveStochastic(*parlist,sampler);
+      setUpAndSolve<RealT>(opt,*parlist,*outStream);
+      // Output.
+      ctrl.push_back(objCtrl->value(*up,*zp,tol));
+      stat.push_back(opt->getSolutionStatistic());
+      std::stringstream nameCtrl;
+      nameCtrl << "control_MCVaR_" << i << ".txt";
+      con->outputTpetraVector(z_ptr,nameCtrl.str().c_str());
+      std::stringstream nameObj;
+      nameObj << "obj_samples_MCVaR_" << i << ".txt";
+      print<RealT>(*robj,*zp,*sampler_dist,nsamp_dist,comm,nameObj.str());
+    }
+    
+    /*************************************************************************/
+    /***************** OUTPUT RESULTS ****************************************/
+    /*************************************************************************/
+    const int rank = Teuchos::rank<int>(*comm);
+    if ( rank==0 ) {
+      std::stringstream nameCTRL, nameVAR;
+      nameCTRL << "ctrl.txt";
+      nameVAR << "stat.txt";
+      std::ofstream fileCTRL, fileVAR;
+      fileCTRL.open(nameCTRL.str());
+      fileVAR.open(nameVAR.str());
+      fileCTRL << std::scientific << std::setprecision(15);
+      fileVAR << std::scientific << std::setprecision(15);
+      int size = stat.size();
+      for (int i = 0; i < size; ++i) {
+        fileCTRL << std::setw(25) << std::left << ctrl[i] << std::endl;
+        fileVAR << std::setw(25) << std::left << stat[i] << std::endl;
+      }
+      fileCTRL.close();
+      fileVAR.close();
+    }
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/thermal-fluids/example_07.cpp b/packages/rol/example/PDE-OPT/thermal-fluids/example_07.cpp
index 69b7ba9814e1..378353fed335 100644
--- a/packages/rol/example/PDE-OPT/thermal-fluids/example_07.cpp
+++ b/packages/rol/example/PDE-OPT/thermal-fluids/example_07.cpp
@@ -13,8 +13,7 @@
 
 #include "Teuchos_Comm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -109,7 +108,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int> > comm
     = Tpetra::getDefaultComm();
   ROL::Ptr<const Teuchos::Comm<int> > serial_comm
diff --git a/packages/rol/example/PDE-OPT/thermal-fluids/example_07K.cpp b/packages/rol/example/PDE-OPT/thermal-fluids/example_07K.cpp
new file mode 100644
index 000000000000..ed9ba8954f45
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/thermal-fluids/example_07K.cpp
@@ -0,0 +1,248 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  example_01.cpp
+    \brief Shows how to solve the Navier-Stokes control problem.
+*/
+
+#include "Teuchos_Comm.hpp"
+#include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
+
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Version.hpp"
+
+#include <iostream>
+#include <algorithm>
+//#include <fenv.h>
+
+#include "ROL_TpetraMultiVector.hpp"
+#include "ROL_Reduced_Objective_SimOpt.hpp"
+#include "ROL_MonteCarloGenerator.hpp"
+#include "ROL_TpetraTeuchosBatchManager.hpp"
+#include "ROL_StochasticProblem.hpp"
+#include "ROL_Solver.hpp"
+
+#include "../TOOLS/pdeconstraintK.hpp"
+#include "../TOOLS/pdeobjectiveK.hpp"
+#include "../TOOLS/pdevectorK.hpp"
+#include "../TOOLS/meshreaderK.hpp"
+#include "pde_thermal-fluids_ex07K.hpp"
+#include "obj_thermal-fluids_ex03K.hpp"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+
+template<class Real>
+void print(ROL::Objective<Real> &obj,
+           const ROL::Vector<Real> &z,
+           ROL::SampleGenerator<Real> &sampler,
+           int ngsamp,
+           const ROL::Ptr<const Teuchos::Comm<int> > &comm,
+           std::string filename) {
+  Real tol(1e-8);
+  // Build objective function distribution
+  int nsamp = sampler.numMySamples();
+  std::vector<Real> myvalues(nsamp), myzerovec(nsamp, 0);
+  std::vector<double> gvalues(ngsamp), gzerovec(ngsamp, 0);
+  std::vector<Real> sample = sampler.getMyPoint(0);
+  int sdim = sample.size();
+  std::vector<std::vector<Real> > mysamples(sdim, myzerovec);
+  std::vector<std::vector<double> > gsamples(sdim, gzerovec);
+  for (int i = 0; i < nsamp; ++i) {
+    sample = sampler.getMyPoint(i);
+    obj.setParameter(sample);
+    myvalues[i] = static_cast<double>(obj.value(z,tol));
+    for (int j = 0; j < sdim; ++j)
+      mysamples[j][i] = static_cast<double>(sample[j]);
+  }
+
+  // Send data to root processor
+#ifdef HAVE_MPI
+  auto mpicomm = ROL::dynamicPtrCast<const Teuchos::MpiComm<int> >(comm);
+  int nproc = Teuchos::size<int>(*mpicomm);
+  std::vector<int> sampleCounts(nproc, 0), sampleDispls(nproc, 0);
+  MPI_Gather(&nsamp,1,MPI_INT,&sampleCounts[0],1,MPI_INT,0,*(mpicomm->getRawMpiComm())());
+  for (int i = 1; i < nproc; ++i)
+    sampleDispls[i] = sampleDispls[i-1] + sampleCounts[i-1];
+  MPI_Gatherv(&myvalues[0],nsamp,MPI_DOUBLE,&gvalues[0],&sampleCounts[0],&sampleDispls[0],MPI_DOUBLE,0,*(mpicomm->getRawMpiComm())());
+  for (int j = 0; j < sdim; ++j)
+    MPI_Gatherv(&mysamples[j][0],nsamp,MPI_DOUBLE,&gsamples[j][0],&sampleCounts[0],&sampleDispls[0],MPI_DOUBLE,0,*(mpicomm->getRawMpiComm())());
+#else
+  gvalues.assign(myvalues.begin(),myvalues.end());
+  for (int j = 0; j < sdim; ++j)
+    gsamples[j].assign(mysamples[j].begin(),mysamples[j].end());
+#endif
+
+  // Print
+  int rank = Teuchos::rank<int>(*comm);
+  if ( rank==0 ) {
+    std::ofstream file;
+    file.open(filename);
+    file << std::scientific << std::setprecision(15);
+    for (int i = 0; i < ngsamp; ++i) {
+      for (int j = 0; j < sdim; ++j)
+        file << std::setw(25) << std::left << gsamples[j][i];
+      file << std::setw(25) << std::left << gvalues[i] << std::endl;
+    }
+    file.close();
+  }
+}
+
+int main(int argc, char *argv[]) {
+  //feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
+
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+
+  /*** Initialize communicator. ***/
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  Kokkos::ScopeGuard kokkosScope (argc, argv);
+  auto comm = Tpetra::getDefaultComm();
+  auto serial_comm = ROL::makePtr<Teuchos::SerialComm<int>>();
+  const int myRank = comm->getRank();
+  if ((iprint > 0) && (myRank == 0))
+    outStream = ROL::makePtrFromRef(std::cout);
+  else
+    outStream = ROL::makePtrFromRef(bhs);
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+
+    /*** Read in XML input ***/
+    std::string filename = "input_ex07.xml";
+    auto parlist = ROL::getParametersFromXmlFile(filename);
+
+    /*** Initialize main data structure. ***/
+    auto meshMgr = ROL::makePtr<MeshReader<RealT,DeviceT>>(*parlist);
+    // Initialize PDE describing Navier-Stokes equations.
+    auto pde = ROL::makePtr<PDE_ThermalFluids_ex07<RealT,DeviceT>>(*parlist);
+    auto con = ROL::makePtr<PDE_Constraint<RealT,DeviceT>>(pde,meshMgr,serial_comm,*parlist,*outStream);
+    // Cast the constraint and get the assembler.
+    auto assembler = con->getAssembler();
+    con->setSolveParameters(*parlist);
+    con->outputTpetraData();
+
+    // Create state vector and set to zeroes
+    auto u_ptr  = assembler->createStateVector();
+    auto p_ptr  = assembler->createStateVector();
+    auto r_ptr  = assembler->createResidualVector();
+    auto z_ptr  = assembler->createControlVector();
+    auto up  = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(u_ptr,pde,assembler);
+    auto pp  = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(p_ptr,pde,assembler);
+    auto rp  = ROL::makePtr<PDE_DualSimVector<RealT,DeviceT>>(r_ptr,pde,assembler);
+    auto zp  = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(z_ptr,pde,assembler);
+
+    // Create ROL SimOpt vectors
+    //ROL::Vector_SimOpt<RealT> x(up,zp);
+
+    // Initialize objective function.
+    std::vector<ROL::Ptr<QoI<RealT,DeviceT>>> qoi_vec(2,ROL::nullPtr);
+    qoi_vec[0] = ROL::makePtr<QoI_State_ThermalFluids<RealT,DeviceT>>(*parlist,
+                                                                 pde->getVelocityFE(),
+                                                                 pde->getPressureFE(),
+                                                                 pde->getThermalFE(),
+                                                                 pde->getFieldHelper());
+    qoi_vec[1] = ROL::makePtr<QoI_L2Penalty_ThermalFluids<RealT,DeviceT>>(pde->getVelocityFE(),
+                                                                     pde->getPressureFE(),
+                                                                     pde->getThermalFE(),
+                                                                     pde->getThermalBdryFE(),
+                                                                     pde->getBdryCellLocIds(),
+                                                                     pde->getFieldHelper());
+    auto std_obj = ROL::makePtr<StdObjective_ThermalFluids<RealT>>(*parlist);
+    auto obj = ROL::makePtr<PDE_Objective<RealT,DeviceT>>(qoi_vec,std_obj,assembler);
+    auto stateStore = ROL::makePtr<ROL::VectorController<RealT>>();
+    auto robj = ROL::makePtr<ROL::Reduced_Objective_SimOpt<RealT>>(obj, con, stateStore, up, zp, pp, true, false);
+
+    /*************************************************************************/
+    /***************** BUILD SAMPLER *****************************************/
+    /*************************************************************************/
+    int Nbottom = parlist->sublist("Problem").get("Bottom KL Truncation Order",5);
+    int N0      = parlist->sublist("Problem").get("Side 0 KL Truncation Order",5);
+    int N1      = parlist->sublist("Problem").get("Side 1 KL Truncation Order",5);
+    int N2      = parlist->sublist("Problem").get("Side 2 KL Truncation Order",5);
+    int N3      = parlist->sublist("Problem").get("Side 3 KL Truncation Order",5);
+    int stochDim = 2*(Nbottom + N0 + N1 + N2 + N3) + 3;
+    int nsamp = parlist->sublist("Problem").get("Number of samples",100);
+    int nsamp_dist = parlist->sublist("Problem").get("Number of output samples",100);
+    // Build vector of distributions
+    std::vector<ROL::Ptr<ROL::Distribution<RealT>>> distVec(stochDim);
+    Teuchos::ParameterList UList;
+    UList.sublist("Distribution").set("Name","Uniform");
+    UList.sublist("Distribution").sublist("Uniform").set("Lower Bound",-1.0);
+    UList.sublist("Distribution").sublist("Uniform").set("Upper Bound", 1.0);
+    for (int i = 0; i < stochDim; ++i)
+      distVec[i] = ROL::DistributionFactory<RealT>(UList);
+    // Sampler
+    auto bman = ROL::makePtr<ROL::TpetraTeuchosBatchManager<RealT>>(comm);
+    auto sampler = ROL::makePtr<ROL::MonteCarloGenerator<RealT>>(nsamp,distVec,bman);
+    auto sampler_dist = ROL::makePtr<ROL::MonteCarloGenerator<RealT>>(nsamp_dist,distVec,bman);
+
+    // Set up optimization problem, check derivatives and solve
+    zp->zero();
+    auto optProb = ROL::makePtr<ROL::StochasticProblem<RealT>>(robj,zp);
+    bool isStoch = parlist->sublist("Problem").get("Is stochastic?",false);
+    if (isStoch) {
+      optProb->makeObjectiveStochastic(*parlist,sampler);
+    }
+    if ( parlist->sublist("Problem").get("Check derivatives",false) ) {
+      optProb->check(true,*outStream);
+      auto xp = ROL::makePtr<ROL::Vector_SimOpt<RealT>>(up,zp);
+      ROL::Problem<RealT> op(obj,xp);
+      op.addConstraint("PDE",con,pp);
+      op.check(true,*outStream);
+    }
+    optProb->finalize(false,true,*outStream);
+    ROL::Solver<RealT> optSolver(optProb,*parlist);
+    optSolver.solve(*outStream);
+
+    // Output.
+    RealT tol = std::sqrt(ROL::ROL_EPSILON<RealT>());
+    if (isStoch) {
+      std::vector<RealT> pt;
+      RealT wt;
+      up->zero(); pp->zero();
+      for (int i = 0; i < sampler->numMySamples(); ++i) {
+        pt = sampler->getMyPoint(i);
+        wt = sampler->getMyWeight(i);
+        con->setParameter(pt);
+        con->solve(*rp,*up,*zp,tol);
+        pp->axpy(wt,*up);
+      }
+      up->zero();
+      sampler->sumAll(*pp,*up);
+    } else {
+      con->solve(*rp,*up,*zp,tol);
+    }
+    con->outputTpetraVector(u_ptr,"state.txt");
+
+    con->outputTpetraVector(z_ptr,"control.txt");
+    assembler->printMeshData(*outStream);
+    if (isStoch) {
+      print<RealT>(*robj,*zp,*sampler_dist,nsamp_dist,comm,"obj_samples.txt");
+    }
+
+    // Get a summary from the time monitor.
+    Teuchos::TimeMonitor::summarize();
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/thermal-fluids/input.xml b/packages/rol/example/PDE-OPT/thermal-fluids/input.xml
index 0069cadf0749..84521b260504 100644
--- a/packages/rol/example/PDE-OPT/thermal-fluids/input.xml
+++ b/packages/rol/example/PDE-OPT/thermal-fluids/input.xml
@@ -110,6 +110,7 @@
 
   <!-- ===========  BEGIN GENERAL INPUT PARAMETER SUBLIST  =========== -->
   <ParameterList name="General">
+    <Parameter name="Output Level"                           type="int"    value="1"     />
     <Parameter name="Variable Objective Function"            type="bool"   value="false" />
     <Parameter name="Scale for Epsilon Active Sets"          type="double" value="1.0"   />
     <!-- ===========  USE INEXACT OBJECTIVE OR DERIVATIVES  =========== -->
diff --git a/packages/rol/example/PDE-OPT/thermal-fluids/mesh_thermal-fluidsK.hpp b/packages/rol/example/PDE-OPT/thermal-fluids/mesh_thermal-fluidsK.hpp
new file mode 100644
index 000000000000..e02b547ab93a
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/thermal-fluids/mesh_thermal-fluidsK.hpp
@@ -0,0 +1,127 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+#include "../TOOLS/meshmanagerK.hpp"
+
+template <class Real, class DeviceType>
+class MeshManager_ThermalFluids : public MeshManager_Rectangle<Real,DeviceType> {
+private:
+  int nx_, ny_;
+  ROL::Ptr<std::vector<std::vector<std::vector<int>>>>  meshSideSets_;
+
+public:
+
+  MeshManager_ThermalFluids(Teuchos::ParameterList &parlist) : MeshManager_Rectangle<Real,DeviceType>(parlist) {
+    nx_ = parlist.sublist("Geometry").get("NX", 3);
+    ny_ = parlist.sublist("Geometry").get("NY", 1);
+    computeSideSets();
+  }
+
+  void computeSideSets() {
+
+    int numSideSets = 8;
+    meshSideSets_ = ROL::makePtr<std::vector<std::vector<std::vector<int>>>>(numSideSets);
+
+    Real patchFrac = static_cast<Real>(1)/static_cast<Real>(3);
+    int np1 = static_cast<int>(patchFrac * static_cast<Real>(nx_));
+    int np2 = static_cast<int>(patchFrac * static_cast<Real>(nx_));
+    int np3 = nx_-(np1+np2);
+
+    // Bottom
+    (*meshSideSets_)[0].resize(4);
+    (*meshSideSets_)[0][0].resize(nx_);
+    (*meshSideSets_)[0][1].resize(0);
+    (*meshSideSets_)[0][2].resize(0);
+    (*meshSideSets_)[0][3].resize(0);
+    // Right
+    (*meshSideSets_)[1].resize(4);
+    (*meshSideSets_)[1][0].resize(0);
+    (*meshSideSets_)[1][1].resize(ny_);
+    (*meshSideSets_)[1][2].resize(0);
+    (*meshSideSets_)[1][3].resize(0);
+    // Left
+    (*meshSideSets_)[2].resize(4);
+    (*meshSideSets_)[2][0].resize(0);
+    (*meshSideSets_)[2][1].resize(0);
+    (*meshSideSets_)[2][2].resize(0);
+    (*meshSideSets_)[2][3].resize(ny_);
+    // Top
+    (*meshSideSets_)[3].resize(4);
+    (*meshSideSets_)[3][0].resize(0);
+    (*meshSideSets_)[3][1].resize(0);
+    (*meshSideSets_)[3][2].resize(nx_);
+    (*meshSideSets_)[3][3].resize(0);
+    // Thermal boundaries
+    // Top Right
+    (*meshSideSets_)[4].resize(4);
+    (*meshSideSets_)[4][0].resize(0);
+    (*meshSideSets_)[4][1].resize(0);
+    (*meshSideSets_)[4][2].resize(np3);
+    (*meshSideSets_)[4][3].resize(0);
+    // Top Center
+    (*meshSideSets_)[5].resize(4);
+    (*meshSideSets_)[5][0].resize(0);
+    (*meshSideSets_)[5][1].resize(0);
+    (*meshSideSets_)[5][2].resize(np2);
+    (*meshSideSets_)[5][3].resize(0);
+    // Top Left
+    (*meshSideSets_)[6].resize(4);
+    (*meshSideSets_)[6][0].resize(0);
+    (*meshSideSets_)[6][1].resize(0);
+    (*meshSideSets_)[6][2].resize(np1);
+    (*meshSideSets_)[6][3].resize(0);
+    // Pressure Pinning
+    (*meshSideSets_)[7].resize(4);
+    //(*meshSideSets_)[7][0].resize(1);
+    //(*meshSideSets_)[7][1].resize(1);
+    //(*meshSideSets_)[7][2].resize(1);
+    //(*meshSideSets_)[7][3].resize(1);
+    (*meshSideSets_)[7][0].resize(1);
+    (*meshSideSets_)[7][1].resize(0);
+    (*meshSideSets_)[7][2].resize(0);
+    (*meshSideSets_)[7][3].resize(0);
+    
+    // Bottom
+    for (int i=0; i<nx_; ++i)
+      (*meshSideSets_)[0][0][i] = i;
+    // Right
+    for (int i=0; i<ny_; ++i)
+      (*meshSideSets_)[1][1][i] = (i+1)*nx_-1;
+    // Left
+    for (int i=0; i<ny_; ++i)
+      (*meshSideSets_)[2][3][i] = i*nx_;
+    // Top
+    for (int i=0; i<nx_; ++i)
+      (*meshSideSets_)[3][2][i] = i + nx_*(ny_-1);
+    // Top Left
+    for (int i=0; i<np1; ++i)
+      (*meshSideSets_)[6][2][i] = i + nx_*(ny_-1);
+    // Top Center
+    for (int i=0; i<np2; ++i)
+      (*meshSideSets_)[5][2][i] = i + np1 + nx_*(ny_-1);
+    // Top Right
+    for (int i=0; i<np3; ++i)
+      (*meshSideSets_)[4][2][i] = i + (np1+np2) + nx_*(ny_-1);
+    // Pressure Pinning
+    //int offset = nx_ * (ny_/2) + (nx_-1)/2;
+    //(*meshSideSets_)[7][0][0] = offset + nx_ + 1;
+    //(*meshSideSets_)[7][1][0] = offset + nx_;
+    //(*meshSideSets_)[7][2][0] = offset;
+    //(*meshSideSets_)[7][3][0] = offset + 1;
+    (*meshSideSets_)[7][0][0] = 0;
+
+  } // computeSideSets
+
+  ROL::Ptr<std::vector<std::vector<std::vector<int>>>> getSideSets(
+              const bool verbose = false,
+              std::ostream & outStream = std::cout) const { 
+    return meshSideSets_;
+  }
+
+}; // MeshManager_ThermalFluids
diff --git a/packages/rol/example/PDE-OPT/thermal-fluids/obj_thermal-fluidsK.hpp b/packages/rol/example/PDE-OPT/thermal-fluids/obj_thermal-fluidsK.hpp
new file mode 100644
index 000000000000..dfae49f01bba
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/thermal-fluids/obj_thermal-fluidsK.hpp
@@ -0,0 +1,968 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  obj.hpp
+    \brief Provides the interface for local (cell-based) objective function computations.
+*/
+
+#ifndef PDEOPT_QOI_THERMALFLUIDSK_HPP
+#define PDEOPT_QOI_THERMALFLUIDSK_HPP
+
+#include "../TOOLS/qoiK.hpp"
+#include "pde_thermal-fluidsK.hpp"
+
+template <class Real, class DeviceType>
+class QoI_Vorticity_ThermalFluids : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type> feVel_, fePrs_, feThr_;
+  const ROL::Ptr<FieldHelper<Real,DeviceType>> fieldHelper_;
+  scalar_view weight_;
+
+  const Real eps_;
+  Real sw_, sh_, ow_;
+
+  Real weightFunc(const std::vector<Real> & x) const {
+    return (((x[1] <= sh_+eps_)&&(x[0] >= sw_-eps_)&&(x[0] <= sw_+ow_+eps_)) ?
+           static_cast<Real>(1) : static_cast<Real>(0));
+  }
+
+public:
+  QoI_Vorticity_ThermalFluids(const ROL::Ptr<fe_type> &feVel,
+                              const ROL::Ptr<fe_type> &fePrs,
+                              const ROL::Ptr<fe_type> &feThr,
+                              const ROL::Ptr<FieldHelper<Real,DeviceType>> &fieldHelper,
+                              Teuchos::ParameterList &parlist)
+    : feVel_(feVel), fePrs_(fePrs), feThr_(feThr),
+      fieldHelper_(fieldHelper), eps_(std::sqrt(ROL::ROL_EPSILON<Real>())) {
+    sw_ = parlist.sublist("Geometry").get("Step width",1.0);
+    sh_ = parlist.sublist("Geometry").get("Step height",0.5);
+    ow_ = parlist.sublist("Problem").get("Observation width",2.0);
+    int c = feVel_->cubPts().extent_int(0);
+    int p = feVel_->cubPts().extent_int(1);
+    int d = feVel_->cubPts().extent_int(2);
+    std::vector<Real> pt(d);
+    weight_ = scalar_view("weight",c,p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k)
+          pt[k] = (feVel_->cubPts())(i,j,k);
+        weight_(i,j) = weightFunc(pt);
+      }
+    }
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = u_coeff.extent_int(0);
+    int p = feVel_->cubPts().extent_int(1);
+    int d = feVel_->cubPts().extent_int(2);
+    // Initialize output val
+    val = scalar_view("val", c);
+    // Get components of the control
+    std::vector<scalar_view> U;
+    fieldHelper_->splitFieldCoeff(U, u_coeff);
+    // Evaluate on FE basis
+    std::vector<scalar_view> gradU_vec(d);
+    for (int i = 0; i < d; ++i) {
+      gradU_vec[i] = scalar_view("gradU_vec", c, p, d);
+      feVel_->evaluateGradient(gradU_vec[i], U[i]);
+    }
+    // Compute weighted curl
+    scalar_view curlU_eval;
+    if (d==2)
+      curlU_eval = scalar_view("curlU_eval", c, p);
+    else if (d==3)
+      curlU_eval = scalar_view("curlU_eval", c, p, d);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        if (d==2) {
+          curlU_eval(i,j) = weight_(i,j)*((gradU_vec[1])(i,j,0) - (gradU_vec[0])(i,j,1));
+        }
+        else if (d==3) {
+          for (int k = 0; k < d; ++k) {
+            int i1 = (k+2)%d, i2 = (k+1)%d;
+            curlU_eval(i,j,k) = weight_(i,j)*((gradU_vec[i1])(i,j,i2) - (gradU_vec[i2])(i,j,i1));
+          }
+        }
+      }
+    }
+    // Compute L2 norm squared
+    feVel_->computeIntegral(val,curlU_eval,curlU_eval,false);
+    rst::scale(val,static_cast<Real>(0.5));
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = u_coeff.extent_int(0);
+    int fv = feVel_->N().extent_int(1);
+    int fp = fePrs_->N().extent_int(1);
+    int fh = feThr_->N().extent_int(1);
+    int p = feVel_->cubPts().extent_int(1);
+    int d = feVel_->cubPts().extent_int(2);
+    // Initialize output grad
+    std::vector<scalar_view> G(d+2);
+    for (int i = 0; i < d; ++i)
+      G[i] = scalar_view("grad", c, fv);
+    G[d] = scalar_view("grad", c, fp);
+    G[d+1] = scalar_view("grad", c, fh);
+    // Get components of the control
+    std::vector<scalar_view> U;
+    fieldHelper_->splitFieldCoeff(U, u_coeff);
+    // Evaluate on FE basis
+    std::vector<scalar_view> gradU_vec(d);
+    for (int i = 0; i < d; ++i) {
+      gradU_vec[i] = scalar_view("gradU_vec", c, p, d);
+      feVel_->evaluateGradient(gradU_vec[i], U[i]);
+    }
+    // Compute weighted curl
+    int size = (d==2) ? 1 : d;
+    std::vector<scalar_view> curlU_vec(size);
+    if (d==2) {
+      curlU_vec[0] = scalar_view("curlU_vec", c, p);
+      for (int i = 0; i < c; ++i) {
+        for (int j = 0; j < p; ++j) {
+          (curlU_vec[0])(i,j) = weight_(i,j)*((gradU_vec[1])(i,j,0) - (gradU_vec[0])(i,j,1));
+        }
+      }
+    }
+    else if (d==3) {
+      for (int i = 0; i < d; ++i) {
+        curlU_vec[i] = scalar_view("curlU_vec", c, p);
+        for (int j = 0; j < c; ++j) {
+          for (int k = 0; k < p; ++k) {
+            int i1 = (i+2)%d, i2 = (i+1)%d;
+            (curlU_vec[i])(j,k) = weight_(j,k)*((gradU_vec[i1])(j,k,i2) - (gradU_vec[i2])(j,k,i1));
+          }
+        }
+      }
+    }
+    // Build local gradient of state tracking term
+    if (d==2) {
+      fst::integrate(G[0],curlU_vec[0],feVel_->DNDdetJ(1),false);
+      rst::scale(G[0],static_cast<Real>(-1));
+      fst::integrate(G[1],curlU_vec[0],feVel_->DNDdetJ(0),false);
+    }
+    else if (d==3) {
+      for (int i = 0; i < d; ++i) {
+        int i1 = (i+2)%d, i2 = (i+1)%d;
+        fst::integrate(G[i],curlU_vec[i1],feVel_->DNDdetJ(i2),false);
+        rst::scale(G[i],static_cast<Real>(-1));
+        fst::integrate(G[i],curlU_vec[i2],feVel_->DNDdetJ(i1),false);
+      }
+    }
+    fieldHelper_->combineFieldCoeff(grad, G);
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Vorticity_ThermalFluids::gradient_2 is zero.");
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c  = z_coeff.extent_int(0);
+    int p  = feVel_->cubPts().extent_int(1);
+    int fv = feVel_->N().extent_int(1);
+    int fp = fePrs_->N().extent_int(1);
+    int fh = feThr_->N().extent_int(1);
+    int d = feVel_->cubPts().extent_int(2);
+    // Initialize output grad
+    std::vector<scalar_view> H(d+2);
+    for (int i = 0; i < d; ++i)
+      H[i] = scalar_view("hess", c, fv);
+    H[d] = scalar_view("hess", c, fp);
+    H[d+1] = scalar_view("hess", c, fh);
+    // Get components of the control
+    std::vector<scalar_view> V;
+    fieldHelper_->splitFieldCoeff(V, v_coeff);
+    // Evaluate on FE basis
+    std::vector<scalar_view> gradV_vec(d);
+    for (int i = 0; i < d; ++i) {
+      gradV_vec[i] = scalar_view("gradV_vec", c, p, d);
+      feVel_->evaluateGradient(gradV_vec[i], V[i]);
+    }
+    // Compute weighted curl
+    int size = (d==2) ? 1 : d;
+    std::vector<scalar_view> curlV_vec(size);
+    if (d==2) {
+      curlV_vec[0] = scalar_view("curlV_vec", c, p);
+      for (int i = 0; i < c; ++i) {
+        for (int j = 0; j < p; ++j) {
+          (curlV_vec[0])(i,j) = weight_(i,j)*((gradV_vec[1])(i,j,0) - (gradV_vec[0])(i,j,1));
+        }
+      }
+    }
+    else if (d==3) {
+      for (int i = 0; i < d; ++i) {
+        curlV_vec[i] = scalar_view("curlV_vec", c, p);
+        for (int j = 0; j < c; ++j) {
+          for (int k = 0; k < p; ++k) {
+            int i1 = (i+2)%d, i2 = (i+1)%d;
+            (curlV_vec[i])(j,k) = weight_(j,k)*((gradV_vec[i1])(j,k,i2) - (gradV_vec[i2])(j,k,i1));
+          }
+        }
+      }
+    }
+    // Build local gradient of state tracking term
+    if (d==2) {
+      fst::integrate(H[0],curlV_vec[0],feVel_->DNDdetJ(1),false);
+      rst::scale(H[0],static_cast<Real>(-1));
+      fst::integrate(H[1],curlV_vec[0],feVel_->DNDdetJ(0),false);
+    }
+    else if (d==3) {
+      for (int i = 0; i < d; ++i) {
+        int i1 = (i+2)%d, i2 = (i+1)%d;
+        fst::integrate(H[i],curlV_vec[i1],feVel_->DNDdetJ(i2),false);
+        rst::scale(H[i],static_cast<Real>(-1));
+        fst::integrate(H[i],curlV_vec[i2],feVel_->DNDdetJ(i1),false);
+      }
+    }
+
+    fieldHelper_->combineFieldCoeff(hess, H);
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Vorticity_ThermalFluids::HessVec_12 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Vorticity_ThermalFluids::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Vorticity_ThermalFluids::HessVec_22 is zero.");
+  }
+
+}; // QoI_Vorticity_ThermalFluids
+
+template <class Real, class DeviceType>
+class QoI_Circulation_ThermalFluids : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type> feVel_, fePrs_, feThr_;
+  const ROL::Ptr<FieldHelper<Real,DeviceType>> fieldHelper_;
+  scalar_view weight_;
+
+  const Real eps_;
+
+  Real weightFunc(const std::vector<Real> & x) const {
+    return (((x[1] <= 1.0+eps_)&&(x[0] >= 1.0-eps_)&&(x[0] <= 3.0+eps_)) ?
+                static_cast<Real>(1) : static_cast<Real>(0));
+  }
+
+public:
+  QoI_Circulation_ThermalFluids(const ROL::Ptr<fe_type> &feVel,
+                                const ROL::Ptr<fe_type> &fePrs,
+                                const ROL::Ptr<fe_type> &feThr,
+                                const ROL::Ptr<FieldHelper<Real,DeviceType>> &fieldHelper)
+    : feVel_(feVel), fePrs_(fePrs), feThr_(feThr),
+      fieldHelper_(fieldHelper), eps_(std::sqrt(ROL::ROL_EPSILON<Real>())) {
+    int c = feVel_->cubPts().extent_int(0);
+    int p = feVel_->cubPts().extent_int(1);
+    int d = feVel_->cubPts().extent_int(2);
+    std::vector<Real> pt(d);
+    weight_ = scalar_view("weight",c,p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k)
+          pt[k] = (feVel_->cubPts())(i,j,k);
+        weight_(i,j) = weightFunc(pt);
+      }
+    }
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = u_coeff.extent_int(0);
+    int p = feVel_->cubPts().extent_int(1);
+    int d = feVel_->cubPts().extent_int(2);
+    // Initialize output val
+    val = scalar_view("val", c);
+    // Get components of the control
+    std::vector<scalar_view> U;
+    fieldHelper_->splitFieldCoeff(U, u_coeff);
+    // Evaluate on FE basis
+    scalar_view gradUX_eval("gradUX_eval", c, p, d);
+    scalar_view gradUY_eval("gradUY_eval", c, p, d);
+    feVel_->evaluateGradient(gradUX_eval, U[0]);
+    feVel_->evaluateGradient(gradUY_eval, U[1]);
+    // Compute curl
+    scalar_view curlU_eval("curlU_eval", c, p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        curlU_eval(i,j) = (gradUY_eval)(i,j,0) - (gradUX_eval)(i,j,1);
+      }
+    }
+    // Compute circulation
+    feVel_->computeIntegral(val,curlU_eval,weight_,false);
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = u_coeff.extent_int(0);
+    int fv = feVel_->N().extent_int(1);
+    int fp = fePrs_->N().extent_int(1);
+    int fh = feThr_->N().extent_int(1);
+    int d = feVel_->cubPts().extent_int(2);
+    // Initialize output grad
+    std::vector<scalar_view> G(d+2);
+    for (int i = 0; i < d; ++i)
+      G[i] = scalar_view("grad", c, fv);
+    G[d] = scalar_view("grad", c, fp);
+    G[d+1] = scalar_view("grad", c, fh);
+    // Build local gradient of state tracking term
+    fst::integrate(G[0],weight_,feVel_->DNDdetJ(1),false);
+    rst::scale(G[0],static_cast<Real>(-1));
+    fst::integrate(G[1],weight_,feVel_->DNDdetJ(0),false);
+
+    fieldHelper_->combineFieldCoeff(grad, G);
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Circulation_ThermalFluids::gradient_2 is zero.");
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Circulation_ThermalFluids::HessVec_11 is zero.");
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Circulation_ThermalFluids::HessVec_12 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Circulation_ThermalFluids::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Circulation_ThermalFluids::HessVec_22 is zero.");
+  }
+
+}; // QoI_Circulation_ThermalFluids
+
+template <class Real, class DeviceType>
+class QoI_Horizontal_ThermalFluids : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type> feVel_, fePrs_, feThr_;
+  const ROL::Ptr<FieldHelper<Real,DeviceType>> fieldHelper_;
+  scalar_view weight_;
+
+  const Real eps_;
+
+  Real weightFunc(const std::vector<Real> & x) const {
+    return (((x[1] <= 0.5+eps_)&&(x[0] >= 1.0-eps_)&&(x[0] <= 3.0+eps_)) ?
+                static_cast<Real>(1) : static_cast<Real>(0));
+  }
+
+public:
+  QoI_Horizontal_ThermalFluids(const ROL::Ptr<fe_type> &feVel,
+                               const ROL::Ptr<fe_type> &fePrs,
+                               const ROL::Ptr<fe_type> &feThr,
+                               const ROL::Ptr<FieldHelper<Real,DeviceType>> &fieldHelper)
+    : feVel_(feVel), fePrs_(fePrs), feThr_(feThr),
+      fieldHelper_(fieldHelper), eps_(std::sqrt(ROL::ROL_EPSILON<Real>())) {
+    int c = feVel_->cubPts().extent_int(0);
+    int p = feVel_->cubPts().extent_int(1);
+    int d = feVel_->cubPts().extent_int(2);
+    std::vector<Real> pt(d);
+    weight_ = scalar_view("weight",c,p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k)
+          pt[k] = (feVel_->cubPts())(i,j,k);
+        weight_(i,j) = weightFunc(pt);
+      }
+    }
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = u_coeff.extent_int(0);
+    int p = feVel_->cubPts().extent_int(1);
+    // Initialize output val
+    val = scalar_view("val",c);
+    // Get components of the control
+    std::vector<scalar_view> U;
+    fieldHelper_->splitFieldCoeff(U, u_coeff);
+    // Evaluate on FE basis
+    scalar_view valUX_eval("valUX_eval", c, p);
+    scalar_view valUY_eval("valUY_eval", c, p);
+    feVel_->evaluateValue(valUX_eval, U[0]);
+    feVel_->evaluateValue(valUY_eval, U[1]);
+    // Compute negative part of x-velocity
+    scalar_view minUX_eval("minUX_eval", c, p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        minUX_eval(i,j) = std::min(static_cast<Real>(0),valUX_eval(i,j));
+      }
+    }
+    // Multiply by weight
+    scalar_view weighted_minUX_eval("weighted_minUX_eval", c, p);
+    fst::scalarMultiplyDataData(weighted_minUX_eval,weight_,minUX_eval);
+    scalar_view weighted_valUY_eval("weighted_valUY_eval", c, p);
+    fst::scalarMultiplyDataData(weighted_valUY_eval,weight_,valUY_eval);
+    // Compute L2 norm squared
+    feVel_->computeIntegral(val,minUX_eval,weighted_minUX_eval,false);
+    feVel_->computeIntegral(val,valUY_eval,weighted_valUY_eval,true);
+
+    rst::scale(val,static_cast<Real>(0.5));
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = u_coeff.extent_int(0);
+    int fv = feVel_->N().extent_int(1);
+    int fp = fePrs_->N().extent_int(1);
+    int fh = feThr_->N().extent_int(1);
+    int p = feVel_->cubPts().extent_int(1);
+    int d = feVel_->cubPts().extent_int(2);
+    // Initialize output grad
+    std::vector<scalar_view> G(d+2);
+    for (int i = 0; i < d; ++i)
+      G[i] = scalar_view("grad", c, fv);
+    G[d] = scalar_view("grad", c, fp);
+    G[d+1] = scalar_view("grad", c, fh);
+    // Get components of the control
+    std::vector<scalar_view> U;
+    fieldHelper_->splitFieldCoeff(U, u_coeff);
+    // Evaluate on FE basis
+    scalar_view valUX_eval("valUX_eval", c, p);
+    scalar_view valUY_eval("valUY_eval", c, p);
+    feVel_->evaluateValue(valUX_eval, U[0]);
+    feVel_->evaluateValue(valUY_eval, U[1]);
+    // Compute negative part of x-velocity
+    scalar_view weighted_minUX_eval("weighted_minUX_eval", c, p);
+    scalar_view weighted_valUY_eval("weighted_valUY_eval", c, p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        weighted_minUX_eval(i,j) = weight_(i,j) * std::min(static_cast<Real>(0),valUX_eval(i,j));
+        weighted_valUY_eval(i,j) = weight_(i,j) * valUY_eval(i,j);
+      }
+    }
+    // Build local gradient of state tracking term
+    fst::integrate(G[0],weighted_minUX_eval,feVel_->NdetJ(),false);
+    fst::integrate(G[1],weighted_valUY_eval,feVel_->NdetJ(),false);
+
+    fieldHelper_->combineFieldCoeff(grad, G);
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Horizontal_ThermalFluids::gradient_2 is zero.");
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c  = z_coeff.extent_int(0);
+    int p  = feVel_->cubPts().extent_int(1);
+    int fv = feVel_->N().extent_int(1);
+    int fp = fePrs_->N().extent_int(1);
+    int fh = feThr_->N().extent_int(1);
+    int d = feVel_->cubPts().extent_int(2);
+    // Initialize output grad
+    std::vector<scalar_view> H(d+2);
+    for (int i = 0; i < d; ++i)
+      H[i] = scalar_view("hess", c, fv);
+    H[d] = scalar_view("hess", c, fp);
+    H[d+1] = scalar_view("hess", c, fh);
+    // Get components of the control
+    std::vector<scalar_view> U;
+    fieldHelper_->splitFieldCoeff(U, u_coeff);
+    std::vector<scalar_view> V;
+    fieldHelper_->splitFieldCoeff(V, v_coeff);
+    // Evaluate on FE basis
+    scalar_view valUX_eval("valUX_eval", c, p);
+    scalar_view valVX_eval("valVX_eval", c, p);
+    scalar_view valVY_eval("valVY_eval", c, p);
+    feVel_->evaluateValue(valUX_eval, U[0]);
+    feVel_->evaluateValue(valVX_eval, V[0]);
+    feVel_->evaluateValue(valVY_eval, V[1]);
+    // Compute negative part of x-velocity
+    scalar_view weighted_minVX_eval("weighted_V_eval", c, p);
+    scalar_view weighted_valVY_eval("weighted_V_eval", c, p);
+    const Real zero(0), one(1);
+    Real scale(0), uij(0);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        uij = valUX_eval(i,j);
+        scale = (uij < zero ? one : (uij > zero ? zero : one));
+        weighted_minVX_eval(i,j) = scale * weight_(i,j) * valVX_eval(i,j);
+        weighted_valVY_eval(i,j) = weight_(i,j) * valVY_eval(i,j);
+      }
+    }
+    // Build local gradient of state tracking term
+    fst::integrate(H[0],weighted_minVX_eval,feVel_->NdetJ(),false);
+    fst::integrate(H[1],weighted_valVY_eval,feVel_->NdetJ(),false);
+
+    fieldHelper_->combineFieldCoeff(hess, H);
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Horizontal_ThermalFluids::HessVec_12 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Horizontal_ThermalFluids::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Horizontal_ThermalFluids::HessVec_22 is zero.");
+  }
+
+}; // QoI_Horizontal_ThermalFluids
+
+template <class Real, class DeviceType>
+class QoI_State_ThermalFluids : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  ROL::Ptr<QoI<Real,DeviceType>> qoi_;
+
+public:
+  QoI_State_ThermalFluids(Teuchos::ParameterList &parlist,
+                         const ROL::Ptr<fe_type> &feVel,
+                         const ROL::Ptr<fe_type> &fePrs,
+                         const ROL::Ptr<fe_type> &feThr,
+                         const ROL::Ptr<FieldHelper<Real,DeviceType>> &fieldHelper) {
+    std::string stateObj = parlist.sublist("Problem").get("Objective type","Vorticity");
+    if ( stateObj != "Vorticity" && stateObj != "Circulation" && stateObj != "Directional" )
+      throw Exception::NotImplemented(">>> (QoI_State_ThermalFluids): Unknown objective type."); 
+    if ( stateObj == "Vorticity" )
+      qoi_ = ROL::makePtr<QoI_Vorticity_ThermalFluids<Real,DeviceType>>(feVel,fePrs,feThr,fieldHelper,parlist);
+    else if ( stateObj == "Directional" )
+      qoi_ = ROL::makePtr<QoI_Horizontal_ThermalFluids<Real,DeviceType>>(feVel,fePrs,feThr,fieldHelper);
+    else
+      qoi_ = ROL::makePtr<QoI_Circulation_ThermalFluids<Real,DeviceType>>(feVel,fePrs,feThr,fieldHelper);
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    return qoi_->value(val, u_coeff, z_coeff, z_param);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    qoi_->gradient_1(grad, u_coeff, z_coeff, z_param);
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    qoi_->gradient_2(grad, u_coeff, z_coeff, z_param);
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    qoi_->HessVec_11(hess, v_coeff, u_coeff, z_coeff, z_param);
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    qoi_->HessVec_12(hess, v_coeff, u_coeff, z_coeff, z_param);
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    qoi_->HessVec_21(hess, v_coeff, u_coeff, z_coeff, z_param);
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    qoi_->HessVec_22(hess, v_coeff, u_coeff, z_coeff, z_param);
+  }
+
+};
+
+template <class Real, class DeviceType>
+class QoI_L2Penalty_ThermalFluids : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type> feVel_, fePrs_, feThr_;
+  const std::vector<std::vector<ROL::Ptr<fe_type>>> feThrBdry_;
+  const std::vector<std::vector<std::vector<int>>> bdryCellLocIds_;
+  const ROL::Ptr<FieldHelper<Real,DeviceType>> fieldHelper_;
+
+  scalar_view getBoundaryCoeff(const scalar_view cell_coeff, int sideSet, int cell) const {
+    std::vector<int> bdryCellLocId = bdryCellLocIds_[sideSet][cell];
+    const int numCellsSide = bdryCellLocId.size();
+    const int f = feThr_->N().extent_int(1);
+    
+    scalar_view bdry_coeff("bdry_coeff", numCellsSide, f);
+    for (int i = 0; i < numCellsSide; ++i) {
+      for (int j = 0; j < f; ++j)
+        bdry_coeff(i, j) = cell_coeff(bdryCellLocId[i], j);
+    }
+    return bdry_coeff;
+  }
+
+public:
+  QoI_L2Penalty_ThermalFluids(const ROL::Ptr<fe_type> &feVel,
+                              const ROL::Ptr<fe_type> &fePrs,
+                              const ROL::Ptr<fe_type> &feThr,
+                              const std::vector<std::vector<ROL::Ptr<fe_type>>> &feThrBdry,
+                              const std::vector<std::vector<std::vector<int>>> &bdryCellLocIds,
+                              const ROL::Ptr<FieldHelper<Real,DeviceType>> &fieldHelper)
+    : feVel_(feVel), fePrs_(fePrs), feThr_(feThr), feThrBdry_(feThrBdry),
+      bdryCellLocIds_(bdryCellLocIds), fieldHelper_(fieldHelper) {}
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    const int c = feVel_->gradN().extent_int(0);
+    const int d = feVel_->gradN().extent_int(3);
+    // Get components of the control
+    std::vector<scalar_view> Z;
+    fieldHelper_->splitFieldCoeff(Z, z_coeff);
+    // Initialize output val
+    val = scalar_view("val", c);
+    // Compute cost integral
+    const int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        if ( i == 0 || i == 3 ) {
+          const int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            const int numCellsSide = bdryCellLocIds_[i][j].size();
+            if ( numCellsSide ) {
+              const int numCubPerSide = feThrBdry_[i][j]->cubPts().extent_int(1);
+              // Evaluate control on FE basis
+              scalar_view z_coeff_bdry = getBoundaryCoeff(Z[d+1], i, j);
+              scalar_view valZ_eval("valZ_eval", numCellsSide, numCubPerSide);
+              feThrBdry_[i][j]->evaluateValue(valZ_eval, z_coeff_bdry);
+              // Integrate cell L2 norm squared
+              scalar_view intVal("intVal", numCellsSide);
+              feThrBdry_[i][j]->computeIntegral(intVal,valZ_eval,valZ_eval,false);
+              // Add to integral value
+              for (int k = 0; k < numCellsSide; ++k) {
+                int cidx = bdryCellLocIds_[i][j][k];
+                val(cidx) += static_cast<Real>(0.5)*intVal(k);
+              }
+            }
+          }
+        }
+      }
+    }
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_L2Penalty_ThermalFluids::gradient_1 is zero.");
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fv = feVel_->gradN().extent_int(1);
+    const int fp = fePrs_->gradN().extent_int(1);
+    const int fh = feThr_->gradN().extent_int(1);
+    const int d  = feThr_->gradN().extent_int(3);
+    // Initialize output grad
+    std::vector<scalar_view> G(d+2);
+    for (int i = 0; i < d; ++i)
+      G[i] = scalar_view("grad", c, fv);
+    G[d] = scalar_view("grad", c, fp);
+    G[d+1] = scalar_view("grad", c, fh);
+    // Get components of the control
+    std::vector<scalar_view> Z;
+    fieldHelper_->splitFieldCoeff(Z, z_coeff);
+    // Compute cost integral
+    const int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        if ( i == 0 || i == 3 ) {
+          const int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            const int numCellsSide = bdryCellLocIds_[i][j].size();
+            if ( numCellsSide ) {
+              const int numCubPerSide = feThrBdry_[i][j]->cubPts().extent_int(1);
+              // Evaluate control on FE basis
+              scalar_view z_coeff_bdry = getBoundaryCoeff(Z[d+1], i, j);
+              scalar_view valZ_eval("valZ_eval", numCellsSide, numCubPerSide);
+              feThrBdry_[i][j]->evaluateValue(valZ_eval, z_coeff_bdry);
+              // Compute gradient of squared L2-norm of diff
+              scalar_view intGrad("intGrad", numCellsSide, fh);
+              fst::integrate(intGrad,valZ_eval,feThrBdry_[i][j]->NdetJ(),false);
+              // Add to integral value
+              for (int k = 0; k < numCellsSide; ++k) {
+                int cidx = bdryCellLocIds_[i][j][k];
+                for (int l = 0; l < fh; ++l)
+                  (G[d+1])(cidx,l) += intGrad(k,l);
+              }
+            }
+          }
+        }
+      }
+    }
+
+    fieldHelper_->combineFieldCoeff(grad, G);
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_L2Penalty_ThermalFluids::HessVec_11 is zero.");
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_L2Penalty_ThermalFluids::HessVec_12 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_L2Penalty_ThermalFluids::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fv = feVel_->gradN().extent_int(1);
+    const int fp = fePrs_->gradN().extent_int(1);
+    const int fh = feThr_->gradN().extent_int(1);
+    const int d  = feThr_->gradN().extent_int(3);
+    // Initialize output grad
+    std::vector<scalar_view> H(d+2);
+    for (int i = 0; i < d; ++i)
+      H[i] = scalar_view("hess", c, fv);
+    H[d] = scalar_view("hess", c, fp);
+    H[d+1] = scalar_view("hess", c, fh);
+    // Get components of the control
+    std::vector<scalar_view> V;
+    fieldHelper_->splitFieldCoeff(V, v_coeff);
+    // Compute cost integral
+    const int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        if ( i == 0 || i == 3 ) {
+          const int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            const int numCellsSide = bdryCellLocIds_[i][j].size();
+            if ( numCellsSide ) {
+              const int numCubPerSide = feThrBdry_[i][j]->cubPts().extent_int(1);
+              // Evaluate control on FE basis
+              scalar_view v_coeff_bdry = getBoundaryCoeff(V[d+1], i, j);
+              scalar_view valV_eval("valV_eval", numCellsSide, numCubPerSide);
+              feThrBdry_[i][j]->evaluateValue(valV_eval, v_coeff_bdry);
+              // Compute gradient of squared L2-norm of diff
+              scalar_view intHess("intHess", numCellsSide, fh);
+              fst::integrate(intHess,valV_eval,feThrBdry_[i][j]->NdetJ(),false);
+              // Add to integral value
+              for (int k = 0; k < numCellsSide; ++k) {
+                int cidx = bdryCellLocIds_[i][j][k];
+                for (int l = 0; l < fh; ++l)
+                  (H[d+1])(cidx,l) += intHess(k,l);
+              }
+            }
+          }
+        }
+      }
+    }
+
+    fieldHelper_->combineFieldCoeff(hess, H);
+  }
+
+}; // QoI_L2Penalty_ThermalFluids
+
+template <class Real>
+class StdObjective_ThermalFluids : public ROL::StdObjective<Real> {
+private:
+  Real alpha_;
+  std::string stateObj_;
+
+public:
+  StdObjective_ThermalFluids(Teuchos::ParameterList &parlist) {
+    alpha_    = parlist.sublist("Problem").get("Control penalty parameter",1.e-4);
+    stateObj_ = parlist.sublist("Problem").get("Objective type","Vorticity");
+    if ( stateObj_ != "Vorticity" && stateObj_ != "Circulation" && stateObj_ != "Directional") {
+      throw Exception::NotImplemented(">>> (StdObjective_ThermalFluids): Unknown objective type."); 
+    }
+  }
+
+  Real value(const std::vector<Real> &x, Real &tol) {
+    Real val = alpha_*x[1];
+    if ( stateObj_ == "Vorticity" || stateObj_ == "Directional" ) {
+      val += x[0];
+    }
+    else {
+      val += static_cast<Real>(0.5)*x[0]*x[0];
+    }
+    return val;
+  }
+
+  void gradient(std::vector<Real> &g, const std::vector<Real> &x, Real &tol) {
+    const Real one(1);
+    if ( stateObj_ == "Vorticity" || stateObj_ == "Directional" ) {
+      g[0] = one;
+    }
+    else {
+      g[0] = x[0];
+    }
+    g[1] = alpha_;
+  }
+
+  void hessVec(std::vector<Real> &hv, const std::vector<Real> &v, const std::vector<Real> &x, Real &tol) {
+    const Real zero(0);
+    if ( stateObj_ == "Vorticity" || stateObj_ == "Directional" ) {
+      hv[0] = zero;
+    }
+    else {
+      hv[0] = v[0];
+    }
+    hv[1] = zero;
+  }
+
+}; // OBJ_SCALAR
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/thermal-fluids/obj_thermal-fluids_ex03K.hpp b/packages/rol/example/PDE-OPT/thermal-fluids/obj_thermal-fluids_ex03K.hpp
new file mode 100644
index 000000000000..b298bc3ad114
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/thermal-fluids/obj_thermal-fluids_ex03K.hpp
@@ -0,0 +1,952 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  obj.hpp
+    \brief Provides the interface for local (cell-based) objective function computations.
+*/
+
+#ifndef PDEOPT_QOI_THERMALFLUIDSK_HPP
+#define PDEOPT_QOI_THERMALFLUIDSK_HPP
+
+#include "../TOOLS/qoiK.hpp"
+#include "pde_thermal-fluids_ex03K.hpp"
+
+template <class Real, class DeviceType>
+class QoI_Vorticity_ThermalFluids : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type> feVel_, fePrs_, feThr_;
+  const ROL::Ptr<FieldHelper<Real,DeviceType>> fieldHelper_;
+  scalar_view weight_;
+
+  Real weightFunc(const std::vector<Real> & x) const {
+    return static_cast<Real>(1);
+  }
+
+public:
+  QoI_Vorticity_ThermalFluids(const ROL::Ptr<fe_type> &feVel,
+                              const ROL::Ptr<fe_type> &fePrs,
+                              const ROL::Ptr<fe_type> &feThr,
+                              const ROL::Ptr<FieldHelper<Real,DeviceType>> &fieldHelper,
+                              ROL::ParameterList &parlist)
+    : feVel_(feVel), fePrs_(fePrs), feThr_(feThr), fieldHelper_(fieldHelper) {
+    int c = feVel_->cubPts().extent_int(0);
+    int p = feVel_->cubPts().extent_int(1);
+    int d = feVel_->cubPts().extent_int(2);
+    std::vector<Real> pt(d);
+    weight_ = scalar_view("weight",c,p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k)
+          pt[k] = (feVel_->cubPts())(i,j,k);
+        weight_(i,j) = weightFunc(pt);
+      }
+    }
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = u_coeff.extent_int(0);
+    int p = feVel_->cubPts().extent_int(1);
+    int d = feVel_->cubPts().extent_int(2);
+    // Initialize output val
+    val = scalar_view("val", c);
+    // Get components of the control
+    std::vector<scalar_view> U;
+    fieldHelper_->splitFieldCoeff(U, u_coeff);
+    // Evaluate on FE basis
+    std::vector<scalar_view> gradU_vec(d);
+    for (int i = 0; i < d; ++i) {
+      gradU_vec[i] = scalar_view("gradU_vec", c, p, d);
+      feVel_->evaluateGradient(gradU_vec[i], U[i]);
+    }
+    // Compute weighted curl
+    scalar_view curlU_eval;
+    if (d==2)
+      curlU_eval = scalar_view("curlU_eval", c, p);
+    else if (d==3)
+      curlU_eval = scalar_view("curlU_eval", c, p, d);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        if (d==2) {
+          curlU_eval(i,j) = weight_(i,j)*((gradU_vec[1])(i,j,0)-(gradU_vec[0])(i,j,1));
+        }
+        else if (d==3) {
+          for (int k = 0; k < d; ++k) {
+            int i1 = (k+2)%d, i2 = (k+1)%d;
+            curlU_eval(i,j,k) = weight_(i,j)*((gradU_vec[i1])(i,j,i2)-(gradU_vec[i2])(i,j,i1));
+          }
+        }
+      }
+    }
+    // Compute L2 norm squared
+    feVel_->computeIntegral(val,curlU_eval,curlU_eval,false);
+    rst::scale(val,static_cast<Real>(0.5));
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = u_coeff.extent_int(0);
+    int fv = feVel_->N().extent_int(1);
+    int fp = fePrs_->N().extent_int(1);
+    int fh = feThr_->N().extent_int(1);
+    int p = feVel_->cubPts().extent_int(1);
+    int d = feVel_->cubPts().extent_int(2);
+    // Initialize output grad
+    std::vector<scalar_view> G(d+2);
+    for (int i = 0; i < d; ++i)
+      G[i] = scalar_view("grad", c, fv);
+    G[d] = scalar_view("grad", c, fp);
+    G[d+1] = scalar_view("grad", c, fh);
+    // Get components of the control
+    std::vector<scalar_view> U;
+    fieldHelper_->splitFieldCoeff(U, u_coeff);
+    // Evaluate on FE basis
+    std::vector<scalar_view> gradU_vec(d);
+    for (int i = 0; i < d; ++i) {
+      gradU_vec[i] = scalar_view("gradU_vec", c, p, d);
+      feVel_->evaluateGradient(gradU_vec[i], U[i]);
+    }
+    // Compute weighted curl
+    int size = (d==2) ? 1 : d;
+    std::vector<scalar_view> curlU_vec(size);
+    if (d==2) {
+      curlU_vec[0] = scalar_view("curlU_vec", c, p);
+      for (int i = 0; i < c; ++i) {
+        for (int j = 0; j < p; ++j) {
+          (curlU_vec[0])(i,j) = weight_(i,j)*((gradU_vec[1])(i,j,0)-(gradU_vec[0])(i,j,1));
+        }
+      }
+    }
+    else if (d==3) {
+      for (int i = 0; i < d; ++i) {
+        curlU_vec[i] = scalar_view("curlU_vec", c, p);
+        for (int j = 0; j < c; ++j) {
+          for (int k = 0; k < p; ++k) {
+            int i1 = (i+2)%d, i2 = (i+1)%d;
+            (curlU_vec[i])(j,k) = weight_(j,k)*((gradU_vec[i1])(j,k,i2)-(gradU_vec[i2])(j,k,i1));
+          }
+        }
+      }
+    }
+    // Build local gradient of state tracking term
+    if (d==2) {
+      fst::integrate(G[0],curlU_vec[0],feVel_->DNDdetJ(1),false);
+      rst::scale(G[0],static_cast<Real>(-1));
+      fst::integrate(G[1],curlU_vec[0],feVel_->DNDdetJ(0),false);
+    }
+    else if (d==3) {
+      for (int i = 0; i < d; ++i) {
+        int i1 = (i+2)%d, i2 = (i+1)%d;
+        fst::integrate(G[i],curlU_vec[i1],feVel_->DNDdetJ(i2),false);
+        rst::scale(G[i],static_cast<Real>(-1));
+        fst::integrate(G[i],curlU_vec[i2],feVel_->DNDdetJ(i1),true);
+      }
+    }
+    fieldHelper_->combineFieldCoeff(grad, G);
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Vorticity_ThermalFluids::gradient_2 is zero.");
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c  = z_coeff.extent_int(0);
+    int p  = feVel_->cubPts().extent_int(1);
+    int fv = feVel_->N().extent_int(1);
+    int fp = fePrs_->N().extent_int(1);
+    int fh = feThr_->N().extent_int(1);
+    int d = feVel_->cubPts().extent_int(2);
+    // Initialize output grad
+    std::vector<scalar_view> H(d+2);
+    for (int i = 0; i < d; ++i)
+      H[i] = scalar_view("hess", c, fv);
+    H[d] = scalar_view("hess", c, fp);
+    H[d+1] = scalar_view("hess", c, fh);
+    // Get components of the control
+    std::vector<scalar_view> V;
+    fieldHelper_->splitFieldCoeff(V, v_coeff);
+    // Evaluate on FE basis
+    std::vector<scalar_view> gradV_vec(d);
+    for (int i = 0; i < d; ++i) {
+      gradV_vec[i] = scalar_view("gradV_vec", c, p, d);
+      feVel_->evaluateGradient(gradV_vec[i], V[i]);
+    }
+    // Compute weighted curl
+    int size = (d==2) ? 1 : d;
+    std::vector<scalar_view> curlV_vec(size);
+    if (d==2) {
+      curlV_vec[0] = scalar_view("curlV_vec", c, p);
+      for (int i = 0; i < c; ++i) {
+        for (int j = 0; j < p; ++j) {
+          (curlV_vec[0])(i,j) = weight_(i,j)*((gradV_vec[1])(i,j,0)-(gradV_vec[0])(i,j,1));
+        }
+      }
+    }
+    else if (d==3) {
+      for (int i = 0; i < d; ++i) {
+        curlV_vec[i] = scalar_view("curlV_vec", c, p);
+        for (int j = 0; j < c; ++j) {
+          for (int k = 0; k < p; ++k) {
+            int i1 = (i+2)%d, i2 = (i+1)%d;
+            (curlV_vec[i])(j,k) = weight_(j,k)*((gradV_vec[i1])(j,k,i2)-(gradV_vec[i2])(j,k,i1));
+          }
+        }
+      }
+    }
+    // Build local gradient of state tracking term
+    if (d==2) {
+      fst::integrate(H[0],curlV_vec[0],feVel_->DNDdetJ(1),false);
+      rst::scale(H[0],static_cast<Real>(-1));
+      fst::integrate(H[1],curlV_vec[0],feVel_->DNDdetJ(0),false);
+    }
+    else if (d==3) {
+      for (int i = 0; i < d; ++i) {
+        int i1 = (i+2)%d, i2 = (i+1)%d;
+        fst::integrate(H[i],curlV_vec[i1],feVel_->DNDdetJ(i2),false);
+        rst::scale(H[i],static_cast<Real>(-1));
+        fst::integrate(H[i],curlV_vec[i2],feVel_->DNDdetJ(i1),true);
+      }
+    }
+
+    fieldHelper_->combineFieldCoeff(hess, H);
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Vorticity_ThermalFluids::HessVec_12 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Vorticity_ThermalFluids::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Vorticity_ThermalFluids::HessVec_22 is zero.");
+  }
+
+}; // QoI_Vorticity_ThermalFluids
+
+template <class Real, class DeviceType>
+class QoI_Circulation_ThermalFluids : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type> feVel_, fePrs_, feThr_;
+  const ROL::Ptr<FieldHelper<Real,DeviceType>> fieldHelper_;
+  scalar_view weight_;
+
+  Real weightFunc(const std::vector<Real> & x) const {
+    return static_cast<Real>(1);
+  }
+
+public:
+  QoI_Circulation_ThermalFluids(const ROL::Ptr<fe_type> &feVel,
+                                const ROL::Ptr<fe_type> &fePrs,
+                                const ROL::Ptr<fe_type> &feThr,
+                                const ROL::Ptr<FieldHelper<Real,DeviceType>> &fieldHelper)
+    : feVel_(feVel), fePrs_(fePrs), feThr_(feThr), fieldHelper_(fieldHelper) {
+    int c = feVel_->cubPts().extent_int(0);
+    int p = feVel_->cubPts().extent_int(1);
+    int d = feVel_->cubPts().extent_int(2);
+    std::vector<Real> pt(d);
+    weight_ = scalar_view("weight", c,p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k)
+          pt[k] = (feVel_->cubPts())(i,j,k);
+        weight_(i,j) = weightFunc(pt);
+      }
+    }
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = u_coeff.extent_int(0);
+    int p = feVel_->cubPts().extent_int(1);
+    int d = feVel_->cubPts().extent_int(2);
+    // Initialize output val
+    val = scalar_view("val", c);
+    // Get components of the control
+    std::vector<scalar_view> U;
+    fieldHelper_->splitFieldCoeff(U, u_coeff);
+    // Evaluate on FE basis
+    scalar_view gradUX_eval("gradUX_eval", c, p, d);
+    scalar_view gradUY_eval("gradUY_eval", p, d);
+    feVel_->evaluateGradient(gradUX_eval, U[0]);
+    feVel_->evaluateGradient(gradUY_eval, U[1]);
+    // Compute curl
+    scalar_view curlU_eval("curlU_eval", c, p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        curlU_eval(i,j) = gradUY_eval(i,j,0) - gradUX_eval(i,j,1);
+      }
+    }
+    // Compute circulation
+    feVel_->computeIntegral(val,curlU_eval,weight_,false);
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = u_coeff.extent_int(0);
+    int fv = feVel_->N().extent_int(1);
+    int fp = fePrs_->N().extent_int(1);
+    int fh = feThr_->N().extent_int(1);
+    int d = feVel_->cubPts().extent_int(2);
+    // Initialize output grad
+    std::vector<scalar_view> G(d+2);
+    for (int i = 0; i < d; ++i)
+      G[i] = scalar_view("grad", c, fv);
+    G[d] = scalar_view("grad", c, fp);
+    G[d+1] = scalar_view("grad", c, fh);
+    // Build local gradient of state tracking term
+    fst::integrate(G[0],weight_,feVel_->DNDdetJ(1),false);
+    rst::scale(G[0],static_cast<Real>(-1));
+    fst::integrate(G[1],weight_,feVel_->DNDdetJ(0),false);
+
+    fieldHelper_->combineFieldCoeff(grad, G);
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Circulation_ThermalFluids::gradient_2 is zero.");
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Circulation_ThermalFluids::HessVec_11 is zero.");
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Circulation_ThermalFluids::HessVec_12 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Circulation_ThermalFluids::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Circulation_ThermalFluids::HessVec_22 is zero.");
+  }
+
+}; // QoI_Circulation_ThermalFluids
+
+template <class Real, class DeviceType>
+class QoI_Horizontal_ThermalFluids : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type> feVel_, fePrs_, feThr_;
+  const ROL::Ptr<FieldHelper<Real,DeviceType>> fieldHelper_;
+  scalar_view weight_;
+
+  Real weightFunc(const std::vector<Real> & x) const {
+    return static_cast<Real>(1);
+  }
+
+public:
+  QoI_Horizontal_ThermalFluids(const ROL::Ptr<fe_type> &feVel,
+                               const ROL::Ptr<fe_type> &fePrs,
+                               const ROL::Ptr<fe_type> &feThr,
+                               const ROL::Ptr<FieldHelper<Real,DeviceType>> &fieldHelper)
+    : feVel_(feVel), fePrs_(fePrs), feThr_(feThr), fieldHelper_(fieldHelper) {
+    int c = feVel_->cubPts().extent_int(0);
+    int p = feVel_->cubPts().extent_int(1);
+    int d = feVel_->cubPts().extent_int(2);
+    std::vector<Real> pt(d);
+    weight_ = scalar_view("weight",c,p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k)
+          pt[k] = (feVel_->cubPts())(i,j,k);
+        weight_(i,j) = weightFunc(pt);
+      }
+    }
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = u_coeff.extent_int(0);
+    int p = feVel_->cubPts().extent_int(1);
+    // Initialize output val
+    val = scalar_view("val", c);
+    // Get components of the control
+    std::vector<scalar_view> U;
+    fieldHelper_->splitFieldCoeff(U, u_coeff);
+    // Evaluate on FE basis
+    scalar_view valUX_eval("valUX_eval", c, p);
+    scalar_view valUY_eval("valUY_eval", c, p);
+    feVel_->evaluateValue(valUX_eval, U[0]);
+    feVel_->evaluateValue(valUY_eval, U[1]);
+    // Compute negative part of x-velocity
+    scalar_view minUX_eval("minUX_eval", c, p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        minUX_eval(i,j) = std::min(static_cast<Real>(0),valUX_eval(i,j));
+      }
+    }
+    // Multiply by weight
+    scalar_view weighted_minUX_eval("minUX_eval", c, p);
+    fst::scalarMultiplyDataData(weighted_minUX_eval,weight_,minUX_eval);
+    scalar_view weighted_valUY_eval("valUY_eval", c, p);
+    fst::scalarMultiplyDataData(weighted_valUY_eval,weight_,valUY_eval);
+    // Compute L2 norm squared
+    feVel_->computeIntegral(val,minUX_eval,weighted_minUX_eval,false);
+    feVel_->computeIntegral(val,valUY_eval,weighted_valUY_eval,true);
+
+    rst::scale(val,static_cast<Real>(0.5));
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = u_coeff.extent_int(0);
+    int fv = feVel_->N().extent_int(1);
+    int fp = fePrs_->N().extent_int(1);
+    int fh = feThr_->N().extent_int(1);
+    int p = feVel_->cubPts().extent_int(1);
+    int d = feVel_->cubPts().extent_int(2);
+    // Initialize output grad
+    std::vector<scalar_view> G(d+2);
+    for (int i = 0; i < d; ++i)
+      G[i] = scalar_view("grad", c, fv);
+    G[d] = scalar_view("grad", c, fp);
+    G[d+1] = scalar_view("grad", c, fh);
+    // Get components of the control
+    std::vector<scalar_view> U;
+    fieldHelper_->splitFieldCoeff(U, u_coeff);
+    // Evaluate on FE basis
+    scalar_view valUX_eval("valUX_eval", c, p);
+    scalar_view valUY_eval("valUY_eval", c, p);
+    feVel_->evaluateValue(valUX_eval, U[0]);
+    feVel_->evaluateValue(valUY_eval, U[1]);
+    // Compute negative part of x-velocity
+    scalar_view weighted_minUX_eval("minUX_eval", c, p);
+    scalar_view weighted_valUY_eval("valUY_eval", c, p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        weighted_minUX_eval(i,j) = weight_(i,j)*std::min(static_cast<Real>(0),valUX_eval(i,j));
+        weighted_valUY_eval(i,j) = weight_(i,j)*valUY_eval(i,j);
+      }
+    }
+    // Build local gradient of state tracking term
+    fst::integrate(G[0],weighted_minUX_eval,feVel_->NdetJ(),false);
+    fst::integrate(G[1],weighted_valUY_eval,feVel_->NdetJ(),false);
+
+    fieldHelper_->combineFieldCoeff(grad, G);
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Horizontal_ThermalFluids::gradient_2 is zero.");
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c  = z_coeff.extent_int(0);
+    int p  = feVel_->cubPts().extent_int(1);
+    int fv = feVel_->N().extent_int(1);
+    int fp = fePrs_->N().extent_int(1);
+    int fh = feThr_->N().extent_int(1);
+    int d = feVel_->cubPts().extent_int(2);
+    // Initialize output grad
+    std::vector<scalar_view> H(d+2);
+    for (int i = 0; i < d; ++i)
+      H[i] = scalar_view("hess", c, fv);
+    H[d] = scalar_view("hess", c, fp);
+    H[d+1] = scalar_view("hess", c, fh);
+    // Get components of the control
+    std::vector<scalar_view> U;
+    fieldHelper_->splitFieldCoeff(U, u_coeff);
+    std::vector<scalar_view> V;
+    fieldHelper_->splitFieldCoeff(V, v_coeff);
+    // Evaluate on FE basis
+    scalar_view valUX_eval("valUX_eval", c, p);
+    scalar_view valVX_eval("valVX_eval", c, p);
+    scalar_view valVY_eval("valVY_eval", c, p);
+    feVel_->evaluateValue(valUX_eval, U[0]);
+    feVel_->evaluateValue(valVX_eval, V[0]);
+    feVel_->evaluateValue(valVY_eval, V[1]);
+    // Compute negative part of x-velocity
+    scalar_view weighted_minVX_eval("weighted_minVX_eval", c, p);
+    scalar_view weighted_valVY_eval("valVY_eval", c, p);
+    const Real zero(0), one(1);
+    Real scale(0), uij(0);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        uij = valUX_eval(i,j);
+        scale = (uij < zero ? one : (uij > zero ? zero : one));
+        weighted_minVX_eval(i,j) = scale * weight_(i,j) * valVX_eval(i,j);
+        weighted_valVY_eval(i,j) = weight_(i,j) * valVY_eval(i,j);
+      }
+    }
+    // Build local gradient of state tracking term
+    fst::integrate(H[0],weighted_minVX_eval,feVel_->NdetJ(),false);
+    fst::integrate(H[1],weighted_valVY_eval,feVel_->NdetJ(),false);
+
+    fieldHelper_->combineFieldCoeff(hess, H);
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Horizontal_ThermalFluids::HessVec_12 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Horizontal_ThermalFluids::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Horizontal_ThermalFluids::HessVec_22 is zero.");
+  }
+
+}; // QoI_Horizontal_ThermalFluids
+
+template <class Real, class DeviceType>
+class QoI_State_ThermalFluids : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  ROL::Ptr<QoI<Real,DeviceType>> qoi_;
+
+public:
+  QoI_State_ThermalFluids(ROL::ParameterList &parlist,
+                         const ROL::Ptr<fe_type> &feVel,
+                         const ROL::Ptr<fe_type> &fePrs,
+                         const ROL::Ptr<fe_type> &feThr,
+                         const ROL::Ptr<FieldHelper<Real,DeviceType>> &fieldHelper) {
+    std::string stateObj = parlist.sublist("Problem").get("Objective type","Vorticity");
+    if ( stateObj != "Vorticity" && stateObj != "Circulation" && stateObj != "Directional" )
+      throw Exception::NotImplemented(">>> (QoI_State_ThermalFluids): Unknown objective type."); 
+    if ( stateObj == "Vorticity" )
+      qoi_ = ROL::makePtr<QoI_Vorticity_ThermalFluids<Real,DeviceType>>(feVel,fePrs,feThr,fieldHelper,parlist);
+    else if ( stateObj == "Directional" )
+      qoi_ = ROL::makePtr<QoI_Horizontal_ThermalFluids<Real,DeviceType>>(feVel,fePrs,feThr,fieldHelper);
+    else
+      qoi_ = ROL::makePtr<QoI_Circulation_ThermalFluids<Real,DeviceType>>(feVel,fePrs,feThr,fieldHelper);
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    return qoi_->value(val, u_coeff, z_coeff, z_param);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    qoi_->gradient_1(grad, u_coeff, z_coeff, z_param);
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    qoi_->gradient_2(grad, u_coeff, z_coeff, z_param);
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    qoi_->HessVec_11(hess, v_coeff, u_coeff, z_coeff, z_param);
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    qoi_->HessVec_12(hess, v_coeff, u_coeff, z_coeff, z_param);
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    qoi_->HessVec_21(hess, v_coeff, u_coeff, z_coeff, z_param);
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    qoi_->HessVec_22(hess, v_coeff, u_coeff, z_coeff, z_param);
+  }
+
+};
+
+template <class Real, class DeviceType>
+class QoI_L2Penalty_ThermalFluids : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type> feVel_, fePrs_, feThr_;
+  const std::vector<std::vector<ROL::Ptr<fe_type>>> feThrBdry_;
+  const std::vector<std::vector<std::vector<int>>> bdryCellLocIds_;
+  const ROL::Ptr<FieldHelper<Real,DeviceType>> fieldHelper_;
+
+  scalar_view getBoundaryCoeff(const scalar_view cell_coeff, int sideSet, int cell) const {
+    std::vector<int> bdryCellLocId = bdryCellLocIds_[sideSet][cell];
+    const int numCellsSide = bdryCellLocId.size();
+    const int f = feThr_->N().extent_int(1);
+    
+    scalar_view bdry_coeff("bdry_coeff", numCellsSide, f);
+    for (int i = 0; i < numCellsSide; ++i) {
+      for (int j = 0; j < f; ++j)
+        bdry_coeff(i, j) = cell_coeff(bdryCellLocId[i], j);
+    }
+    return bdry_coeff;
+  }
+
+public:
+  QoI_L2Penalty_ThermalFluids(const ROL::Ptr<fe_type> &feVel,
+                              const ROL::Ptr<fe_type> &fePrs,
+                              const ROL::Ptr<fe_type> &feThr,
+                              const std::vector<std::vector<ROL::Ptr<fe_type>>> &feThrBdry,
+                              const std::vector<std::vector<std::vector<int>>> &bdryCellLocIds,
+                              const ROL::Ptr<FieldHelper<Real,DeviceType>> &fieldHelper)
+    : feVel_(feVel), fePrs_(fePrs), feThr_(feThr), feThrBdry_(feThrBdry),
+      bdryCellLocIds_(bdryCellLocIds), fieldHelper_(fieldHelper) {}
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    const int c = feVel_->gradN().extent_int(0);
+    const int d = feVel_->gradN().extent_int(3);
+    // Get components of the control
+    std::vector<scalar_view> Z;
+    fieldHelper_->splitFieldCoeff(Z, z_coeff);
+    // Initialize output val
+    val = scalar_view("val", c);
+    // Compute cost integral
+    const int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        if ( i == 1 || i == 2 ) {
+          const int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            const int numCellsSide = bdryCellLocIds_[i][j].size();
+            if ( numCellsSide ) {
+              const int numCubPerSide = feThrBdry_[i][j]->cubPts().extent_int(1);
+              // Evaluate control on FE basis
+              scalar_view z_coeff_bdry = getBoundaryCoeff(Z[d+1], i, j);
+              scalar_view valZ_eval("valZ_eval", numCellsSide, numCubPerSide);
+              feThrBdry_[i][j]->evaluateValue(valZ_eval, z_coeff_bdry);
+              // Integrate cell L2 norm squared
+              scalar_view intVal("intVal", numCellsSide);
+              feThrBdry_[i][j]->computeIntegral(intVal,valZ_eval,valZ_eval,false);
+              // Add to integral value
+              for (int k = 0; k < numCellsSide; ++k) {
+                int cidx = bdryCellLocIds_[i][j][k];
+                val(cidx) += static_cast<Real>(0.5)*intVal(k);
+              }
+            }
+          }
+        }
+      }
+    }
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_L2Penalty_ThermalFluids::gradient_1 is zero.");
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fv = feVel_->gradN().extent_int(1);
+    const int fp = fePrs_->gradN().extent_int(1);
+    const int fh = feThr_->gradN().extent_int(1);
+    const int d  = feThr_->gradN().extent_int(3);
+    // Initialize output grad
+    std::vector<scalar_view> G(d+2);
+    for (int i = 0; i < d; ++i)
+      G[i] = scalar_view("grad", c, fv);
+    G[d] = scalar_view("grad", c, fp);
+    G[d+1] = scalar_view("grad", c, fh);
+    // Get components of the control
+    std::vector<scalar_view> Z;
+    fieldHelper_->splitFieldCoeff(Z, z_coeff);
+    // Compute cost integral
+    const int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        if ( i == 1 || i == 2 ) {
+          const int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            const int numCellsSide = bdryCellLocIds_[i][j].size();
+            if ( numCellsSide ) {
+              const int numCubPerSide = feThrBdry_[i][j]->cubPts().extent_int(1);
+              // Evaluate control on FE basis
+              scalar_view z_coeff_bdry = getBoundaryCoeff(Z[d+1], i, j);
+              scalar_view valZ_eval("valZ_eval", numCellsSide, numCubPerSide);
+              feThrBdry_[i][j]->evaluateValue(valZ_eval, z_coeff_bdry);
+              // Compute gradient of squared L2-norm of diff
+              scalar_view intGrad("intGrad", numCellsSide, fh);
+              fst::integrate(intGrad,valZ_eval,feThrBdry_[i][j]->NdetJ(),false);
+              // Add to integral value
+              for (int k = 0; k < numCellsSide; ++k) {
+                int cidx = bdryCellLocIds_[i][j][k];
+                for (int l = 0; l < fh; ++l)
+                  (G[d+1])(cidx,l) += intGrad(k,l);
+              }
+            }
+          }
+        }
+      }
+    }
+
+    fieldHelper_->combineFieldCoeff(grad, G);
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_L2Penalty_ThermalFluids::HessVec_11 is zero.");
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_L2Penalty_ThermalFluids::HessVec_12 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_L2Penalty_ThermalFluids::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fv = feVel_->gradN().extent_int(1);
+    const int fp = fePrs_->gradN().extent_int(1);
+    const int fh = feThr_->gradN().extent_int(1);
+    const int d  = feThr_->gradN().extent_int(3);
+    // Initialize output grad
+    std::vector<scalar_view> H(d+2);
+    for (int i = 0; i < d; ++i)
+      H[i] = scalar_view("hess", c, fv);
+    H[d] = scalar_view("hess", c, fp);
+    H[d+1] = scalar_view("hess", c, fh);
+    // Get components of the control
+    std::vector<scalar_view> V;
+    fieldHelper_->splitFieldCoeff(V, v_coeff);
+    // Compute cost integral
+    const int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        if ( i == 1 || i == 2 ) {
+          const int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            const int numCellsSide = bdryCellLocIds_[i][j].size();
+            if ( numCellsSide ) {
+              const int numCubPerSide = feThrBdry_[i][j]->cubPts().extent_int(1);
+              // Evaluate control on FE basis
+              scalar_view v_coeff_bdry = getBoundaryCoeff(V[d+1], i, j);
+              scalar_view valV_eval("valV_eval", numCellsSide, numCubPerSide);
+              feThrBdry_[i][j]->evaluateValue(valV_eval, v_coeff_bdry);
+              // Compute gradient of squared L2-norm of diff
+              scalar_view intHess("intHess", numCellsSide, fh);
+              fst::integrate(intHess,valV_eval,feThrBdry_[i][j]->NdetJ(),false);
+              // Add to integral value
+              for (int k = 0; k < numCellsSide; ++k) {
+                int cidx = bdryCellLocIds_[i][j][k];
+                for (int l = 0; l < fh; ++l)
+                  (H[d+1])(cidx,l) += intHess(k,l);
+              }
+            }
+          }
+        }
+      }
+    }
+
+    fieldHelper_->combineFieldCoeff(hess, H);
+  }
+
+}; // QoI_L2Penalty_ThermalFluids
+
+template <class Real>
+class StdObjective_ThermalFluids : public ROL::StdObjective<Real> {
+private:
+  Real alpha_;
+  std::string stateObj_;
+
+public:
+  StdObjective_ThermalFluids(ROL::ParameterList &parlist) {
+    alpha_    = parlist.sublist("Problem").get("Control penalty parameter",1.e-4);
+    stateObj_ = parlist.sublist("Problem").get("Objective type","Vorticity");
+    if ( stateObj_ != "Vorticity" && stateObj_ != "Circulation" && stateObj_ != "Directional") {
+      throw Exception::NotImplemented(">>> (StdObjective_ThermalFluids): Unknown objective type."); 
+    }
+  }
+
+  Real value(const std::vector<Real> &x, Real &tol) {
+    Real val = alpha_*x[1];
+    if ( stateObj_ == "Vorticity" || stateObj_ == "Directional" ) {
+      val += x[0];
+    }
+    else {
+      val += static_cast<Real>(0.5)*x[0]*x[0];
+    }
+    return val;
+  }
+
+  void gradient(std::vector<Real> &g, const std::vector<Real> &x, Real &tol) {
+    const Real one(1);
+    if ( stateObj_ == "Vorticity" || stateObj_ == "Directional" ) {
+      g[0] = one;
+    }
+    else {
+      g[0] = x[0];
+    }
+    g[1] = alpha_;
+  }
+
+  void hessVec(std::vector<Real> &hv, const std::vector<Real> &v, const std::vector<Real> &x, Real &tol) {
+    const Real zero(0);
+    if ( stateObj_ == "Vorticity" || stateObj_ == "Directional" ) {
+      hv[0] = zero;
+    }
+    else {
+      hv[0] = v[0];
+    }
+    hv[1] = zero;
+  }
+
+}; // OBJ_SCALAR
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/thermal-fluids/pde_thermal-fluidsK.hpp b/packages/rol/example/PDE-OPT/thermal-fluids/pde_thermal-fluidsK.hpp
new file mode 100644
index 000000000000..07e1e82d8eb7
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/thermal-fluids/pde_thermal-fluidsK.hpp
@@ -0,0 +1,1336 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  pde_thermal-fluids.hpp
+    \brief Implements the local PDE interface for the Navier-Stokes control problem.
+*/
+
+#ifndef PDE_THERMALFLUIDSK_HPP
+#define PDE_THERMALFLUIDSK_HPP
+
+#include "../TOOLS/pdeK.hpp"
+#include "../TOOLS/feK.hpp"
+#include "../TOOLS/fieldhelperK.hpp"
+
+#include "Intrepid2_HGRAD_QUAD_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_QUAD_C2_FEM.hpp"
+#include "Intrepid2_DefaultCubatureFactory.hpp"
+#include "Intrepid2_FunctionSpaceTools.hpp"
+#include "Intrepid2_CellTools.hpp"
+
+#include "ROL_Ptr.hpp"
+
+template <class Real, class DeviceType>
+class PDE_ThermalFluids : public PDE<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  // Finite element basis information
+  basis_ptr basisPtrVel_, basisPtrPrs_, basisPtrThr_;
+  std::vector<basis_ptr> basisPtrs_;
+  // Cell cubature information
+  ROL::Ptr<Intrepid2::Cubature<DeviceType,Real,Real>> cellCub_, bdryCub_;
+  // Cell node information
+  scalar_view volCellNodes_;
+  std::vector<std::vector<scalar_view>> bdryCellNodes_;
+  std::vector<std::vector<std::vector<int>>> bdryCellLocIds_;
+  // Finite element definition
+  ROL::Ptr<fe_type> feVel_, fePrs_, feThr_;
+  std::vector<std::vector<ROL::Ptr<fe_type>>> feVelBdry_, fePrsBdry_, feThrBdry_;
+  // Local degrees of freedom on boundary, for each side of the reference cell (first index).
+  std::vector<std::vector<int>> fvidx_, fpidx_, fhidx_;
+  // Coordinates of degrees freedom on boundary cells.
+  // Indexing:  [sideset number][local side id](cell number, value at dof)
+  std::vector<std::vector<scalar_view>> bdryCellVDofValues_, bdryCellTDofValues_;
+  // Field pattern, offsets, etc.
+  std::vector<std::vector<int>> fieldPattern_;  // local Field/DOF pattern; set from DOF manager 
+  int numFields_;                                // number of fields (equations in the PDE)
+  int numDofs_;                                  // total number of degrees of freedom for all (local) fields
+  std::vector<int> offset_;                      // for each field, a counting offset
+  std::vector<int> numFieldDofs_;                // for each field, number of degrees of freedom
+  
+  // Problem parameters.
+  Real Re_, Pr_, Gr_, h_, Tinflow_, Tstep_;
+  const Real grav_;
+
+  ROL::Ptr<FieldHelper<Real,DeviceType>> fieldHelper_;
+
+  Real velocityDirichletFunc(const std::vector<Real> & coords, int sideset, int locSideId, int dir) const {
+    const std::vector<Real> param = PDE<Real,DeviceType>::getParameter();
+    Real val(0);
+    if ((sideset==4) && (dir==0)) {
+      if ( param.size() >= 85 ) {
+        Real zero(0), one(1), two(2), pi(M_PI), four(4), c1(1), c2(-0.5);
+        if ( param[4] == zero ) {
+          val = std::sin(two*pi * (c1*coords[1] + c2));
+        }
+        else {
+          Real num = (std::exp(four*param[4]*(c1*coords[1] + c2)) - one);
+          Real den = (std::exp(two*param[4])-one);
+          val = std::sin(pi * num/den);
+        }
+      }
+      else {
+        val = static_cast<Real>(8)
+             *(coords[1]-static_cast<Real>(0.5))
+             *(static_cast<Real>(1)-coords[1]);
+      }
+    }
+    else if (((sideset==1) || (sideset==2)) && (dir==0)) {
+      if ( param.size() >= 86 ) {
+        Real zero(0), half(0.5), one(1), two(2), pi(M_PI), four(4), c1(0.5), c2(0);
+        if ( param[5] == zero ) {
+          val = half*half*std::sin(two*pi * (c1*coords[1] + c2));
+        }
+        else {
+          Real num = (std::exp(four*param[5]*(c1*coords[1] + c2)) - one);
+          Real den = (std::exp(two*param[5])-one);
+          val = half*half*std::sin(pi * num/den);
+        }
+      }
+      else {
+        val = (static_cast<Real>(1)-coords[1]) * coords[1];
+      }
+    }
+    return val;
+  }
+
+  Real thermalDirichletFunc(const std::vector<Real> & coords, int sideset, int locSideId) const {
+    const std::vector<Real> param = PDE<Real,DeviceType>::getParameter();
+    Real val(0);
+    if (sideset==4) {
+      if ( param.size() >= 87 ) {
+        Real l(-0.01), u(0.01);
+        val = (u-l)*param[6] + l;
+      }
+      else {
+        val = Tinflow_;
+      }
+    }
+    else if ((sideset==5) || (sideset==6) || (sideset==7) || (sideset==8)) {
+      if ( param.size() >= 88 ) {
+        Real l(0.99), u(1.01);
+        val = (u-l)*param[7] + l;
+      }
+      else {
+        val = Tstep_;
+      }
+    }
+    return val;
+  }
+
+  void computeDirichlet(void) {
+    // Compute Dirichlet values at DOFs.
+    int d  = basisPtrVel_->getBaseCellTopology().getDimension();
+    int fv = basisPtrVel_->getCardinality();
+    int ft = basisPtrThr_->getCardinality();
+    int numSidesets = bdryCellLocIds_.size();
+    bdryCellVDofValues_.resize(numSidesets);
+    bdryCellTDofValues_.resize(numSidesets);
+    for (int i=0; i<numSidesets; ++i) {
+      int numLocSides = bdryCellLocIds_[i].size();
+      bdryCellVDofValues_[i].resize(numLocSides);
+      bdryCellTDofValues_[i].resize(numLocSides);
+      for (int j=0; j<numLocSides; ++j) {
+        int c = bdryCellLocIds_[i][j].size();
+        bdryCellVDofValues_[i][j] = scalar_view("bdryCellVDofValues", c, fv, d);
+        bdryCellTDofValues_[i][j] = scalar_view("bdryCellTDofValues", c, ft);
+        scalar_view Vcoords("Vcoords", c, fv, d);
+        scalar_view Tcoords("Tcoords", c, ft, d);
+        if (c > 0) {
+          feVel_->computeDofCoords(Vcoords, bdryCellNodes_[i][j]);
+          feThr_->computeDofCoords(Tcoords, bdryCellNodes_[i][j]);
+        }
+        for (int k=0; k<c; ++k) {
+          for (int l=0; l<fv; ++l) {
+            std::vector<Real> dofpoint(d);
+            //std::cout << "Sideset " << i << " LocalSide " << j << "  Cell " << k << "  Field " << l << "  Coord ";
+            for (int m=0; m<d; ++m) {
+              dofpoint[m] = Vcoords(k, l, m);
+              //std::cout << dofpoint[m] << "  ";
+            }
+
+            for (int m=0; m<d; ++m) {
+              (bdryCellVDofValues_[i][j])(k, l, m) = velocityDirichletFunc(dofpoint, i, j, m);
+              //std::cout << "  " << m << "-Value " << DirichletFunc(dofpoint, i, j, m);
+            }
+            //std::cout << std::endl;
+          }
+          for (int l=0; l<ft; ++l) {
+            std::vector<Real> dofpoint(d);
+            //std::cout << "Sideset " << i << " LocalSide " << j << "  Cell " << k << "  Field " << l << "  Coord ";
+            for (int m=0; m<d; ++m) {
+              dofpoint[m] = Tcoords(k, l, m);
+              //std::cout << dofpoint[m] << "  ";
+            }
+
+            (bdryCellTDofValues_[i][j])(k, l) = thermalDirichletFunc(dofpoint, i, j);
+            //std::cout << "    Value " << DirichletFunc(dofpoint, i, j);
+            //std::cout << std::endl;
+          }
+        }
+      }
+    }
+  }
+
+  Real ReynoldsNumber(void) const {
+    const std::vector<Real> param = PDE<Real,DeviceType>::getParameter();
+    if ( param.size() >= 1 ) {
+      Real l(190), u(210);
+      return (u-l)*param[0] + l;
+    }
+    return Re_;
+  }
+
+  Real PrandtlNumber(void) const {
+    const std::vector<Real> param = PDE<Real,DeviceType>::getParameter();
+    if ( param.size() >= 2 ) {
+      Real l(0.62), u(0.82);
+      return (u-l)*param[1] + l;
+    }
+    return Pr_;
+  }
+
+  Real GrashofNumber(void) const {
+    const std::vector<Real> param = PDE<Real,DeviceType>::getParameter();
+    if ( param.size() >= 3 ) {
+      Real l(38000), u(50000);
+      return (u-l)*param[2] + l;
+    }
+    return Gr_;
+  }
+  
+  Real ThicknessNumber(void) const {
+    const std::vector<Real> param = PDE<Real,DeviceType>::getParameter();
+    if ( param.size() >= 4 ) {
+      Real l(1), u(15);
+      return (u-l)*param[3] + l;
+    }
+    return h_;
+  }
+
+  Real viscosityFunc(const std::vector<Real> &x) const {
+    Real ReynoldsNum = ReynoldsNumber();
+    return static_cast<Real>(1)/ReynoldsNum;
+  }
+
+  Real conductivityFunc(const std::vector<Real> &x) const {
+    Real ReynoldsNum = ReynoldsNumber();
+    Real PrandtlNum = PrandtlNumber();
+    return static_cast<Real>(1)/(ReynoldsNum*PrandtlNum);
+  }
+
+  Real gravityFunc(const std::vector<Real> &x) const {
+    Real ReynoldsNum = ReynoldsNumber();
+    Real GrashofNum = GrashofNumber();
+    return grav_*GrashofNum/(ReynoldsNum*ReynoldsNum);
+  }
+
+  Real thermalRobinFunc(const std::vector<Real> &x) const {
+    Real H = ThicknessNumber();
+    return H * conductivityFunc(x);
+  }
+
+  void computeCoefficients(scalar_view &nu, scalar_view &grav, scalar_view &kappa) const {
+    int c = feVel_->gradN().extent_int(0);
+    int p = feVel_->gradN().extent_int(2);
+    int d = feVel_->gradN().extent_int(3);
+    std::vector<Real> pt(d);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for ( int k = 0; k < d; ++k)
+          pt[k] = (feVel_->cubPts())(i,j,k);
+        // Compute spatially dependent coefficients
+        nu(i,j)    = viscosityFunc(pt);
+        grav(i,j)  = gravityFunc(pt);
+        kappa(i,j) = conductivityFunc(pt);
+      }
+    }
+  }
+
+  void computeThermalRobin(scalar_view &robin,
+                           const scalar_view u,
+                           const scalar_view z,
+                           int sideset,
+                           int locSideId,
+                           int deriv = 0,
+                           int component = 1) const {
+    const int c = u.extent_int(0);
+    const int p = u.extent_int(1);
+    const int d = feThr_->gradN().extent_int(3);
+    std::vector<Real> x(d);
+    Real h(0), kappa(0);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k)
+          x[k] = (feThrBdry_[sideset][locSideId]->cubPts())(i,j,k);
+        h = thermalRobinFunc(x);
+        kappa = conductivityFunc(x);
+        if ( deriv == 0 )
+          robin(i,j) = kappa * h * (u(i,j) - z(i,j));
+        else if ( deriv == 1 )
+          robin(i,j) = ((component==0) ? kappa * h : -kappa * h);
+        else
+          robin(i,j) = static_cast<Real>(0);
+      }
+    }
+  }
+
+  scalar_view getThermalBoundaryCoeff(const scalar_view cell_coeff,
+                                      int sideSet, int cell) const {
+    std::vector<int> bdryCellLocId = bdryCellLocIds_[sideSet][cell];
+    const int numCellsSide = bdryCellLocId.size();
+    const int f = basisPtrThr_->getCardinality();
+    
+    scalar_view bdry_coeff("bdry_coeff", numCellsSide, f);
+    for (int i = 0; i < numCellsSide; ++i) {
+      for (int j = 0; j < f; ++j)
+        bdry_coeff(i, j) = cell_coeff(bdryCellLocId[i], j);
+    }
+    return bdry_coeff;
+  }
+
+public:
+  PDE_ThermalFluids(ROL::ParameterList &parlist) : grav_(-1) {
+    // Finite element fields -- NOT DIMENSION INDEPENDENT!
+    basisPtrVel_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C2_FEM<DeviceType,Real,Real>>();
+    basisPtrPrs_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C1_FEM<DeviceType,Real,Real>>();
+    basisPtrThr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C2_FEM<DeviceType,Real,Real>>();
+    // Volume quadrature rules.
+    shards::CellTopology cellType = basisPtrVel_->getBaseCellTopology();     // get the cell type from any basis
+    Intrepid2::DefaultCubatureFactory cubFactory;                            // create cubature factory
+    int cubDegree = parlist.sublist("Problem").get("Cubature Degree", 4);    // set cubature degree, e.g., 4
+    cellCub_ = cubFactory.create<DeviceType,Real,Real>(cellType, cubDegree); // create default cubature
+    // Boundary quadrature rules.
+    int d = cellType.getDimension();
+    shards::CellTopology bdryCellType = cellType.getCellTopologyData(d-1, 0);
+    int bdryCubDegree = parlist.sublist("Problem").get("Boundary Cubature Degree",4); // set cubature degree, e.g., 4
+    bdryCub_ = cubFactory.create<DeviceType,Real,Real>(bdryCellType, bdryCubDegree);
+    // Fill finite element basis container
+    basisPtrs_.clear();
+    basisPtrs_.resize(d,basisPtrVel_);  // Velocity
+    basisPtrs_.push_back(basisPtrPrs_); // Pressure
+    basisPtrs_.push_back(basisPtrThr_); // Heat
+
+    // Other problem parameters.
+    Re_      = parlist.sublist("Problem").get("Reynolds Number",    200.0);
+    Pr_      = parlist.sublist("Problem").get("Prandtl Number",     0.72);
+    Gr_      = parlist.sublist("Problem").get("Grashof Number",     40000.0);
+    h_       = parlist.sublist("Problem").get("Robin Coefficient",  1.0);
+    Tinflow_ = parlist.sublist("Problem").get("Inflow Temperature", 0.0);
+    Tstep_   = parlist.sublist("Problem").get("Step Temperature",   1.0);
+
+    numDofs_ = 0;
+    numFields_ = basisPtrs_.size();
+    offset_.resize(numFields_);
+    numFieldDofs_.resize(numFields_);
+    for (int i=0; i<numFields_; ++i) {
+      if (i==0)
+        offset_[i] = 0;
+      else
+        offset_[i] = offset_[i-1] + basisPtrs_[i-1]->getCardinality();
+      numFieldDofs_[i] = basisPtrs_[i]->getCardinality();
+      numDofs_ += numFieldDofs_[i];
+    }
+  }
+
+  void residual(scalar_view & res,
+                const scalar_view u_coeff,
+                const scalar_view z_coeff = scalar_view(),
+                const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    int c  = u_coeff.extent_int(0);
+    int p  = cellCub_->getNumPoints();
+    int fv = basisPtrVel_->getCardinality();
+    int fp = basisPtrPrs_->getCardinality();
+    int fh = basisPtrThr_->getCardinality();
+    int d  = cellCub_->getDimension();
+ 
+    // Initialize residuals.
+    std::vector<scalar_view> R(d+2);
+    for (int i = 0; i < d; ++i)
+      R[i] = scalar_view("res", c, fv);
+    R[d]   = scalar_view("res", c, fp);
+    R[d+1] = scalar_view("res", c, fh);
+
+    // Split u_coeff into components.
+    std::vector<scalar_view> U, Z;
+    fieldHelper_->splitFieldCoeff(U, u_coeff);
+    fieldHelper_->splitFieldCoeff(Z, z_coeff);
+
+    // Evaluate problem coefficients at quadrature points.
+    scalar_view nu("nu", c, p);
+    scalar_view grav("grav", c, p);
+    scalar_view kappa("kappa", c, p);
+    computeCoefficients(nu,grav,kappa);
+
+    // Evaluate/interpolate finite element fields on cells.
+    std::vector<scalar_view> valVel_vec(d);
+    for (int i = 0; i < d; ++i) {
+      valVel_vec[i] = scalar_view("valVel_vec", c, p);
+      feVel_->evaluateValue(valVel_vec[i], U[i]);
+    }
+    scalar_view valPres_eval("valPres_eval", c, p);
+    scalar_view valHeat_eval("valHeat_eval", c, p);
+    fePrs_->evaluateValue(valPres_eval, U[d]);
+    feThr_->evaluateValue(valHeat_eval, U[d+1]);
+    // Evaluate/interpolate gradient of finite element fields on cells.
+    std::vector<scalar_view> gradVel_vec(d);
+    for (int i = 0; i < d; ++i) {
+      gradVel_vec[i] = scalar_view("gradVel_vec", c, p, d);
+      feVel_->evaluateGradient(gradVel_vec[i], U[i]);
+    }
+    scalar_view gradHeat_eval("gradHeat_eval", c, p, d);
+    feThr_->evaluateGradient(gradHeat_eval, U[d+1]);
+
+    // Assemble the velocity vector and its divergence.
+    scalar_view valVel_eval("valVel_eval", c, p, d);
+    scalar_view divVel_eval("divVel_eval", c, p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        divVel_eval(i,j) = static_cast<Real>(0);
+        for (int k = 0; k < d; ++k) {
+          valVel_eval(i,j,k) = (valVel_vec[k])(i,j);
+          divVel_eval(i,j)  += (gradVel_vec[k])(i,j,k);
+        }
+      }
+    }
+
+    /**************************************************************************/
+    /*** NAVIER STOKES ********************************************************/
+    /**************************************************************************/
+    std::vector<scalar_view> nuGradVel_vec(d), valVelDotgradVel_vec(d);
+    for (int i = 0; i < d; ++i) {
+      // Multiply velocity gradients with viscosity.
+      nuGradVel_vec[i] = scalar_view("nuGradVel_vec", c, p, d);
+      fst::tensorMultiplyDataData(nuGradVel_vec[i], nu, gradVel_vec[i]);
+      // Compute nonlinear terms in the Navier-Stokes equations.
+      valVelDotgradVel_vec[i] = scalar_view("valVelDotgradVel_vec", c, p);
+      fst::dotMultiplyDataData(valVelDotgradVel_vec[i], valVel_eval, gradVel_vec[i]);
+    }
+    // Negative pressure
+    rst::scale(valPres_eval,static_cast<Real>(-1));
+    // Compute gravitational force.
+    scalar_view gravForce("gravForce", c, p);
+    fst::scalarMultiplyDataData(gravForce, grav, valHeat_eval);
+
+    /**************************************************************************/
+    /*** THERMAL **************************************************************/
+    /**************************************************************************/
+    scalar_view kappaGradHeat_eval("kappaGradHeat_eval", c, p, d);
+    scalar_view velDotGradHeat_eval("velDotGradHeat_eval", c, p);
+    // Multiply temperature gradient with conductivity
+    fst::tensorMultiplyDataData(kappaGradHeat_eval, kappa, gradHeat_eval);
+    // Dot multiply scaled velocity with temperature gradient
+    fst::dotMultiplyDataData(velDotGradHeat_eval, valVel_eval, gradHeat_eval);
+
+    /**************************************************************************/
+    /*** EVALUATE WEAK FORM OF RESIDUAL ***************************************/
+    /**************************************************************************/
+    // Velocity equation.
+    for (int i = 0; i < d; ++i) {
+      fst::integrate(R[i],nuGradVel_vec[i],feVel_->gradNdetJ(),false);
+      fst::integrate(R[i],valVelDotgradVel_vec[i],feVel_->NdetJ(),true);
+      fst::integrate(R[i],valPres_eval,feVel_->DNDdetJ(i),true);
+    }
+    // Apply gravitational force.
+    fst::integrate(R[d-1],gravForce,feVel_->NdetJ(),true);
+    // Pressure equation.
+    fst::integrate(R[d],divVel_eval,fePrs_->NdetJ(),false);
+    rst::scale(R[d],static_cast<Real>(-1));
+    // Heat equation.
+    fst::integrate(R[d+1],kappaGradHeat_eval,feThr_->gradNdetJ(),false);
+    fst::integrate(R[d+1],velDotGradHeat_eval,feThr_->NdetJ(),true);
+
+    /**************************************************************************/
+    /*** APPLY BOUNDARY CONDITIONS ********************************************/
+    /**************************************************************************/
+    // --> Control boundaries: i=0,3
+    // --> Outflow boundaries: i=1,2
+    // -->  Inflow boundaries: i=4
+    // -->    Step boundaries: i=5,6,7,8
+    // -->   Pressure Pinning: i=9
+    int numSideSets = bdryCellLocIds_.size();
+    const int numCubPerSide = bdryCub_->getNumPoints();
+    if (numSideSets > 0) {
+      // ROBIN CONDITIONS
+      for (int i = 0; i < numSideSets; ++i) {
+        // Control boundaries
+        if (i==0 || i==3) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            if (numCellsSide) {
+              // Get U and Z coefficients on Robin boundary
+              scalar_view u_coeff_bdry = getThermalBoundaryCoeff(U[d+1], i, j);
+              scalar_view z_coeff_bdry = getThermalBoundaryCoeff(Z[d+1], i, j);
+              // Evaluate U and Z on FE basis
+              scalar_view valU_eval_bdry("valU_eval_bdry", numCellsSide, numCubPerSide);
+              scalar_view valZ_eval_bdry("valZ_eval_bdry", numCellsSide, numCubPerSide);
+              feThrBdry_[i][j]->evaluateValue(valU_eval_bdry, u_coeff_bdry);
+              feThrBdry_[i][j]->evaluateValue(valZ_eval_bdry, z_coeff_bdry);
+              // Compute Stefan-Boltzmann residual
+              scalar_view robinVal("robinVal", numCellsSide, numCubPerSide);
+              computeThermalRobin(robinVal,valU_eval_bdry,valZ_eval_bdry,i,j,0);
+              // Evaluate boundary integral
+              scalar_view robinRes("robinRes", numCellsSide, fh);
+              fst::integrate(robinRes,robinVal,feThrBdry_[i][j]->NdetJ(),false);
+              for (int k = 0; k < numCellsSide; ++k) {
+                int cidx = bdryCellLocIds_[i][j][k];
+                // Thermal boundary conditions.
+                for (int l = 0; l < fh; ++l)
+                  (R[d+1])(cidx,l) += robinRes(k,l);
+              }
+            }
+          }
+        }
+      }
+      // DIRICHLET CONDITIONS
+      computeDirichlet();
+      // Velocity Boundary Conditions
+      for (int i = 0; i < numSideSets; ++i) {
+        if ( (i==0 || i==3) || (i==1 || i==2) || (i==4) || (i==5 || i==6 || i==8) ) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numVBdryDofs = fvidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numVBdryDofs; ++l) {
+                for (int m = 0; m < d; ++m) {
+                  (R[m])(cidx,fvidx_[j][l]) = (U[m])(cidx,fvidx_[j][l]) - (bdryCellVDofValues_[i][j])(k,fvidx_[j][l],m);
+                }
+              }
+            }
+          }
+        }
+        if (i==7) {
+          int j = 0, l = 0;
+          if (bdryCellLocIds_[i][0].size() > 0) { 
+            int cidx = bdryCellLocIds_[i][0][0];
+            for (int m=0; m < d; ++m) {
+              (R[m])(cidx,fvidx_[j][l]) = (U[m])(cidx,fvidx_[j][l]) - (bdryCellVDofValues_[i][j])(0,fvidx_[j][l],m);
+            }
+          }
+        }
+        // Thermal Boundary Conditions
+        if ( (i==4) || (i==5 || i==6 || i==8) ) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numHBdryDofs = fhidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numHBdryDofs; ++l) {
+                (R[d+1])(cidx,fhidx_[j][l]) = (U[d+1])(cidx,fhidx_[j][l]) - (bdryCellTDofValues_[i][j])(k,fhidx_[j][l]);
+              }
+            }
+          }
+        }
+        if (i==7) {
+          int j = 0, l = 0;
+          if (bdryCellLocIds_[i][0].size() > 0) { 
+            int cidx = bdryCellLocIds_[i][0][0];
+            (R[d+1])(cidx,fhidx_[j][l]) = (U[d+1])(cidx,fhidx_[j][l]) - (bdryCellTDofValues_[i][j])(0,fhidx_[j][l]);
+          }
+        }
+        // Pressure pinning
+        if (i==9) {
+          int j = 2, l = 0;
+          if (bdryCellLocIds_[i][0].size() > 0) { 
+            int cidx = bdryCellLocIds_[i][0][0];
+            (R[d])(cidx,fpidx_[j][l]) = (U[d])(cidx,fpidx_[j][l]);
+          }
+        }
+      }
+    }
+
+    // Combine the residuals.
+    fieldHelper_->combineFieldCoeff(res, R);
+  }
+
+  void Jacobian_1(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    int c  = u_coeff.extent_int(0);
+    int p  = cellCub_->getNumPoints();
+    int fv = basisPtrVel_->getCardinality();
+    int fp = basisPtrPrs_->getCardinality();
+    int fh = basisPtrThr_->getCardinality();
+    int d  = cellCub_->getDimension();
+ 
+    // Initialize jacobians.
+    std::vector<std::vector<scalar_view>> J(d+2);
+    for (int i = 0; i < d+2; ++i)
+      J[i].resize(d+2);
+    for (int i = 0; i < d; ++i) {
+      for (int j = 0; j < d; ++j)
+        J[i][j] = scalar_view("jac", c, fv, fv);
+      J[d][i]   = scalar_view("jac", c, fp, fv);
+      J[i][d]   = scalar_view("jac", c, fv, fp);
+      J[d+1][i] = scalar_view("jac", c, fh, fv);
+      J[i][d+1] = scalar_view("jac", c, fv, fh);
+    }
+    J[d][d]     = scalar_view("jac", c, fp, fp);
+    J[d+1][d]   = scalar_view("jac", c, fh, fp);
+    J[d][d+1]   = scalar_view("jac", c, fp, fh);
+    J[d+1][d+1] = scalar_view("jac", c, fh, fh);
+
+    // Split u_coeff into components.
+    std::vector<scalar_view> U, Z;
+    fieldHelper_->splitFieldCoeff(U, u_coeff);
+    fieldHelper_->splitFieldCoeff(Z, z_coeff);
+
+    // Evaluate problem coefficients at quadrature points.
+    scalar_view nu("nu", c, p);
+    scalar_view grav("grav", c, p);
+    scalar_view kappa("kappa", c, p);
+    computeCoefficients(nu,grav,kappa);
+
+    // Evaluate/interpolate finite element fields on cells.
+    std::vector<scalar_view> valVel_vec(d);
+    for (int i = 0; i < d; ++i) {
+      valVel_vec[i] = scalar_view("valVel_vec", c, p);
+      feVel_->evaluateValue(valVel_vec[i], U[i]);
+    }
+    // Evaluate/interpolate gradient of finite element fields on cells.
+    std::vector<scalar_view> gradVel_vec(d);
+    for (int i = 0; i < d; ++i) {
+      gradVel_vec[i] = scalar_view("gradVel_vec", c, p, d);
+      feVel_->evaluateGradient(gradVel_vec[i], U[i]);
+    }
+    scalar_view gradHeat_eval("gradHeat_eval", c, p, d);
+    feThr_->evaluateGradient(gradHeat_eval, U[d+1]);
+
+    // Assemble the velocity vector and its divergence.
+    scalar_view valVel_eval("valVel_eval", c, p, d);
+    std::vector<std::vector<scalar_view>> dVel_vec(d);
+    std::vector<scalar_view> gradHeat_vec(d);
+    for (int k = 0; k < d; ++k) {
+      dVel_vec[k].resize(d);
+      for (int l = 0; l < d; ++l) {
+        dVel_vec[k][l] = scalar_view("dVel_vec", c, p);
+        for (int i = 0; i < c; ++i) {
+          for (int j = 0; j < p; ++j)
+            (dVel_vec[k][l])(i,j) = (gradVel_vec[k])(i,j,l);
+        }
+      }
+      gradHeat_vec[k] = scalar_view("gradHeat_vec", c, p);
+      for (int i = 0; i < c; ++i) {
+        for (int j = 0; j < p; ++j) {
+          valVel_eval(i,j,k)  = (valVel_vec[k])(i,j);
+          (gradHeat_vec[k])(i,j) = gradHeat_eval(i,j,k);
+        }
+      }
+    }
+
+    /**************************************************************************/
+    /*** NAVIER STOKES ********************************************************/
+    /**************************************************************************/
+    std::vector<std::vector<scalar_view>> dVelPhi_vec(d);
+    for (int i = 0; i < d; ++i) {
+      // Compute nonlinear terms in the Navier-Stokes equations.
+      dVelPhi_vec[i].resize(d);
+      for (int j = 0; j < d; ++j) {
+        dVelPhi_vec[i][j] = scalar_view("dVelPhi_vec", c, fv, p);
+        fst::scalarMultiplyDataField(dVelPhi_vec[i][j], dVel_vec[i][j], feVel_->N());
+      }
+    }
+    // Multiply velocity gradients with viscosity.
+    scalar_view nuGradPhi_eval("nuGradPhi_eval", c, fv, p, d);
+    fst::tensorMultiplyDataField(nuGradPhi_eval, nu, feVel_->gradN());
+    // Compute nonlinear terms in the Navier-Stokes equations.
+    scalar_view valVelDotgradPhi_eval("valVelDotgradPhi_eval", c, fv, p);
+    fst::dotMultiplyDataField(valVelDotgradPhi_eval, valVel_eval, feVel_->gradN());
+    // Negative pressure basis.
+    scalar_view negPrsPhi("negPrsPhi", c, fp, p);
+    Kokkos::deep_copy(negPrsPhi,fePrs_->N());
+    rst::scale(negPrsPhi,static_cast<Real>(-1));
+    // Compute gravity Jacobian
+    scalar_view gravPhi("gravPhi", c, fh, p);
+    fst::scalarMultiplyDataField(gravPhi, grav, feThr_->N());
+
+    /**************************************************************************/
+    /*** THERMAL **************************************************************/
+    /**************************************************************************/
+    scalar_view kappaGradPhi("kappaGradPhi", c, fh, p, d);
+    scalar_view VelPhi("velPhi", c, fh, p);
+    std::vector<scalar_view> dHeatPhi(d);
+    // Compute kappa times gradient of basis.
+    fst::tensorMultiplyDataField(kappaGradPhi, kappa, feThr_->gradN());
+    // Compute scaled velocity.
+    fst::dotMultiplyDataField(VelPhi, valVel_eval, feThr_->gradN());
+    // Thermal-velocity Jacobians.
+    for (int i = 0; i < d; ++i) {
+      dHeatPhi[i] = scalar_view("dHeatPhi", c, fh, p);
+      fst::scalarMultiplyDataField(dHeatPhi[i], gradHeat_vec[i], feThr_->N());
+    }
+
+    /*** Evaluate weak form of the Jacobian. ***/
+    // X component of velocity equation.
+    for (int i = 0; i < d; ++i) {
+      // Velocity components
+      for (int j = 0; j < d; ++j)
+        fst::integrate(J[i][j],feVel_->NdetJ(),dVelPhi_vec[i][j],false);
+      fst::integrate(J[i][i],nuGradPhi_eval,feVel_->gradNdetJ(),true);
+      fst::integrate(J[i][i],feVel_->NdetJ(),valVelDotgradPhi_eval,true);
+      // Pressure components
+      fst::integrate(J[i][d],feVel_->DNDdetJ(i),negPrsPhi,false);
+      fst::integrate(J[d][i],fePrs_->NdetJ(),feVel_->DND(i),false);
+      rst::scale(J[d][i],static_cast<Real>(-1));
+      // Thermal components
+      fst::integrate(J[d+1][i],dHeatPhi[i],feVel_->NdetJ(),false);
+    }
+    // Gravitational component
+    fst::integrate(J[d-1][d+1],feVel_->NdetJ(),gravPhi,false);
+    // Thermal components
+    fst::integrate(J[d+1][d+1],kappaGradPhi,feThr_->gradNdetJ(),false);
+    fst::integrate(J[d+1][d+1],feThr_->NdetJ(),VelPhi,true);
+
+    // APPLY BOUNDARY CONDITIONS
+    int numSideSets = bdryCellLocIds_.size();
+    const int numCubPerSide = bdryCub_->getNumPoints();
+    if (numSideSets > 0) {
+      // ROBIN CONDITIONS
+      for (int i = 0; i < numSideSets; ++i) {
+        // Control boundaries
+        if (i==0 || i==3) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            if (numCellsSide) {
+              // Get U and Z coefficients on Robin boundary
+              scalar_view u_coeff_bdry = getThermalBoundaryCoeff(U[d+1], i, j);
+              scalar_view z_coeff_bdry = getThermalBoundaryCoeff(Z[d+1], i, j);
+              // Evaluate U and Z on FE basis
+              scalar_view valU_eval_bdry("valU_eval_bdry", numCellsSide, numCubPerSide);
+              scalar_view valZ_eval_bdry("valZ_eval_bdry", numCellsSide, numCubPerSide);
+              feThrBdry_[i][j]->evaluateValue(valU_eval_bdry, u_coeff_bdry);
+              feThrBdry_[i][j]->evaluateValue(valZ_eval_bdry, z_coeff_bdry);
+              // Compute Stefan-Boltzmann residual
+              scalar_view robinVal1("robinVal1", numCellsSide, numCubPerSide);
+              scalar_view robinVal("robinVal", numCellsSide, fh, numCubPerSide);
+              computeThermalRobin(robinVal1,valU_eval_bdry,valZ_eval_bdry,i,j,1,0);
+              fst::scalarMultiplyDataField(robinVal,robinVal1,feThrBdry_[i][j]->N());
+              // Evaluate boundary integral
+              scalar_view robinJac("robinJac", numCellsSide, fh, fh);
+              fst::integrate(robinJac,robinVal,feThrBdry_[i][j]->NdetJ(),false);
+              for (int k = 0; k < numCellsSide; ++k) {
+                int cidx = bdryCellLocIds_[i][j][k];
+                for (int l = 0; l < fh; ++l) {
+                  for (int m = 0; m < fh; ++m) {
+                    (J[d+1][d+1])(cidx,l,m) += robinJac(k,l,m);
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+      // DIRICHLET CONDITIONS
+      for (int i = 0; i < numSideSets; ++i) {
+        // Velocity Boundary Conditions
+        if ( (i==0 || i==3) || (i==1 || i==2) || (i==4) || (i==5 || i==6 || i==8) ) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numVBdryDofs = fvidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numVBdryDofs; ++l) {
+                for (int m = 0; m < fv; ++m) {
+                  for (int n = 0; n < d; ++n) {
+                    for (int q = 0; q < d; ++q)
+                      (J[n][q])(cidx,fvidx_[j][l],m) = static_cast<Real>(0);
+                    (J[n][n])(cidx,fvidx_[j][l],fvidx_[j][l]) = static_cast<Real>(1);
+                  }
+                }
+                for (int m = 0; m < d; ++m) {
+                  for (int n = 0; n < fp; ++n)
+                    (J[m][d])(cidx,fvidx_[j][l],n) = static_cast<Real>(0);
+                  for (int n = 0; n < fh; ++n)
+                    (J[m][d+1])(cidx,fvidx_[j][l],n) = static_cast<Real>(0);
+                }
+              }
+            }
+          }
+        }
+        if (i==7) {
+          int j = 0, l = 0;
+          if (bdryCellLocIds_[i][0].size() > 0) { 
+            int cidx = bdryCellLocIds_[i][0][0];
+            for (int m=0; m < fv; ++m) {
+              for (int n=0; n < d; ++n) {
+                for (int q=0; q < d; ++q)
+                  (J[n][q])(cidx,fvidx_[j][l],m) = static_cast<Real>(0);
+                (J[n][n])(cidx,fvidx_[j][l],fvidx_[j][l]) = static_cast<Real>(1);
+              }
+            }
+            for (int m=0; m < d; ++m) {
+              for (int n=0; n < fp; ++n)
+                (J[m][d])(cidx,fvidx_[j][l],n) = static_cast<Real>(0);
+              for (int n = 0; n < fh; ++n)
+                (J[m][d+1])(cidx,fvidx_[j][l],n) = static_cast<Real>(0);
+            }
+          }
+        }
+        // Thermal boundary conditions
+        if ( (i==4) || (i==5 || i==6 || i==8) ) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numHBdryDofs = (i==7) ? 1 : fhidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numHBdryDofs; ++l) {
+                for (int m = 0; m < fv; ++m) {
+                  for (int n = 0; n < d; ++n)
+                    (J[d+1][n])(cidx,fhidx_[j][l],m) = static_cast<Real>(0);
+                }
+                for (int m = 0; m < fp; ++m)
+                  (J[d+1][d])(cidx,fhidx_[j][l],m) = static_cast<Real>(0);
+                for (int m = 0; m < fh; ++m)
+                  (J[d+1][d+1])(cidx,fhidx_[j][l],m) = static_cast<Real>(0);
+                (J[d+1][d+1])(cidx,fhidx_[j][l],fhidx_[j][l]) = static_cast<Real>(1);
+              }
+            }
+          }
+        }
+        if (i==7) {
+          int j = 0, l = 0;
+          if (bdryCellLocIds_[i][0].size() > 0) { 
+            int cidx = bdryCellLocIds_[i][0][0];
+            for (int m=0; m < fv; ++m) {
+              for (int n=0; n < d; ++n) {
+                for (int q=0; q < d; ++q)
+                  (J[d+1][q])(cidx,fvidx_[j][l],m) = static_cast<Real>(0);
+              }
+            }
+            for (int m=0; m < fp; ++m)
+              (J[d+1][d])(cidx,fvidx_[j][l],m) = static_cast<Real>(0);
+            for (int m = 0; m < fh; ++m)
+              (J[d+1][d+1])(cidx,fvidx_[j][l],m) = static_cast<Real>(0);
+            (J[d+1][d+1])(cidx,fhidx_[j][l],fhidx_[j][l]) = static_cast<Real>(1);
+          }
+        }
+        // Pressure pinning
+        if (i==9) {
+          int j = 2, l = 0;
+          if (bdryCellLocIds_[i][0].size() > 0) {
+            int cidx = bdryCellLocIds_[i][0][0];
+            for (int m = 0; m < fv; ++m) {
+              for (int n = 0; n < d; ++n)
+                (J[d][n])(cidx,fpidx_[j][l],m) = static_cast<Real>(0);
+            }
+            for (int m = 0; m < fp; ++m)
+              (J[d][d])(cidx,fpidx_[j][l],m) = static_cast<Real>(0);
+            (J[d][d])(cidx,fpidx_[j][l],fpidx_[j][l]) = static_cast<Real>(1);
+            for (int m = 0; m < fh; ++m)
+              (J[d][d+1])(cidx,fpidx_[j][l],m) = static_cast<Real>(0);
+          }
+        }
+      }
+    }
+
+    // Combine the jacobians.
+    fieldHelper_->combineFieldCoeff(jac, J);
+  }
+
+
+  void Jacobian_2(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    int c  = u_coeff.extent_int(0);
+    int fv = basisPtrVel_->getCardinality();
+    int fp = basisPtrPrs_->getCardinality();
+    int fh = basisPtrThr_->getCardinality();
+    int d  = cellCub_->getDimension();
+ 
+    // Initialize jacobians.
+    std::vector<std::vector<scalar_view>> J(d+2);
+    for (int i = 0; i < d+2; ++i)
+      J[i].resize(d+2);
+    for (int i = 0; i < d; ++i) {
+      for (int j = 0; j < d; ++j)
+        J[i][j] = scalar_view("jac", c, fv, fv);
+      J[d][i]   = scalar_view("jac", c, fp, fv);
+      J[i][d]   = scalar_view("jac", c, fv, fp);
+      J[d+1][i] = scalar_view("jac", c, fh, fv);
+      J[i][d+1] = scalar_view("jac", c, fv, fh);
+    }
+    J[d][d]     = scalar_view("jac", c, fp, fp);
+    J[d+1][d]   = scalar_view("jac", c, fh, fp);
+    J[d][d+1]   = scalar_view("jac", c, fp, fh);
+    J[d+1][d+1] = scalar_view("jac", c, fh, fh);
+
+    // Split u_coeff into components.
+    std::vector<scalar_view> U, Z;
+    fieldHelper_->splitFieldCoeff(U, u_coeff);
+    fieldHelper_->splitFieldCoeff(Z, z_coeff);
+
+    // APPLY DIRICHLET CONDITIONS
+    int numSideSets = bdryCellLocIds_.size();
+    const int numCubPerSide = bdryCub_->getNumPoints();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        // Control boundaries
+        if (i==0 || i==3) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            if (numCellsSide) {
+              // Get U and Z coefficients on Robin boundary
+              scalar_view u_coeff_bdry = getThermalBoundaryCoeff(U[d+1], i, j);
+              scalar_view z_coeff_bdry = getThermalBoundaryCoeff(Z[d+1], i, j);
+              // Evaluate U and Z on FE basis
+              scalar_view valU_eval_bdry("valU_eval_bdry", numCellsSide, numCubPerSide);
+              scalar_view valZ_eval_bdry("valZ_eval_bdry", numCellsSide, numCubPerSide);
+              feThrBdry_[i][j]->evaluateValue(valU_eval_bdry, u_coeff_bdry);
+              feThrBdry_[i][j]->evaluateValue(valZ_eval_bdry, z_coeff_bdry);
+              // Compute Stefan-Boltzmann residual
+              scalar_view robinVal1("robinVal1", numCellsSide, numCubPerSide);
+              scalar_view robinVal("robinVal", numCellsSide, fh, numCubPerSide);
+              computeThermalRobin(robinVal1,valU_eval_bdry,valZ_eval_bdry,i,j,1,1);
+              fst::scalarMultiplyDataField(robinVal,robinVal1,feThrBdry_[i][j]->N());
+              // Evaluate boundary integral
+              scalar_view robinJac("robinJac", numCellsSide, fh, fh);
+              fst::integrate(robinJac,robinVal,feThrBdry_[i][j]->NdetJ(),false);
+              for (int k = 0; k < numCellsSide; ++k) {
+                int cidx = bdryCellLocIds_[i][j][k];
+                for (int l = 0; l < fh; ++l) {
+                  for (int m = 0; m < fh; ++m)
+                    (J[d+1][d+1])(cidx,l,m) += robinJac(k,l,m);
+                }
+              }
+            }
+          }
+        }
+      }
+      // DIRICHLET CONDITIONS
+      for (int i = 0; i < numSideSets; ++i) {
+        // Velocity Boundary Conditions
+        if ( (i==0 || i==3) || (i==1 || i==2) || (i==4) || (i==5 || i==6 || i==8) ) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numVBdryDofs = fvidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numVBdryDofs; ++l) {
+                for (int m = 0; m < fv; ++m) {
+                  for (int n = 0; n < d; ++n) {
+                    for (int p = 0; p < d; ++p)
+                      (J[n][p])(cidx,fvidx_[j][l],m) = static_cast<Real>(0);
+                  }
+                }
+                for (int m = 0; m < d; ++m) {
+                  for (int n = 0; n < fp; ++n)
+                    (J[m][d])(cidx,fvidx_[j][l],n) = static_cast<Real>(0);
+                  for (int n = 0; n < fh; ++n)
+                    (J[m][d+1])(cidx,fvidx_[j][l],n) = static_cast<Real>(0);
+                }
+              }
+            }
+          }
+        }
+        if (i==7) {
+          int j = 0, l = 0;
+          if (bdryCellLocIds_[i][0].size() > 0) { 
+            int cidx = bdryCellLocIds_[i][0][0];
+            for (int m=0; m < fv; ++m) {
+              for (int n=0; n < d; ++n) {
+                for (int p=0; p < d; ++p)
+                  (J[n][p])(cidx,fvidx_[j][l],m) = static_cast<Real>(0);
+              }
+            }
+            for (int m=0; m < d; ++m) {
+              for (int n=0; n < fp; ++n)
+                (J[m][d])(cidx,fvidx_[j][l],n) = static_cast<Real>(0);
+              for (int n = 0; n < fh; ++n)
+                (J[m][d+1])(cidx,fvidx_[j][l],n) = static_cast<Real>(0);
+            }
+          }
+        }
+        // Thermal boundary conditions
+        if ( (i==4) || (i==5 || i==6 || i==8) ) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numHBdryDofs = (i==7) ? 1 : fhidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numHBdryDofs; ++l) {
+                for (int m = 0; m < fv; ++m) {
+                  for (int n = 0; n < d; ++n)
+                    (J[d+1][n])(cidx,fhidx_[j][l],m) = static_cast<Real>(0);
+                }
+                for (int m = 0; m < fp; ++m)
+                  (J[d+1][d])(cidx,fhidx_[j][l],m) = static_cast<Real>(0);
+                for (int m = 0; m < fh; ++m)
+                  (J[d+1][d+1])(cidx,fhidx_[j][l],m) = static_cast<Real>(0);
+              }
+            }
+          }
+        }
+        if (i==7) {
+          int j = 0, l = 0;
+          if (bdryCellLocIds_[i][0].size() > 0) { 
+            int cidx = bdryCellLocIds_[i][0][0];
+            for (int m=0; m < fv; ++m) {
+              for (int n=0; n < d; ++n) {
+                for (int p=0; p < d; ++p)
+                  (J[d+1][p])(cidx,fvidx_[j][l],m) = static_cast<Real>(0);
+              }
+            }
+            for (int m=0; m < fp; ++m)
+              (J[d+1][d])(cidx,fvidx_[j][l],m) = static_cast<Real>(0);
+            for (int m = 0; m < fh; ++m)
+              (J[d+1][d+1])(cidx,fvidx_[j][l],m) = static_cast<Real>(0);
+          }
+        }
+        // Pressure pinning
+        if (i==9) {
+          int j = 2, l = 0;
+          if (bdryCellLocIds_[i][0].size() > 0) {
+            int cidx = bdryCellLocIds_[i][0][0];
+            for (int m = 0; m < fv; ++m) {
+              for (int n = 0; n < d; ++n)
+                (J[d][n])(cidx,fpidx_[j][l],m) = static_cast<Real>(0);
+            }
+            for (int m = 0; m < fp; ++m)
+              (J[d][d])(cidx,fpidx_[j][l],m) = static_cast<Real>(0);
+            for (int m = 0; m < fh; ++m)
+              (J[d][d+1])(cidx,fpidx_[j][l],m) = static_cast<Real>(0);
+          }
+        }
+      }
+    }
+
+    // Combine the jacobians.
+    fieldHelper_->combineFieldCoeff(jac, J);
+  }
+
+  void Hessian_11(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+//    throw Exception::NotImplemented("");
+    // Retrieve dimensions.
+    int c  = u_coeff.extent_int(0);
+    int p  = cellCub_->getNumPoints();
+    int fv = basisPtrVel_->getCardinality();
+    int fp = basisPtrPrs_->getCardinality();
+    int fh = basisPtrThr_->getCardinality();
+    int d  = cellCub_->getDimension();
+ 
+    // Initialize jacobians.
+    std::vector<std::vector<scalar_view>> H(d+2);
+    for (int i = 0; i < d+2; ++i)
+      H[i].resize(d+2);
+    for (int i = 0; i < d; ++i) {
+      for (int j = 0; j < d; ++j)
+        H[i][j] = scalar_view("hess", c, fv, fv);
+    }
+    for (int i = 0; i < d; ++i) {
+      H[d][i]   = scalar_view("hess", c, fp, fv);
+      H[i][d]   = scalar_view("hess", c, fv, fp);
+      H[d+1][i] = scalar_view("hess", c, fh, fv);
+      H[i][d+1] = scalar_view("hess", c, fv, fh);
+    }
+    H[d][d]     = scalar_view("hess", c, fp, fp);
+    H[d+1][d]   = scalar_view("hess", c, fh, fp);
+    H[d][d+1]   = scalar_view("hess", c, fp, fh);
+    H[d+1][d+1] = scalar_view("hess", c, fh, fh);
+
+    // Split l_coeff into components.
+    std::vector<scalar_view> L;
+    fieldHelper_->splitFieldCoeff(L, l_coeff);
+
+    // APPLY DIRICHLET CONDITIONS
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      // DIRICHLET CONDITIONS
+      for (int i = 0; i < numSideSets; ++i) {
+        // Velocity boundary conditions
+        if ( (i==0 || i==3) || (i==1 || i==2) || (i==4) || (i==5 || i==6 || i==8) ) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numVBdryDofs = fvidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numVBdryDofs; ++l) {
+                for (int m = 0; m < d; ++m) {
+                  (L[m])(cidx,fvidx_[j][l]) = static_cast<Real>(0);
+                }
+              }
+            }
+          }
+        }
+        if (i==7) {
+          int j = 0, l = 0;
+          if (bdryCellLocIds_[i][0].size() > 0) { 
+            int cidx = bdryCellLocIds_[i][0][0];
+            for (int m=0; m < d; ++m) {
+              (L[m])(cidx,fvidx_[j][l]) = static_cast<Real>(0);
+            }
+          }
+        }
+        // Inflow boundaries
+        if ( (i==4) || (i==5 || i==6 || i==8) ) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numHBdryDofs = fhidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numHBdryDofs; ++l) {
+                (L[d+1])(cidx,fhidx_[j][l]) = static_cast<Real>(0);
+              }
+            }
+          }
+        }
+        if (i==7) {
+          int j = 0, l = 0;
+          if (bdryCellLocIds_[i][0].size() > 0) { 
+            int cidx = bdryCellLocIds_[i][0][0];
+            (L[d+1])(cidx,fhidx_[j][l]) = static_cast<Real>(0);
+          }
+        }
+        // Pressure pinning
+        if (i==9) {
+          int j = 2, l = 0;
+          if (bdryCellLocIds_[i][0].size() > 0) { 
+            int cidx = bdryCellLocIds_[i][0][0];
+            (L[d])(cidx,fpidx_[j][l]) = static_cast<Real>(0);
+          }
+        }
+      }
+    }
+
+    // Evaluate/interpolate finite element fields on cells.
+    std::vector<scalar_view> valVel_vec(d);
+    for (int i = 0; i < d; ++i) {
+      valVel_vec[i] = scalar_view("valVel_vec", c, p);
+      feVel_->evaluateValue(valVel_vec[i], L[i]);
+    }
+    scalar_view valHeat_eval("valHeat_eval", c, p);
+    feThr_->evaluateValue(valHeat_eval, L[3]);
+
+    // Compute nonlinear terms in the Navier-Stokes equations.
+    std::vector<scalar_view> valVelPhi_vec(d);
+    for (int i = 0; i < d; ++i) {
+      valVelPhi_vec[i] = scalar_view("valVelPhi_vec", c, fv, p);
+      fst::scalarMultiplyDataField(valVelPhi_vec[i], valVel_vec[i], feVel_->N());
+    }
+
+    // Compute nonlinear terms in the thermal equation.
+    scalar_view valHeatVelPhi("valHeatVelPhi", c, fv, p);
+    fst::scalarMultiplyDataField(valHeatVelPhi, valHeat_eval, feVel_->N());
+
+    /*** Evaluate weak form of the Hessian. ***/
+    for (int i = 0; i < d; ++i) {
+      // velocity equation.
+      for (int j = 0; j < d; ++j) {
+        fst::integrate(H[i][j],valVelPhi_vec[j],feVel_->DNDdetJ(i),false);
+        fst::integrate(H[i][j],feVel_->DNDdetJ(j),valVelPhi_vec[i],true);
+      }
+      fst::integrate(H[i][d+1],valHeatVelPhi,feThr_->DNDdetJ(i),false);
+      // Thermal equation.
+      fst::integrate(H[d+1][i],feThr_->DNDdetJ(i),valHeatVelPhi,false);
+    }
+
+    // Combine the Hessians.
+    fieldHelper_->combineFieldCoeff(hess, H);
+  }
+
+  void Hessian_12(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_ThermalFluids::Hessian_12): Hessian is zero.");
+  }
+
+  void Hessian_21(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_ThermalFluids::Hessian_21): Hessian is zero.");
+  }
+
+  void Hessian_22(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_ThermalFluids::Hessian_22): Hessian is zero.");
+  }
+
+  void RieszMap_1(scalar_view & riesz) override {
+    // Retrieve dimensions.
+    int c  = feVel_->N().extent_int(0);
+    int fv = basisPtrVel_->getCardinality();
+    int fp = basisPtrPrs_->getCardinality();
+    int fh = basisPtrThr_->getCardinality();
+    int d  = cellCub_->getDimension();
+ 
+    // Initialize jacobians.
+    std::vector<std::vector<scalar_view>> J(d+2);
+    for (int i = 0; i < d+2; ++i)
+      J[i].resize(d+2);
+    for (int i = 0; i < d; ++i) {
+      for (int j = 0; j < d; ++j)
+        J[i][j] = scalar_view("riesz1", c, fv, fv);
+      J[d][i]   = scalar_view("riesz1", c, fp, fv);
+      J[i][d]   = scalar_view("riesz1", c, fv, fp);
+      J[d+1][i] = scalar_view("riesz1", c, fh, fv);
+      J[i][d+1] = scalar_view("riesz1", c, fv, fh);
+    }
+    J[d][d]     = scalar_view("riesz1", c, fp, fp);
+    J[d+1][d]   = scalar_view("riesz1", c, fh, fp);
+    J[d][d+1]   = scalar_view("riesz1", c, fp, fh);
+    J[d+1][d+1] = scalar_view("riesz1", c, fh, fh);
+
+    for (int i = 0; i < d; ++i) {
+      Kokkos::deep_copy(J[i][i],feVel_->stiffMat());
+      rst::add(J[i][i],feVel_->massMat());
+    }
+    Kokkos::deep_copy(J[d][d],fePrs_->massMat());
+    Kokkos::deep_copy(J[d+1][d+1],feThr_->stiffMat());
+    rst::add(J[d+1][d+1],feThr_->massMat());
+
+    // Combine the jacobians.
+    fieldHelper_->combineFieldCoeff(riesz, J);
+  }
+
+  void RieszMap_2(scalar_view & riesz) override {
+    // Retrieve dimensions.
+    int c  = feVel_->N().extent_int(0);
+    int fv = basisPtrVel_->getCardinality();
+    int fp = basisPtrPrs_->getCardinality();
+    int fh = basisPtrThr_->getCardinality();
+    int d  = cellCub_->getDimension();
+ 
+    // Initialize jacobians.
+    std::vector<std::vector<scalar_view>> J(d+2);
+    for (int i = 0; i < d+2; ++i)
+      J[i].resize(d+2);
+    for (int i = 0; i < d; ++i) {
+      for (int j = 0; j < d; ++j)
+        J[i][j] = scalar_view("reisz2", c, fv, fv);
+      J[d][i]   = scalar_view("reisz2", c, fp, fv);
+      J[i][d]   = scalar_view("reisz2", c, fv, fp);
+      J[d+1][i] = scalar_view("reisz2", c, fh, fv);
+      J[i][d+1] = scalar_view("reisz2", c, fv, fh);
+    }
+    J[d][d]     = scalar_view("reisz2", c, fp, fp);
+    J[d+1][d]   = scalar_view("reisz2", c, fh, fp);
+    J[d][d+1]   = scalar_view("reisz2", c, fp, fh);
+    J[d+1][d+1] = scalar_view("reisz2", c, fh, fh);
+
+    for (int i = 0; i < d; ++i)
+      Kokkos::deep_copy(J[i][i],feVel_->massMat());
+    Kokkos::deep_copy(J[d][d],fePrs_->massMat());
+    Kokkos::deep_copy(J[d+1][d+1],feThr_->massMat());
+
+    // Combine the jacobians.
+    fieldHelper_->combineFieldCoeff(riesz, J);
+  }
+
+  std::vector<basis_ptr> getFields() override {
+    return basisPtrs_;
+  }
+
+  void setCellNodes(const scalar_view &volCellNodes,
+                    const std::vector<std::vector<scalar_view>> &bdryCellNodes,
+                    const std::vector<std::vector<std::vector<int>>> &bdryCellLocIds) override {
+    volCellNodes_ = volCellNodes;
+    bdryCellNodes_ = bdryCellNodes;
+    bdryCellLocIds_ = bdryCellLocIds;
+    // Finite element definition.
+    feVel_ = ROL::makePtr<fe_type>(volCellNodes_,basisPtrVel_,cellCub_);
+    fePrs_ = ROL::makePtr<fe_type>(volCellNodes_,basisPtrPrs_,cellCub_);
+    feThr_ = ROL::makePtr<fe_type>(volCellNodes_,basisPtrThr_,cellCub_);
+    // Get boundary degrees of freedom.
+    fvidx_ = feVel_->getBoundaryDofs();
+    fpidx_ = fePrs_->getBoundaryDofs();
+    fhidx_ = feThr_->getBoundaryDofs();
+    // Construct boundary FEs
+    const int numSideSets = bdryCellNodes.size();
+    feVelBdry_.resize(numSideSets);
+    fePrsBdry_.resize(numSideSets);
+    feThrBdry_.resize(numSideSets);
+    for (int i = 0; i < numSideSets; ++i) {
+      int numLocSides = bdryCellNodes[i].size();
+      feVelBdry_[i].resize(numLocSides);
+      fePrsBdry_[i].resize(numLocSides);
+      feThrBdry_[i].resize(numLocSides);
+      for (int j = 0; j < numLocSides; ++j) {
+        if (bdryCellNodes[i][j] != scalar_view()) {
+          feVelBdry_[i][j] = ROL::makePtr<fe_type>(bdryCellNodes[i][j],basisPtrVel_,bdryCub_,j);
+          fePrsBdry_[i][j] = ROL::makePtr<fe_type>(bdryCellNodes[i][j],basisPtrPrs_,bdryCub_,j);
+          feThrBdry_[i][j] = ROL::makePtr<fe_type>(bdryCellNodes[i][j],basisPtrThr_,bdryCub_,j);
+        }
+      }
+    }
+  }
+
+  void setFieldPattern(const std::vector<std::vector<int>> & fieldPattern) override {
+    fieldPattern_ = fieldPattern;
+    fieldHelper_ = ROL::makePtr<FieldHelper<Real,DeviceType>>(numFields_, numDofs_, numFieldDofs_, fieldPattern_);
+  }
+
+  const ROL::Ptr<fe_type> getVelocityFE(void) const {
+    return feVel_;
+  }
+
+  const ROL::Ptr<fe_type> getPressureFE(void) const {
+    return fePrs_;
+  }
+
+  const ROL::Ptr<fe_type> getThermalFE(void) const {
+    return feThr_;
+  }
+
+  const std::vector<std::vector<ROL::Ptr<fe_type>>> getVelocityBdryFE(void) const {
+    return feVelBdry_;
+  }
+
+  const std::vector<std::vector<ROL::Ptr<fe_type>>> getPressureBdryFE(void) const {
+    return fePrsBdry_;
+  }
+
+  const std::vector<std::vector<ROL::Ptr<fe_type>>> getThermalBdryFE(void) const {
+    return feThrBdry_;
+  }
+
+  const std::vector<std::vector<std::vector<int>>> getBdryCellLocIds(void) const {
+    return bdryCellLocIds_;
+  }
+
+  const ROL::Ptr<FieldHelper<Real,DeviceType>> getFieldHelper(void) const {
+    return fieldHelper_;
+  }
+
+}; // PDE_ThermalFluids
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/thermal-fluids/pde_thermal-fluids_ex03K.hpp b/packages/rol/example/PDE-OPT/thermal-fluids/pde_thermal-fluids_ex03K.hpp
new file mode 100644
index 000000000000..4071922474a8
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/thermal-fluids/pde_thermal-fluids_ex03K.hpp
@@ -0,0 +1,1270 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  pde_thermal-fluids.hpp
+    \brief Implements the local PDE interface for the Navier-Stokes control problem.
+*/
+
+#ifndef PDE_THERMALFLUIDS_EX03K_HPP
+#define PDE_THERMALFLUIDS_EX03K_HPP
+
+#include "../TOOLS/pdeK.hpp"
+#include "../TOOLS/feK.hpp"
+#include "../TOOLS/fieldhelperK.hpp"
+
+#include "Intrepid2_HGRAD_QUAD_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_QUAD_C2_FEM.hpp"
+#include "Intrepid2_DefaultCubatureFactory.hpp"
+#include "Intrepid2_FunctionSpaceTools.hpp"
+#include "Intrepid2_CellTools.hpp"
+
+#include "ROL_Ptr.hpp"
+
+template <class Real, class DeviceType>
+class PDE_ThermalFluids_ex03 : public PDE<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  // Finite element basis information
+  basis_ptr basisPtrVel_, basisPtrPrs_, basisPtrThr_;
+  std::vector<basis_ptr> basisPtrs_;
+  // Cell cubature information
+  ROL::Ptr<Intrepid2::Cubature<DeviceType,Real,Real>> cellCub_, bdryCub_;
+  // Cell node information
+  scalar_view volCellNodes_;
+  std::vector<std::vector<scalar_view>> bdryCellNodes_;
+  std::vector<std::vector<std::vector<int>>> bdryCellLocIds_;
+  // Finite element definition
+  ROL::Ptr<fe_type> feVel_, fePrs_, feThr_;
+  std::vector<std::vector<ROL::Ptr<fe_type>>> feVelBdry_, fePrsBdry_, feThrBdry_;
+  // Local degrees of freedom on boundary, for each side of the reference cell (first index).
+  std::vector<std::vector<int>> fvidx_, fpidx_, fhidx_;
+  // Coordinates of degrees freedom on boundary cells.
+  // Indexing:  [sideset number][local side id](cell number, value at dof)
+  std::vector<std::vector<scalar_view>> bdryCellVDofValues_, bdryCellTDofValues_;
+  // Field pattern, offsets, etc.
+  std::vector<std::vector<int>> fieldPattern_;  // local Field/DOF pattern; set from DOF manager 
+  int numFields_;                                // number of fields (equations in the PDE)
+  int numDofs_;                                  // total number of degrees of freedom for all (local) fields
+  std::vector<int> offset_;                      // for each field, a counting offset
+  std::vector<int> numFieldDofs_;                // for each field, number of degrees of freedom
+  
+  // Problem parameters.
+  Real Re_, Pr_, Gr_, h_;
+  const Real grav_;
+  int Nbottom_, Nleft_, Nright_;
+  Real ReScale_, PrScale_, GrScale_, hScale_, TScale_;
+  bool pinPressure_;
+
+  ROL::Ptr<FieldHelper<Real,DeviceType>> fieldHelper_;
+
+  Real velocityDirichletFunc(const std::vector<Real> & coords, int sideset, int locSideId, int dir) const {
+    Real val(0);
+    if ((sideset==3) && (dir==1)) {
+      const Real one(1), two(2), three(3), four(4), x = coords[0];
+      if (x <= one/three) {
+        val = two*(one/three - x)*x;
+      }
+      else if (x > one/three && x < two/three) {
+        val = -four*(x-one/three)*(two/three-x);
+      }
+      else if (x >= two/three) {
+        val = two*(x-two/three)*(one-x);
+      }
+    }
+    return val;
+  }
+
+  Real thermalDirichletFunc(const std::vector<Real> & coords, int sideset, int locSideId) const {
+    const std::vector<Real> param = PDE<Real,DeviceType>::getParameter();
+    Real val(0);
+    if (sideset==0) {
+      val = static_cast<Real>(1);
+      if (param.size()) {
+        Real root2(std::sqrt(2.0)), pi(M_PI), ln2(std::log(2.0));
+        for (int i = 0; i < Nbottom_; ++i) {
+          Real di(i+1);
+          val += TScale_ * param[i]/ln2 * (root2 * std::sin(di * pi * coords[0]))/(di * pi);
+        }
+      }
+    }
+    else if (sideset==5) {
+      val = static_cast<Real>(0);
+    }
+    return val;
+  }
+
+  void computeDirichlet(void) {
+    // Compute Dirichlet values at DOFs.
+    int d  = basisPtrVel_->getBaseCellTopology().getDimension();
+    int fv = basisPtrVel_->getCardinality();
+    int ft = basisPtrThr_->getCardinality();
+    int numSidesets = bdryCellLocIds_.size();
+    bdryCellVDofValues_.resize(numSidesets);
+    bdryCellTDofValues_.resize(numSidesets);
+    for (int i=0; i<numSidesets; ++i) {
+      int numLocSides = bdryCellLocIds_[i].size();
+      bdryCellVDofValues_[i].resize(numLocSides);
+      bdryCellTDofValues_[i].resize(numLocSides);
+      for (int j=0; j<numLocSides; ++j) {
+        int c = bdryCellLocIds_[i][j].size();
+        bdryCellVDofValues_[i][j] = scalar_view("bdryCellVDofValues", c, fv, d);
+        bdryCellTDofValues_[i][j] = scalar_view("bdryCellTDofValues", c, ft);
+        scalar_view Vcoords("Vcoords", c, fv, d);
+        scalar_view Tcoords("Tcoords", c, ft, d);
+        if (c > 0) {
+          feVel_->computeDofCoords(Vcoords, bdryCellNodes_[i][j]);
+          feThr_->computeDofCoords(Tcoords, bdryCellNodes_[i][j]);
+        }
+        for (int k=0; k<c; ++k) {
+          for (int l=0; l<fv; ++l) {
+            std::vector<Real> dofpoint(d);
+            //std::cout << "Sideset " << i << " LocalSide " << j << "  Cell " << k << "  Field " << l << "  Coord ";
+            for (int m=0; m<d; ++m) {
+              dofpoint[m] = Vcoords(k, l, m);
+              //std::cout << dofpoint[m] << "  ";
+            }
+
+            for (int m=0; m<d; ++m) {
+              (bdryCellVDofValues_[i][j])(k, l, m) = velocityDirichletFunc(dofpoint, i, j, m);
+              //std::cout << "  " << m << "-Value " << DirichletFunc(dofpoint, i, j, m);
+            }
+            //std::cout << std::endl;
+          }
+          for (int l=0; l<ft; ++l) {
+            std::vector<Real> dofpoint(d);
+            //std::cout << "Sideset " << i << " LocalSide " << j << "  Cell " << k << "  Field " << l << "  Coord ";
+            for (int m=0; m<d; ++m) {
+              dofpoint[m] = Tcoords(k, l, m);
+              //std::cout << dofpoint[m] << "  ";
+            }
+
+            (bdryCellTDofValues_[i][j])(k, l) = thermalDirichletFunc(dofpoint, i, j);
+            //std::cout << "    Value " << DirichletFunc(dofpoint, i, j);
+            //std::cout << std::endl;
+          }
+        }
+      }
+    }
+  }
+
+  Real ReynoldsNumber(void) const {
+    const std::vector<Real> param = PDE<Real,DeviceType>::getParameter();
+    Real val = Re_;
+    if (param.size()) {
+      int offset = Nbottom_ + Nright_ + Nleft_;
+      Real one(1);
+      val /= (one + ReScale_*param[offset]);
+    }
+    return val;
+  }
+
+  Real PrandtlNumber(void) const {
+    const std::vector<Real> param = PDE<Real,DeviceType>::getParameter();
+    Real val = Pr_;
+    if (param.size()) {
+      int offset = Nbottom_ + Nright_ + Nleft_;
+      Real one(1);
+      val *= (one + ReScale_*param[offset])/(one + PrScale_*param[offset+1]);
+    }
+    return val;
+  }
+
+  Real GrashofNumber(void) const {
+    const std::vector<Real> param = PDE<Real,DeviceType>::getParameter();
+    Real val = Gr_;
+    if (param.size()) {
+      int offset = Nbottom_ + Nright_ + Nleft_;
+      Real one(1), muscale = one + ReScale_*param[offset];
+      val *= (one + GrScale_*param[offset+2])/(muscale*muscale);
+    }
+    return val;
+  }
+  
+  Real ThicknessNumber(const std::vector<Real> &x, const int sideset) const {
+    const std::vector<Real> param = PDE<Real,DeviceType>::getParameter();
+    Real val = h_;
+    if ( param.size()) {
+      if ( sideset == 2 ) {
+        int offset = Nbottom_;
+        Real root2(std::sqrt(2.0)), pi(M_PI), ln2(std::log(2.0));
+        for (int i = 0; i < Nleft_; ++i) {
+          Real di(i+1);
+          val += hScale_ * param[offset + i]/ln2 * (root2 * std::sin(di * pi * x[1]))/(di * pi);
+        }
+      }
+      else if ( sideset == 1 ) {
+        int offset = Nbottom_ + Nleft_;
+        Real root2(std::sqrt(2.0)), pi(M_PI), ln2(std::log(2.0));
+        for (int i = 0; i < Nright_; ++i) {
+          Real di(i+1);
+          val += hScale_ * param[offset + i]/ln2 * (root2 * std::sin(di * pi * x[1]))/(di * pi);
+        }
+      }
+    }
+    return val;
+  }
+
+  Real viscosityFunc(const std::vector<Real> &x) const {
+    Real ReynoldsNum = ReynoldsNumber();
+    return static_cast<Real>(1)/ReynoldsNum;
+  }
+
+  Real conductivityFunc(const std::vector<Real> &x) const {
+    Real ReynoldsNum = ReynoldsNumber();
+    Real PrandtlNum = PrandtlNumber();
+    return static_cast<Real>(1)/(ReynoldsNum*PrandtlNum);
+  }
+
+  Real gravityFunc(const std::vector<Real> &x) const {
+    Real ReynoldsNum = ReynoldsNumber();
+    Real GrashofNum = GrashofNumber();
+    return grav_*GrashofNum/(ReynoldsNum*ReynoldsNum);
+  }
+
+  Real thermalRobinFunc(const std::vector<Real> &x, const int sideset) const {
+    return ThicknessNumber(x, sideset) * conductivityFunc(x);
+  }
+
+  void computeCoefficients(scalar_view &nu, scalar_view &grav, scalar_view &kappa) const {
+    int c = feVel_->gradN().extent_int(0);
+    int p = feVel_->gradN().extent_int(2);
+    int d = feVel_->gradN().extent_int(3);
+    std::vector<Real> pt(d);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for ( int k = 0; k < d; ++k) {
+          pt[k] = (feVel_->cubPts())(i,j,k);
+        }
+        // Compute spatially dependent coefficients
+        nu(i,j)    = viscosityFunc(pt);
+        grav(i,j)  = gravityFunc(pt);
+        kappa(i,j) = conductivityFunc(pt);
+      }
+    }
+  }
+
+  void computeThermalRobin(scalar_view &robin,
+                           const scalar_view u,
+                           const scalar_view z,
+                           int sideset,
+                           int locSideId,
+                           int deriv = 0,
+                           int component = 1) const {
+    const int c = u.extent_int(0);
+    const int p = u.extent_int(1);
+    const int d = feThr_->gradN().extent_int(3);
+    std::vector<Real> x(d);
+    Real h(0);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k) {
+          x[k] = (feThrBdry_[sideset][locSideId]->cubPts())(i,j,k);
+        }
+        h = thermalRobinFunc(x, sideset);
+        if ( deriv == 0 )
+          robin(i,j) = h * (u(i,j) - z(i,j));
+        else if ( deriv == 1 )
+          robin(i,j) = ((component==0) ? h : -h);
+        else
+          robin(i,j) = static_cast<Real>(0);
+      }
+    }
+  }
+
+  scalar_view getThermalBoundaryCoeff(const scalar_view cell_coeff, int sideSet, int cell) const {
+    std::vector<int> bdryCellLocId = bdryCellLocIds_[sideSet][cell];
+    const int numCellsSide = bdryCellLocId.size();
+    const int f = basisPtrThr_->getCardinality();
+    
+    scalar_view bdry_coeff("bdry_coeff", numCellsSide, f);
+    for (int i = 0; i < numCellsSide; ++i) {
+      for (int j = 0; j < f; ++j)
+        bdry_coeff(i, j) = cell_coeff(bdryCellLocId[i], j);
+    }
+    return bdry_coeff;
+  }
+
+public:
+  PDE_ThermalFluids_ex03(ROL::ParameterList &parlist) : grav_(-1) {
+    // Finite element fields -- NOT DIMENSION INDEPENDENT!
+    basisPtrVel_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C2_FEM<DeviceType,Real,Real>>();
+    basisPtrPrs_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C1_FEM<DeviceType,Real,Real>>();
+    basisPtrThr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C2_FEM<DeviceType,Real,Real>>();
+    // Volume quadrature rules.
+    shards::CellTopology cellType = basisPtrVel_->getBaseCellTopology();     // get the cell type from any basis
+    Intrepid2::DefaultCubatureFactory cubFactory;                            // create cubature factory
+    int cubDegree = parlist.sublist("Problem").get("Cubature Degree", 4);    // set cubature degree, e.g., 4
+    cellCub_ = cubFactory.create<DeviceType,Real,Real>(cellType, cubDegree); // create default cubature
+    // Boundary quadrature rules.
+    int d = cellType.getDimension();
+    shards::CellTopology bdryCellType = cellType.getCellTopologyData(d-1, 0);
+    int bdryCubDegree = parlist.sublist("Problem").get("Boundary Cubature Degree",4); // set cubature degree, e.g., 4
+    bdryCub_ = cubFactory.create<DeviceType,Real,Real>(bdryCellType, bdryCubDegree);
+    // Fill finite element basis container
+    basisPtrs_.clear();
+    basisPtrs_.resize(d, basisPtrVel_); // Velocity
+    basisPtrs_.push_back(basisPtrPrs_); // Pressure
+    basisPtrs_.push_back(basisPtrThr_); // Heat
+
+    // Other problem parameters.
+    Re_      = parlist.sublist("Problem").get("Reynolds Number",    200.0);
+    Pr_      = parlist.sublist("Problem").get("Prandtl Number",     0.72);
+    Gr_      = parlist.sublist("Problem").get("Grashof Number",     40000.0);
+    h_       = parlist.sublist("Problem").get("Robin Coefficient",  1.0);
+    // Stochastic Expansion Information.
+    Nbottom_ = parlist.sublist("Problem").get("Bottom KL Truncation Order",5);
+    Nleft_   = parlist.sublist("Problem").get("Left KL Truncation Order",5);
+    Nright_  = parlist.sublist("Problem").get("Right KL Truncation Order",5);
+    // Stochastic scales
+    ReScale_ = parlist.sublist("Problem").get("Reynolds Number Noise Scale",0.05);
+    PrScale_ = parlist.sublist("Problem").get("Prandtl Number Noise Scale",0.05);
+    GrScale_ = parlist.sublist("Problem").get("Grashof Number Noise Scale",0.05);
+    hScale_  = parlist.sublist("Problem").get("Robin Noise Scale",0.2);
+    TScale_  = parlist.sublist("Problem").get("Bottom Temperature Noise Scale",0.2);
+    // Pin pressure
+    pinPressure_ = parlist.sublist("Problem").get("Pin Pressure",true);
+
+    numDofs_ = 0;
+    numFields_ = basisPtrs_.size();
+    offset_.resize(numFields_);
+    numFieldDofs_.resize(numFields_);
+    for (int i=0; i<numFields_; ++i) {
+      if (i==0)
+        offset_[i] = 0;
+      else
+        offset_[i] = offset_[i-1] + basisPtrs_[i-1]->getCardinality();
+      numFieldDofs_[i] = basisPtrs_[i]->getCardinality();
+      numDofs_ += numFieldDofs_[i];
+    }
+  }
+
+  void residual(scalar_view & res,
+                const scalar_view u_coeff,
+                const scalar_view z_coeff = scalar_view(),
+                const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    int c  = u_coeff.extent_int(0);
+    int p  = cellCub_->getNumPoints();
+    int fv = basisPtrVel_->getCardinality();
+    int fp = basisPtrPrs_->getCardinality();
+    int fh = basisPtrThr_->getCardinality();
+    int d  = cellCub_->getDimension();
+ 
+    // Initialize residuals.
+    std::vector<scalar_view> R(d+2);
+    for (int i = 0; i < d; ++i)
+      R[i] = scalar_view("res", c, fv);
+    R[d]   = scalar_view("res", c, fp);
+    R[d+1] = scalar_view("res", c, fh);
+
+    // Split u_coeff into components.
+    std::vector<scalar_view> U, Z;
+    fieldHelper_->splitFieldCoeff(U, u_coeff);
+    fieldHelper_->splitFieldCoeff(Z, z_coeff);
+
+    // Evaluate problem coefficients at quadrature points.
+    scalar_view nu("nu", c, p);
+    scalar_view grav("grav", c, p);
+    scalar_view kappa("kappa", c, p);
+    computeCoefficients(nu,grav,kappa);
+
+    // Evaluate/interpolate finite element fields on cells.
+    std::vector<scalar_view> valVel_vec(d);
+    for (int i = 0; i < d; ++i) {
+      valVel_vec[i] = scalar_view("valVel_vec", c, p);
+      feVel_->evaluateValue(valVel_vec[i], U[i]);
+    }
+    scalar_view valPres_eval("valPres_eval", c, p);
+    scalar_view valHeat_eval("valHeat_eval", c, p);
+    fePrs_->evaluateValue(valPres_eval, U[d]);
+    feThr_->evaluateValue(valHeat_eval, U[d+1]);
+    // Evaluate/interpolate gradient of finite element fields on cells.
+    std::vector<scalar_view> gradVel_vec(d);
+    for (int i = 0; i < d; ++i) {
+      gradVel_vec[i] = scalar_view("gradVel_vec", c, p, d);
+      feVel_->evaluateGradient(gradVel_vec[i], U[i]);
+    }
+    scalar_view gradHeat_eval("gradHeat_eval", c, p, d);
+    feThr_->evaluateGradient(gradHeat_eval, U[d+1]);
+
+    // Assemble the velocity vector and its divergence.
+    scalar_view valVel_eval("valVel_eval", c, p, d);
+    scalar_view divVel_eval("divVel_eval", c, p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        divVel_eval(i,j) = static_cast<Real>(0);
+        for (int k = 0; k < d; ++k) {
+          valVel_eval(i,j,k) = (valVel_vec[k])(i,j);
+          divVel_eval(i,j)  += (gradVel_vec[k])(i,j,k);
+        }
+      }
+    }
+
+    /**************************************************************************/
+    /*** NAVIER STOKES ********************************************************/
+    /**************************************************************************/
+    std::vector<scalar_view> nuGradVel_vec(d), valVelDotgradVel_vec(d);
+    for (int i = 0; i < d; ++i) {
+      // Multiply velocity gradients with viscosity.
+      nuGradVel_vec[i] = scalar_view("nuGradVel_vec", c, p, d);
+      fst::tensorMultiplyDataData(nuGradVel_vec[i], nu, gradVel_vec[i]);
+      // Compute nonlinear terms in the Navier-Stokes equations.
+      valVelDotgradVel_vec[i] = scalar_view("valVelDotgradVel_vec", c, p);
+      fst::dotMultiplyDataData(valVelDotgradVel_vec[i], valVel_eval, gradVel_vec[i]);
+    }
+    // Negative pressure
+    rst::scale(valPres_eval,static_cast<Real>(-1));
+    // Compute gravitational force.
+    scalar_view gravForce("gravForce", c, p);
+    fst::scalarMultiplyDataData(gravForce, grav, valHeat_eval);
+
+    /**************************************************************************/
+    /*** THERMAL **************************************************************/
+    /**************************************************************************/
+    scalar_view kappaGradHeat_eval("kappaGradHeat_eval", c, p, d);
+    scalar_view velDotGradHeat_eval("velDotGradHeat_eval", c, p);
+    // Multiply temperature gradient with conductivity
+    fst::tensorMultiplyDataData(kappaGradHeat_eval, kappa, gradHeat_eval);
+    // Dot multiply scaled velocity with temperature gradient
+    fst::dotMultiplyDataData(velDotGradHeat_eval, valVel_eval, gradHeat_eval);
+
+    /**************************************************************************/
+    /*** EVALUATE WEAK FORM OF RESIDUAL ***************************************/
+    /**************************************************************************/
+    // Velocity equation.
+    for (int i = 0; i < d; ++i) {
+      fst::integrate(R[i],nuGradVel_vec[i],feVel_->gradNdetJ(),false);
+      fst::integrate(R[i],valVelDotgradVel_vec[i],feVel_->NdetJ(),true);
+      fst::integrate(R[i],valPres_eval,feVel_->DNDdetJ(i),true);
+    }
+    // Apply gravitational force.
+    fst::integrate(R[d-1],gravForce,feVel_->NdetJ(),true);
+    // Pressure equation.
+    fst::integrate(R[d],divVel_eval,fePrs_->NdetJ(),false);
+    rst::scale(R[d],static_cast<Real>(-1));
+    // Heat equation.
+    fst::integrate(R[d+1],kappaGradHeat_eval,feThr_->gradNdetJ(),false);
+    fst::integrate(R[d+1],velDotGradHeat_eval,feThr_->NdetJ(),true);
+
+    /**************************************************************************/
+    /*** APPLY BOUNDARY CONDITIONS ********************************************/
+    /**************************************************************************/
+    // -->        Control boundaries: i=1,2
+    // -->        No slip boundaries: i=0,1,2
+    // --> Outflow/Inflow boundaries: i=3
+    // -->              Pressure Pin: i=4
+    int numSideSets = bdryCellLocIds_.size();
+    const int numCubPerSide = bdryCub_->getNumPoints();
+    if (numSideSets > 0) {
+      // ROBIN CONDITIONS
+      for (int i = 0; i < numSideSets; ++i) {
+        // Control boundaries
+        if (i==1 || i==2) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            if (numCellsSide) {
+              // Get U and Z coefficients on Robin boundary
+              scalar_view u_coeff_bdry = getThermalBoundaryCoeff(U[d+1], i, j);
+              scalar_view z_coeff_bdry = getThermalBoundaryCoeff(Z[d+1], i, j);
+              // Evaluate U and Z on FE basis
+              scalar_view valU_eval_bdry("valU_eval_bdry", numCellsSide, numCubPerSide);
+              scalar_view valZ_eval_bdry("valZ_eval_bdry", numCellsSide, numCubPerSide);
+              feThrBdry_[i][j]->evaluateValue(valU_eval_bdry, u_coeff_bdry);
+              feThrBdry_[i][j]->evaluateValue(valZ_eval_bdry, z_coeff_bdry);
+              // Compute Stefan-Boltzmann residual
+              scalar_view robinVal("robinVal", numCellsSide, numCubPerSide);
+              computeThermalRobin(robinVal,valU_eval_bdry,valZ_eval_bdry,i,j,0);
+              // Evaluate boundary integral
+              scalar_view robinRes("robinRes", numCellsSide, fh);
+              fst::integrate(robinRes,robinVal,feThrBdry_[i][j]->NdetJ(),false);
+              for (int k = 0; k < numCellsSide; ++k) {
+                int cidx = bdryCellLocIds_[i][j][k];
+                // Thermal boundary conditions.
+                for (int l = 0; l < fh; ++l)
+                  (R[d+1])(cidx,l) += robinRes(k,l);
+              }
+            }
+          }
+        }
+      }
+      // DIRICHLET CONDITIONS
+      computeDirichlet();
+      // Velocity Boundary Conditions
+      for (int i = 0; i < numSideSets; ++i) {
+        if (i<4) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numVBdryDofs = fvidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numVBdryDofs; ++l) {
+                for (int m = 0; m < d; ++m) {
+                  (R[m])(cidx,fvidx_[j][l]) = (U[m])(cidx,fvidx_[j][l]) - (bdryCellVDofValues_[i][j])(k,fvidx_[j][l],m);
+                }
+              }
+            }
+          }
+        }
+        // Thermal Boundary Conditions
+        if ( (i==0) || (i==5) ) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();   // Number of sides per cell
+          for (int j = 0; j < numLocalSideIds; ++j) {        // Loop over sides of cell: Quad = {0, 1, 2, 3}
+            int numCellsSide = bdryCellLocIds_[i][j].size(); // Number of cells with side j
+            int numHBdryDofs = fhidx_[j].size();             // Number of thermal boundary DOFs
+            for (int k = 0; k < numCellsSide; ++k) {         // Loop over cells with side j
+              int cidx = bdryCellLocIds_[i][j][k];           // Cell index
+              for (int l = 0; l < numHBdryDofs; ++l) {       // Loop over all fields of cell k on side j
+                (R[d+1])(cidx,fhidx_[j][l]) = (U[d+1])(cidx,fhidx_[j][l]) - (bdryCellTDofValues_[i][j])(k,fhidx_[j][l]);
+              }
+            }
+          }
+        }
+        // Pressure pinning
+        if (i==7 && pinPressure_) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int l = 0, cidx = bdryCellLocIds_[i][j][k];
+              (R[d])(cidx,fpidx_[j][l]) = (U[d])(cidx,fpidx_[j][l]) - static_cast<Real>(0);
+            }
+          }
+        }
+      }
+    }
+
+    // Combine the residuals.
+    fieldHelper_->combineFieldCoeff(res, R);
+  }
+
+  void Jacobian_1(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    int c  = u_coeff.extent_int(0);
+    int p  = cellCub_->getNumPoints();
+    int fv = basisPtrVel_->getCardinality();
+    int fp = basisPtrPrs_->getCardinality();
+    int fh = basisPtrThr_->getCardinality();
+    int d  = cellCub_->getDimension();
+ 
+    // Initialize jacobians.
+    std::vector<std::vector<scalar_view>> J(d+2);
+    for (int i = 0; i < d+2; ++i)
+      J[i].resize(d+2);
+    for (int i = 0; i < d; ++i) {
+      for (int j = 0; j < d; ++j)
+        J[i][j] = scalar_view("jac", c, fv, fv);
+      J[d][i]   = scalar_view("jac", c, fp, fv);
+      J[i][d]   = scalar_view("jac", c, fv, fp);
+      J[d+1][i] = scalar_view("jac", c, fh, fv);
+      J[i][d+1] = scalar_view("jac", c, fv, fh);
+    }
+    J[d][d]     = scalar_view("jac", c, fp, fp);
+    J[d+1][d]   = scalar_view("jac", c, fh, fp);
+    J[d][d+1]   = scalar_view("jac", c, fp, fh);
+    J[d+1][d+1] = scalar_view("jac", c, fh, fh);
+
+    // Split u_coeff into components.
+    std::vector<scalar_view> U, Z;
+    fieldHelper_->splitFieldCoeff(U, u_coeff);
+    fieldHelper_->splitFieldCoeff(Z, z_coeff);
+
+    // Evaluate problem coefficients at quadrature points.
+    scalar_view nu("nu", c, p);
+    scalar_view grav("grav", c, p);
+    scalar_view kappa("kappa", c, p);
+    computeCoefficients(nu,grav,kappa);
+
+    // Evaluate/interpolate finite element fields on cells.
+    std::vector<scalar_view> valVel_vec(d);
+    for (int i = 0; i < d; ++i) {
+      valVel_vec[i] = scalar_view("valVel_vec", c, p);
+      feVel_->evaluateValue(valVel_vec[i], U[i]);
+    }
+    // Evaluate/interpolate gradient of finite element fields on cells.
+    std::vector<scalar_view> gradVel_vec(d);
+    for (int i = 0; i < d; ++i) {
+      gradVel_vec[i] = scalar_view("gradVel_vec", c, p, d);
+      feVel_->evaluateGradient(gradVel_vec[i], U[i]);
+    }
+    scalar_view gradHeat_eval("gradHeat_eval", c, p, d);
+    feThr_->evaluateGradient(gradHeat_eval, U[d+1]);
+
+    // Assemble the velocity vector and its divergence.
+    scalar_view valVel_eval("valVel_eval", c, p, d);
+    std::vector<std::vector<scalar_view>> dVel_vec(d);
+    std::vector<scalar_view> gradHeat_vec(d);
+    for (int k = 0; k < d; ++k) {
+      dVel_vec[k].resize(d);
+      for (int l = 0; l < d; ++l) {
+        dVel_vec[k][l] = scalar_view("dVel_vec", c, p);
+        for (int i = 0; i < c; ++i) {
+          for (int j = 0; j < p; ++j) {
+            (dVel_vec[k][l])(i,j) = (gradVel_vec[k])(i,j,l);
+          }
+        }
+      }
+      gradHeat_vec[k] = scalar_view("gradHeat_vec", c, p);
+      for (int i = 0; i < c; ++i) {
+        for (int j = 0; j < p; ++j) {
+          valVel_eval(i,j,k)  = (valVel_vec[k])(i,j);
+          (gradHeat_vec[k])(i,j) = gradHeat_eval(i,j,k);
+        }
+      }
+    }
+
+    /**************************************************************************/
+    /*** NAVIER STOKES ********************************************************/
+    /**************************************************************************/
+    std::vector<std::vector<scalar_view>> dVelPhi_vec(d);
+    for (int i = 0; i < d; ++i) {
+      // Compute nonlinear terms in the Navier-Stokes equations.
+      dVelPhi_vec[i].resize(d);
+      for (int j = 0; j < d; ++j) {
+        dVelPhi_vec[i][j] = scalar_view("dVelPhi_vec", c, fv, p);
+        fst::scalarMultiplyDataField(dVelPhi_vec[i][j], dVel_vec[i][j], feVel_->N());
+      }
+    }
+    // Multiply velocity gradients with viscosity.
+    scalar_view nuGradPhi_eval("nuGradPhi_eval", c, fv, p, d);
+    fst::tensorMultiplyDataField(nuGradPhi_eval, nu, feVel_->gradN());
+    // Compute nonlinear terms in the Navier-Stokes equations.
+    scalar_view valVelDotgradPhi_eval("valVelDotgradPhi_eval", c, fv, p);
+    fst::dotMultiplyDataField(valVelDotgradPhi_eval, valVel_eval, feVel_->gradN());
+    // Negative pressure basis.
+    scalar_view negPrsPhi("negPrsPhi", c, fp, p);
+    Kokkos::deep_copy(negPrsPhi, fePrs_->N());
+    rst::scale(negPrsPhi,static_cast<Real>(-1));
+    // Compute gravity Jacobian
+    scalar_view gravPhi("gravPhi", c, fh, p);
+    fst::scalarMultiplyDataField(gravPhi, grav, feThr_->N());
+
+    /**************************************************************************/
+    /*** THERMAL **************************************************************/
+    /**************************************************************************/
+    scalar_view kappaGradPhi("kappaGradPhi", c, fh, p, d);
+    scalar_view VelPhi("VelPhi", c, fh, p);
+    std::vector<scalar_view> dHeatPhi(d);
+    // Compute kappa times gradient of basis.
+    fst::tensorMultiplyDataField(kappaGradPhi, kappa, feThr_->gradN());
+    // Compute scaled velocity.
+    fst::dotMultiplyDataField(VelPhi, valVel_eval, feThr_->gradN());
+    // Thermal-velocity Jacobians.
+    for (int i = 0; i < d; ++i) {
+      dHeatPhi[i] = scalar_view("dHeatPhi", c, fh, p);
+      fst::scalarMultiplyDataField(dHeatPhi[i], gradHeat_vec[i], feThr_->N());
+    }
+
+    /*** Evaluate weak form of the Jacobian. ***/
+    // X component of velocity equation.
+    for (int i = 0; i < d; ++i) {
+      // Velocity components
+      for (int j = 0; j < d; ++j)
+        fst::integrate(J[i][j],feVel_->NdetJ(),dVelPhi_vec[i][j],false);
+      fst::integrate(J[i][i],nuGradPhi_eval,feVel_->gradNdetJ(),true);
+      fst::integrate(J[i][i],feVel_->NdetJ(),valVelDotgradPhi_eval,true);
+      // Pressure components
+      fst::integrate(J[i][d],feVel_->DNDdetJ(i),negPrsPhi,false);
+      fst::integrate(J[d][i],fePrs_->NdetJ(),feVel_->DND(i),false);
+      rst::scale(J[d][i],static_cast<Real>(-1));
+      // Thermal components
+      fst::integrate(J[d+1][i],dHeatPhi[i],feVel_->NdetJ(),false);
+    }
+    // Gravitational component
+    fst::integrate(J[d-1][d+1],feVel_->NdetJ(),gravPhi,false);
+    // Thermal components
+    fst::integrate(J[d+1][d+1],kappaGradPhi,feThr_->gradNdetJ(),false);
+    fst::integrate(J[d+1][d+1],feThr_->NdetJ(),VelPhi,true);
+
+    // APPLY BOUNDARY CONDITIONS
+    int numSideSets = bdryCellLocIds_.size();
+    const int numCubPerSide = bdryCub_->getNumPoints();
+    if (numSideSets > 0) {
+      // ROBIN CONDITIONS
+      for (int i = 0; i < numSideSets; ++i) {
+        // Control boundaries
+        if (i==1 || i==2) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            if (numCellsSide) {
+              // Get U and Z coefficients on Robin boundary
+              scalar_view u_coeff_bdry = getThermalBoundaryCoeff(U[d+1], i, j);
+              scalar_view z_coeff_bdry = getThermalBoundaryCoeff(Z[d+1], i, j);
+              // Evaluate U and Z on FE basis
+              scalar_view valU_eval_bdry("valU_eval_bdry", numCellsSide, numCubPerSide);
+              scalar_view valZ_eval_bdry("valZ_eval_bdry", numCellsSide, numCubPerSide);
+              feThrBdry_[i][j]->evaluateValue(valU_eval_bdry, u_coeff_bdry);
+              feThrBdry_[i][j]->evaluateValue(valZ_eval_bdry, z_coeff_bdry);
+              // Compute Stefan-Boltzmann residual
+              scalar_view robinVal1("robinVal1", numCellsSide, numCubPerSide);
+              scalar_view robinVal("robinVal", numCellsSide, fh, numCubPerSide);
+              computeThermalRobin(robinVal1,valU_eval_bdry,valZ_eval_bdry,i,j,1,0);
+              fst::scalarMultiplyDataField(robinVal,robinVal1,feThrBdry_[i][j]->N());
+              // Evaluate boundary integral
+              scalar_view robinJac = scalar_view("robinJac", numCellsSide, fh, fh);
+              fst::integrate(robinJac,robinVal,feThrBdry_[i][j]->NdetJ(),false);
+              for (int k = 0; k < numCellsSide; ++k) {
+                int cidx = bdryCellLocIds_[i][j][k];
+                for (int l = 0; l < fh; ++l) {
+                  for (int m = 0; m < fh; ++m)
+                    (J[d+1][d+1])(cidx,l,m) += robinJac(k,l,m);
+                }
+              }
+            }
+          }
+        }
+      }
+      // DIRICHLET CONDITIONS
+      for (int i = 0; i < numSideSets; ++i) {
+        // Velocity Boundary Conditions
+        if (i<4) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numVBdryDofs = fvidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numVBdryDofs; ++l) {
+                for (int m = 0; m < fv; ++m) {
+                  for (int n = 0; n < d; ++n) {
+                    for (int q = 0; q < d; ++q) {
+                      (J[n][q])(cidx,fvidx_[j][l],m) = static_cast<Real>(0);
+                    }
+                    (J[n][n])(cidx,fvidx_[j][l],fvidx_[j][l]) = static_cast<Real>(1);
+                  }
+                }
+                for (int m = 0; m < d; ++m) {
+                  for (int n = 0; n < fp; ++n) {
+                    (J[m][d])(cidx,fvidx_[j][l],n) = static_cast<Real>(0);
+                  }
+                  for (int n = 0; n < fh; ++n) {
+                    (J[m][d+1])(cidx,fvidx_[j][l],n) = static_cast<Real>(0);
+                  }
+                }
+              }
+            }
+          }
+        }
+        // Thermal boundary conditions
+        if ( (i==0) || (i==5) ) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numHBdryDofs = fhidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numHBdryDofs; ++l) {
+                for (int m = 0; m < fv; ++m) {
+                  for (int n = 0; n < d; ++n) {
+                    (J[d+1][n])(cidx,fhidx_[j][l],m) = static_cast<Real>(0);
+                  }
+                }
+                for (int m = 0; m < fp; ++m) {
+                  (J[d+1][d])(cidx,fhidx_[j][l],m) = static_cast<Real>(0);
+                }
+                for (int m = 0; m < fh; ++m) {
+                  (J[d+1][d+1])(cidx,fhidx_[j][l],m) = static_cast<Real>(0);
+                }
+                (J[d+1][d+1])(cidx,fhidx_[j][l],fhidx_[j][l]) = static_cast<Real>(1);
+              }
+            }
+          }
+        }
+        // Pressure pinning
+        if (i==7 && pinPressure_) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int l = 0, cidx = bdryCellLocIds_[i][j][k];
+              for (int m = 0; m < fv; ++m) {
+                for (int n = 0; n < d; ++n) {
+                  (J[d][n])(cidx,fpidx_[j][l],m) = static_cast<Real>(0);
+                }
+              }
+              for (int m = 0; m < fp; ++m) {
+                (J[d][d])(cidx,fpidx_[j][l],m) = static_cast<Real>(0);
+              }
+              (J[d][d])(cidx,fpidx_[j][l],fpidx_[j][l]) = static_cast<Real>(1);
+              for (int m = 0; m < fh; ++m) {
+                (J[d][d+1])(cidx,fpidx_[j][l],m) = static_cast<Real>(0);
+              }
+            }
+          }
+        }
+      }
+    }
+
+    // Combine the jacobians.
+    fieldHelper_->combineFieldCoeff(jac, J);
+  }
+
+
+  void Jacobian_2(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    int c  = u_coeff.extent_int(0);
+    int fv = basisPtrVel_->getCardinality();
+    int fp = basisPtrPrs_->getCardinality();
+    int fh = basisPtrThr_->getCardinality();
+    int d  = cellCub_->getDimension();
+ 
+    // Initialize jacobians.
+    std::vector<std::vector<scalar_view>> J(d+2);
+    for (int i = 0; i < d+2; ++i)
+      J[i].resize(d+2);
+    for (int i = 0; i < d; ++i) {
+      for (int j = 0; j < d; ++j)
+        J[i][j] = scalar_view("jac", c, fv, fv);
+      J[d][i]   = scalar_view("jac", c, fp, fv);
+      J[i][d]   = scalar_view("jac", c, fv, fp);
+      J[d+1][i] = scalar_view("jac", c, fh, fv);
+      J[i][d+1] = scalar_view("jac", c, fv, fh);
+    }
+    J[d][d]     = scalar_view("jac", c, fp, fp);
+    J[d+1][d]   = scalar_view("jac", c, fh, fp);
+    J[d][d+1]   = scalar_view("jac", c, fp, fh);
+    J[d+1][d+1] = scalar_view("jac", c, fh, fh);
+
+    // Split u_coeff into components.
+    std::vector<scalar_view> U, Z;
+    fieldHelper_->splitFieldCoeff(U, u_coeff);
+    fieldHelper_->splitFieldCoeff(Z, z_coeff);
+
+    // APPLY DIRICHLET CONDITIONS
+    int numSideSets = bdryCellLocIds_.size();
+    const int numCubPerSide = bdryCub_->getNumPoints();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        // Control boundaries
+        if (i==1 || i==2) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            if (numCellsSide) {
+              // Get U and Z coefficients on Robin boundary
+              scalar_view u_coeff_bdry = getThermalBoundaryCoeff(U[d+1], i, j);
+              scalar_view z_coeff_bdry = getThermalBoundaryCoeff(Z[d+1], i, j);
+              // Evaluate U and Z on FE basis
+              scalar_view valU_eval_bdry("valU_eval_bdry", numCellsSide, numCubPerSide);
+              scalar_view valZ_eval_bdry("valZ_eval_bdry", numCellsSide, numCubPerSide);
+              feThrBdry_[i][j]->evaluateValue(valU_eval_bdry, u_coeff_bdry);
+              feThrBdry_[i][j]->evaluateValue(valZ_eval_bdry, z_coeff_bdry);
+              // Compute Stefan-Boltzmann residual
+              scalar_view robinVal1("robinVal1", numCellsSide, numCubPerSide);
+              scalar_view robinVal("robinVal", numCellsSide, fh, numCubPerSide);
+              computeThermalRobin(robinVal1,valU_eval_bdry,valZ_eval_bdry,i,j,1,1);
+              fst::scalarMultiplyDataField(robinVal,robinVal1,feThrBdry_[i][j]->N());
+              // Evaluate boundary integral
+              scalar_view robinJac("robinJac", numCellsSide, fh, fh);
+              fst::integrate(robinJac,robinVal,feThrBdry_[i][j]->NdetJ(),false);
+              for (int k = 0; k < numCellsSide; ++k) {
+                int cidx = bdryCellLocIds_[i][j][k];
+                for (int l = 0; l < fh; ++l) {
+                  for (int m = 0; m < fh; ++m) {
+                    (J[d+1][d+1])(cidx,l,m) += robinJac(k,l,m);
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+      // DIRICHLET CONDITIONS
+      for (int i = 0; i < numSideSets; ++i) {
+        // Velocity Boundary Conditions
+        if (i<4) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numVBdryDofs = fvidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numVBdryDofs; ++l) {
+                for (int m = 0; m < fv; ++m) {
+                  for (int n = 0; n < d; ++n) {
+                    for (int p = 0; p < d; ++p) {
+                      (J[n][p])(cidx,fvidx_[j][l],m) = static_cast<Real>(0);
+                    }
+                  }
+                }
+                for (int m = 0; m < d; ++m) {
+                  for (int n = 0; n < fp; ++n) {
+                    (J[m][d])(cidx,fvidx_[j][l],n) = static_cast<Real>(0);
+                  }
+                  for (int n = 0; n < fh; ++n) {
+                    (J[m][d+1])(cidx,fvidx_[j][l],n) = static_cast<Real>(0);
+                  }
+                }
+              }
+            }
+          }
+        }
+        // Thermal boundary conditions
+        if ( (i==0) || (i==5) ) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numHBdryDofs = fhidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numHBdryDofs; ++l) {
+                for (int m = 0; m < fv; ++m) {
+                  for (int n = 0; n < d; ++n) {
+                    (J[d+1][n])(cidx,fhidx_[j][l],m) = static_cast<Real>(0);
+                  }
+                }
+                for (int m = 0; m < fp; ++m) {
+                  (J[d+1][d])(cidx,fhidx_[j][l],m) = static_cast<Real>(0);
+                }
+                for (int m = 0; m < fh; ++m) {
+                  (J[d+1][d+1])(cidx,fhidx_[j][l],m) = static_cast<Real>(0);
+                }
+              }
+            }
+          }
+        }
+        // Pressure pinning
+        if (i==7 && pinPressure_) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int l = 0, cidx = bdryCellLocIds_[i][j][k];
+              for (int m = 0; m < fv; ++m) {
+                for (int n = 0; n < d; ++n) {
+                  (J[d][n])(cidx,fpidx_[j][l],m) = static_cast<Real>(0);
+                }
+              }
+              for (int m = 0; m < fp; ++m) {
+                (J[d][d])(cidx,fpidx_[j][l],m) = static_cast<Real>(0);
+              }
+              for (int m = 0; m < fh; ++m) {
+                (J[d][d+1])(cidx,fpidx_[j][l],m) = static_cast<Real>(0);
+              }
+            }
+          }
+        }
+      }
+    }
+
+    // Combine the jacobians.
+    fieldHelper_->combineFieldCoeff(jac, J);
+  }
+
+  void Hessian_11(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+//    throw Exception::NotImplemented("");
+    // Retrieve dimensions.
+    int c  = u_coeff.extent_int(0);
+    int p  = cellCub_->getNumPoints();
+    int fv = basisPtrVel_->getCardinality();
+    int fp = basisPtrPrs_->getCardinality();
+    int fh = basisPtrThr_->getCardinality();
+    int d  = cellCub_->getDimension();
+ 
+    // Initialize jacobians.
+    std::vector<std::vector<scalar_view>> H(d+2);
+    for (int i = 0; i < d+2; ++i)
+      H[i].resize(d+2);
+    for (int i = 0; i < d; ++i) {
+      for (int j = 0; j < d; ++j)
+        H[i][j] = scalar_view("hess", c, fv, fv);
+    }
+    for (int i = 0; i < d; ++i) {
+      H[d][i]   = scalar_view("hess", c, fp, fv);
+      H[i][d]   = scalar_view("hess", c, fv, fp);
+      H[d+1][i] = scalar_view("hess", c, fh, fv);
+      H[i][d+1] = scalar_view("hess", c, fv, fh);
+    }
+    H[d][d]     = scalar_view("hess", c, fp, fp);
+    H[d+1][d]   = scalar_view("hess", c, fh, fp);
+    H[d][d+1]   = scalar_view("hess", c, fp, fh);
+    H[d+1][d+1] = scalar_view("hess", c, fh, fh);
+
+    // Split l_coeff into components.
+    std::vector<scalar_view> L;
+    fieldHelper_->splitFieldCoeff(L, l_coeff);
+
+    // APPLY DIRICHLET CONDITIONS
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      // DIRICHLET CONDITIONS
+      for (int i = 0; i < numSideSets; ++i) {
+        // Velocity boundary conditions
+        if (i<4) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numVBdryDofs = fvidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numVBdryDofs; ++l) {
+                for (int m = 0; m < d; ++m) {
+                  (L[m])(cidx,fvidx_[j][l]) = static_cast<Real>(0);
+                }
+              }
+            }
+          }
+        }
+        // Thermal boundaries
+        if ( (i==0) || (i==5) ) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numHBdryDofs = fhidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numHBdryDofs; ++l) {
+                (L[d+1])(cidx,fhidx_[j][l]) = static_cast<Real>(0);
+              }
+            }
+          }
+        }
+        // Pressure pinning
+        if (i==7 && pinPressure_) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int l = 0, cidx = bdryCellLocIds_[i][j][k];
+              (L[d])(cidx,fpidx_[j][l]) = static_cast<Real>(0);
+            }
+          }
+        }
+      }
+    }
+
+    // Evaluate/interpolate finite element fields on cells.
+    std::vector<scalar_view> valVel_vec(d);
+    for (int i = 0; i < d; ++i) {
+      valVel_vec[i] = scalar_view("valVel_vec", c, p);
+      feVel_->evaluateValue(valVel_vec[i], L[i]);
+    }
+    scalar_view valHeat_eval("valHeat_eval", c, p);
+    feThr_->evaluateValue(valHeat_eval, L[3]);
+
+    // Compute nonlinear terms in the Navier-Stokes equations.
+    std::vector<scalar_view> valVelPhi_vec(d);
+    for (int i = 0; i < d; ++i) {
+      valVelPhi_vec[i] = scalar_view("valVelPhi_vec", c, fv, p);
+      fst::scalarMultiplyDataField(valVelPhi_vec[i], valVel_vec[i], feVel_->N());
+    }
+
+    // Compute nonlinear terms in the thermal equation.
+    scalar_view valHeatVelPhi("valHeatVelPhi", c, fv, p);
+    fst::scalarMultiplyDataField(valHeatVelPhi, valHeat_eval, feVel_->N());
+
+    /*** Evaluate weak form of the Hessian. ***/
+    for (int i = 0; i < d; ++i) {
+      // velocity equation.
+      for (int j = 0; j < d; ++j) {
+        fst::integrate(H[i][j],valVelPhi_vec[j],feVel_->DNDdetJ(i),false);
+        fst::integrate(H[i][j],feVel_->DNDdetJ(j),valVelPhi_vec[i],true);
+      }
+      fst::integrate(H[i][d+1],valHeatVelPhi,feThr_->DNDdetJ(i),false);
+      // Thermal equation.
+      fst::integrate(H[d+1][i],feThr_->DNDdetJ(i),valHeatVelPhi,false);
+    }
+
+    // Combine the Hessians.
+    fieldHelper_->combineFieldCoeff(hess, H);
+  }
+
+  void Hessian_12(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_ThermalFluids::Hessian_12): Hessian is zero.");
+  }
+
+  void Hessian_21(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_ThermalFluids::Hessian_21): Hessian is zero.");
+  }
+
+  void Hessian_22(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_ThermalFluids::Hessian_22): Hessian is zero.");
+  }
+
+  void RieszMap_1(scalar_view & riesz) override {
+    // Retrieve dimensions.
+    int c  = feVel_->N().extent_int(0);
+    int fv = basisPtrVel_->getCardinality();
+    int fp = basisPtrPrs_->getCardinality();
+    int fh = basisPtrThr_->getCardinality();
+    int d  = cellCub_->getDimension();
+ 
+    // Initialize jacobians.
+    std::vector<std::vector<scalar_view>> J(d+2);
+    for (int i = 0; i < d+2; ++i)
+      J[i].resize(d+2);
+    for (int i = 0; i < d; ++i) {
+      for (int j = 0; j < d; ++j)
+        J[i][j] = scalar_view("riesz1", c, fv, fv);
+      J[d][i]   = scalar_view("riesz1", c, fp, fv);
+      J[i][d]   = scalar_view("riesz1", c, fv, fp);
+      J[d+1][i] = scalar_view("riesz1", c, fh, fv);
+      J[i][d+1] = scalar_view("riesz1", c, fv, fh);
+    }
+    J[d][d]     = scalar_view("riesz1", c, fp, fp);
+    J[d+1][d]   = scalar_view("riesz1", c, fh, fp);
+    J[d][d+1]   = scalar_view("riesz1", c, fp, fh);
+    J[d+1][d+1] = scalar_view("riesz1", c, fh, fh);
+
+    for (int i = 0; i < d; ++i) {
+      Kokkos::deep_copy(J[i][i], feVel_->stiffMat());
+      rst::add(J[i][i],feVel_->massMat());
+    }
+    Kokkos::deep_copy(J[d][d], fePrs_->massMat());
+    Kokkos::deep_copy(J[d+1][d+1], feThr_->stiffMat());
+    rst::add(J[d+1][d+1], feThr_->massMat());
+
+    // Combine the jacobians.
+    fieldHelper_->combineFieldCoeff(riesz, J);
+  }
+
+  void RieszMap_2(scalar_view & riesz) override {
+    // Retrieve dimensions.
+    int c  = feVel_->N().extent_int(0);
+    int fv = basisPtrVel_->getCardinality();
+    int fp = basisPtrPrs_->getCardinality();
+    int fh = basisPtrThr_->getCardinality();
+    int d  = cellCub_->getDimension();
+ 
+    // Initialize jacobians.
+    std::vector<std::vector<scalar_view>> J(d+2);
+    for (int i = 0; i < d+2; ++i)
+      J[i].resize(d+2);
+    for (int i = 0; i < d; ++i) {
+      for (int j = 0; j < d; ++j)
+        J[i][j] = scalar_view("riesz2", c, fv, fv);
+      J[d][i]   = scalar_view("riesz2", c, fp, fv);
+      J[i][d]   = scalar_view("riesz2", c, fv, fp);
+      J[d+1][i] = scalar_view("riesz2", c, fh, fv);
+      J[i][d+1] = scalar_view("riesz2", c, fv, fh);
+    }
+    J[d][d]     = scalar_view("riesz2", c, fp, fp);
+    J[d+1][d]   = scalar_view("riesz2", c, fh, fp);
+    J[d][d+1]   = scalar_view("riesz2", c, fp, fh);
+    J[d+1][d+1] = scalar_view("riesz2", c, fh, fh);
+
+
+    for (int i = 0; i < d; ++i)
+      Kokkos::deep_copy(J[i][i],feVel_->massMat());
+    Kokkos::deep_copy(J[d][d],fePrs_->massMat());
+    Kokkos::deep_copy(J[d+1][d+1],feThr_->massMat());
+
+    // Combine the jacobians.
+    fieldHelper_->combineFieldCoeff(riesz, J);
+  }
+
+  std::vector<basis_ptr> getFields() override {
+    return basisPtrs_;
+  }
+
+  void setCellNodes(const scalar_view &volCellNodes,
+                    const std::vector<std::vector<scalar_view>> &bdryCellNodes,
+                    const std::vector<std::vector<std::vector<int>>> &bdryCellLocIds) override {
+    volCellNodes_ = volCellNodes;
+    bdryCellNodes_ = bdryCellNodes;
+    bdryCellLocIds_ = bdryCellLocIds;
+    // Finite element definition.
+    feVel_ = ROL::makePtr<fe_type>(volCellNodes_,basisPtrVel_,cellCub_);
+    fePrs_ = ROL::makePtr<fe_type>(volCellNodes_,basisPtrPrs_,cellCub_);
+    feThr_ = ROL::makePtr<fe_type>(volCellNodes_,basisPtrThr_,cellCub_);
+    // Get boundary degrees of freedom.
+    fvidx_ = feVel_->getBoundaryDofs();
+    fpidx_ = fePrs_->getBoundaryDofs();
+    fhidx_ = feThr_->getBoundaryDofs();
+    // Construct boundary FEs
+    const int numSideSets = bdryCellNodes.size();
+    feVelBdry_.resize(numSideSets);
+    fePrsBdry_.resize(numSideSets);
+    feThrBdry_.resize(numSideSets);
+    for (int i = 0; i < numSideSets; ++i) {
+      int numLocSides = bdryCellNodes[i].size();
+      feVelBdry_[i].resize(numLocSides);
+      fePrsBdry_[i].resize(numLocSides);
+      feThrBdry_[i].resize(numLocSides);
+      for (int j = 0; j < numLocSides; ++j) {
+        if (bdryCellNodes[i][j] != scalar_view()) {
+          feVelBdry_[i][j] = ROL::makePtr<fe_type>(bdryCellNodes[i][j],basisPtrVel_,bdryCub_,j);
+          fePrsBdry_[i][j] = ROL::makePtr<fe_type>(bdryCellNodes[i][j],basisPtrPrs_,bdryCub_,j);
+          feThrBdry_[i][j] = ROL::makePtr<fe_type>(bdryCellNodes[i][j],basisPtrThr_,bdryCub_,j);
+        }
+      }
+    }
+  }
+
+  void setFieldPattern(const std::vector<std::vector<int>> & fieldPattern) override {
+    fieldPattern_ = fieldPattern;
+    fieldHelper_ = ROL::makePtr<FieldHelper<Real,DeviceType>>(numFields_, numDofs_, numFieldDofs_, fieldPattern_);
+  }
+
+  const ROL::Ptr<fe_type> getVelocityFE(void) const {
+    return feVel_;
+  }
+
+  const ROL::Ptr<fe_type> getPressureFE(void) const {
+    return fePrs_;
+  }
+
+  const ROL::Ptr<fe_type> getThermalFE(void) const {
+    return feThr_;
+  }
+
+  const std::vector<std::vector<ROL::Ptr<fe_type>>> getVelocityBdryFE(void) const {
+    return feVelBdry_;
+  }
+
+  const std::vector<std::vector<ROL::Ptr<fe_type>>> getPressureBdryFE(void) const {
+    return fePrsBdry_;
+  }
+
+  const std::vector<std::vector<ROL::Ptr<fe_type>>> getThermalBdryFE(void) const {
+    return feThrBdry_;
+  }
+
+  const std::vector<std::vector<std::vector<int>>> getBdryCellLocIds(void) const {
+    return bdryCellLocIds_;
+  }
+
+  const ROL::Ptr<FieldHelper<Real,DeviceType>> getFieldHelper(void) const {
+    return fieldHelper_;
+  }
+
+}; // PDE_ThermalFluids
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/thermal-fluids/pde_thermal-fluids_ex07K.hpp b/packages/rol/example/PDE-OPT/thermal-fluids/pde_thermal-fluids_ex07K.hpp
new file mode 100644
index 000000000000..21146685d7ad
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/thermal-fluids/pde_thermal-fluids_ex07K.hpp
@@ -0,0 +1,1326 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  pde_thermal-fluids.hpp
+    \brief Implements the local PDE interface for the Navier-Stokes control problem.
+*/
+
+#ifndef PDE_THERMALFLUIDS_EX07K_HPP
+#define PDE_THERMALFLUIDS_EX07K_HPP
+
+#include "../TOOLS/pdeK.hpp"
+#include "../TOOLS/feK.hpp"
+#include "../TOOLS/fieldhelperK.hpp"
+
+#include "Intrepid2_HGRAD_HEX_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_HEX_C2_FEM.hpp"
+#include "Intrepid2_DefaultCubatureFactory.hpp"
+#include "Intrepid2_FunctionSpaceTools.hpp"
+#include "Intrepid2_CellTools.hpp"
+
+#include "ROL_Ptr.hpp"
+
+template <class Real, class DeviceType>
+class PDE_ThermalFluids_ex07 : public PDE<Real, DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  // Finite element basis information
+  basis_ptr basisPtrVel_, basisPtrPrs_, basisPtrThr_;
+  std::vector<basis_ptr> basisPtrs_;
+  // Cell cubature information
+  ROL::Ptr<Intrepid2::Cubature<DeviceType,Real,Real>> cellCub_, bdryCub_;
+  // Cell node information
+  scalar_view volCellNodes_;
+  std::vector<std::vector<scalar_view>> bdryCellNodes_;
+  std::vector<std::vector<std::vector<int>>> bdryCellLocIds_;
+  // Finite element definition
+  ROL::Ptr<fe_type> feVel_, fePrs_, feThr_;
+  std::vector<std::vector<ROL::Ptr<fe_type>>> feVelBdry_, fePrsBdry_, feThrBdry_;
+  // Local degrees of freedom on boundary, for each side of the reference cell (first index).
+  std::vector<std::vector<int>> fvidx_, fpidx_, fhidx_;
+  // Coordinates of degrees freedom on boundary cells.
+  // Indexing:  [sideset number][local side id](cell number, value at dof)
+  std::vector<std::vector<scalar_view>> bdryCellVDofValues_, bdryCellTDofValues_;
+  // Field pattern, offsets, etc.
+  std::vector<std::vector<int>> fieldPattern_;  // local Field/DOF pattern; set from DOF manager 
+  int numFields_;                                // number of fields (equations in the PDE)
+  int numDofs_;                                  // total number of degrees of freedom for all (local) fields
+  std::vector<int> offset_;                      // for each field, a counting offset
+  std::vector<int> numFieldDofs_;                // for each field, number of degrees of freedom
+  
+  // Problem parameters.
+  Real Re_, Pr_, Gr_, h_;
+  const Real grav_;
+  int Nbottom_, N0_, N1_, N2_, N3_;
+  Real ReScale_, PrScale_, GrScale_, hScale_, TScale_;
+  bool pinPressure_;
+
+  ROL::Ptr<FieldHelper<Real,DeviceType>> fieldHelper_;
+
+  Real velocityDirichletFunc(const std::vector<Real> & coords, int sideset, int locSideId, int dir) const {
+    Real val(0);
+    const Real one(1), two(2), three(3), four(4), x = coords[0];
+    if (sideset==5 && dir==1)
+      val = two*(one/three - x)*x;
+    if (sideset==6 && dir==1)
+      val = -four*(x-one/three)*(two/three-x);
+    if (sideset==7 && dir==1)
+      val = two*(x-two/three)*(one-x);
+    return val;
+  }
+
+  Real thermalDirichletFunc(const std::vector<Real> & coords, int sideset, int locSideId) const {
+    const std::vector<Real> param = PDE<Real,DeviceType>::getParameter();
+    Real val(0);
+    if (sideset==4) {
+      val = static_cast<Real>(1);
+      if (param.size()) {
+        Real root2(std::sqrt(2.0)), pi(M_PI), ln2(std::log(2.0));
+        for (int i = 0; i < Nbottom_; ++i) {
+          Real di(i+1);
+          Real xcomp = param[2*i  ]/ln2 * (root2 * std::sin(di * pi * coords[0]))/(di * pi);
+          Real zcomp = param[2*i+1]/ln2 * (root2 * std::sin(di * pi * coords[2]))/(di * pi);
+          val += TScale_*xcomp*zcomp;
+        }
+      }
+    }
+    else if (sideset==6) {
+      val = static_cast<Real>(0);
+    }
+    return val;
+  }
+
+  void computeDirichlet(void) {
+    // Compute Dirichlet values at DOFs.
+    int d  = basisPtrVel_->getBaseCellTopology().getDimension();
+    int fv = basisPtrVel_->getCardinality();
+    int ft = basisPtrThr_->getCardinality();
+    int numSidesets = bdryCellLocIds_.size();
+    bdryCellVDofValues_.resize(numSidesets);
+    bdryCellTDofValues_.resize(numSidesets);
+    for (int i=0; i<numSidesets; ++i) {
+      int numLocSides = bdryCellLocIds_[i].size();
+      bdryCellVDofValues_[i].resize(numLocSides);
+      bdryCellTDofValues_[i].resize(numLocSides);
+      for (int j=0; j<numLocSides; ++j) {
+        int c = bdryCellLocIds_[i][j].size();
+        bdryCellVDofValues_[i][j] = scalar_view("bdryCellVDofValues", c, fv, d);
+        bdryCellTDofValues_[i][j] = scalar_view("bdryCellTDofValues", c, ft);
+        scalar_view Vcoords = scalar_view("Vcoords", c, fv, d);
+        scalar_view Tcoords = scalar_view("Tcoords", c, ft, d);
+        if (c > 0) {
+          feVel_->computeDofCoords(Vcoords, bdryCellNodes_[i][j]);
+          feThr_->computeDofCoords(Tcoords, bdryCellNodes_[i][j]);
+        }
+        for (int k=0; k<c; ++k) {
+          for (int l=0; l<fv; ++l) {
+            std::vector<Real> dofpoint(d);
+            //std::cout << "Sideset " << i << " LocalSide " << j << "  Cell " << k << "  Field " << l << "  Coord ";
+            for (int m=0; m<d; ++m) {
+              dofpoint[m] = Vcoords(k, l, m);
+              //std::cout << dofpoint[m] << "  ";
+            }
+
+            for (int m=0; m<d; ++m) {
+              (bdryCellVDofValues_[i][j])(k, l, m) = velocityDirichletFunc(dofpoint, i, j, m);
+              //std::cout << "  " << m << "-Value " << DirichletFunc(dofpoint, i, j, m);
+            }
+            //std::cout << std::endl;
+          }
+          for (int l=0; l<ft; ++l) {
+            std::vector<Real> dofpoint(d);
+            //std::cout << "Sideset " << i << " LocalSide " << j << "  Cell " << k << "  Field " << l << "  Coord ";
+            for (int m=0; m<d; ++m) {
+              dofpoint[m] = Tcoords(k, l, m);
+              //std::cout << dofpoint[m] << "  ";
+            }
+
+            (bdryCellTDofValues_[i][j])(k, l) = thermalDirichletFunc(dofpoint, i, j);
+            //std::cout << "    Value " << DirichletFunc(dofpoint, i, j);
+            //std::cout << std::endl;
+          }
+        }
+      }
+    }
+  }
+
+  Real ReynoldsNumber(void) const {
+    const std::vector<Real> param = PDE<Real,DeviceType>::getParameter();
+    Real val = Re_;
+    if (param.size()) {
+      int offset = 2*(Nbottom_ + N0_ + N1_ + N2_ + N3_);
+      Real one(1);
+      val /= (one + ReScale_*param[offset]);
+    }
+    return val;
+  }
+
+  Real PrandtlNumber(void) const {
+    const std::vector<Real> param = PDE<Real,DeviceType>::getParameter();
+    Real val = Pr_;
+    if (param.size()) {
+      int offset = 2*(Nbottom_ + N0_ + N1_ + N2_ + N3_);
+      Real one(1);
+      val *= (one + ReScale_*param[offset])/(one + PrScale_*param[offset+1]);
+    }
+    return val;
+  }
+
+  Real GrashofNumber(void) const {
+    const std::vector<Real> param = PDE<Real,DeviceType>::getParameter();
+    Real val = Gr_;
+    if (param.size()) {
+      int offset = 2*(Nbottom_ + N0_ + N1_ + N2_ + N3_);
+      Real one(1), muscale = one + ReScale_*param[offset];
+      val *= (one + GrScale_*param[offset+2])/(muscale*muscale);
+    }
+    return val;
+  }
+  
+  Real ThicknessNumber(const std::vector<Real> &x, const int sideset) const {
+    const std::vector<Real> param = PDE<Real,DeviceType>::getParameter();
+    Real val = h_, x1(0), x2(0);
+    if ( param.size()) {
+      const Real root2(std::sqrt(2.0)), pi(M_PI), ln2(std::log(2.0));
+      int offset = 0, N = 0;
+      if ( sideset == 0 ) {
+        offset = 2*Nbottom_;
+        N      = N0_;
+        x1     = x[1];
+        x2     = x[2];
+      }
+      else if ( sideset == 1 ) {
+        offset = 2*(Nbottom_+N0_);
+        N      = N1_;
+        x1     = x[0];
+        x2     = x[1];
+      }
+      else if ( sideset == 2 ) {
+        offset = 2*(Nbottom_+N0_+N1_);
+        N      = N2_;
+        x1     = x[1];
+        x2     = x[2];
+      }
+      else if ( sideset == 3 ) {
+        offset = 2*(Nbottom_+N0_+N1_+N2_);
+        N      = N3_;
+        x1     = x[0];
+        x2     = x[1];
+      }
+      for (int i = 0; i < N; ++i) {
+        Real di(i+1);
+        Real xcomp = param[offset + 2*i  ]/ln2 * (root2 * std::sin(di * pi * x1))/(di * pi);
+        Real ycomp = param[offset + 2*i+1]/ln2 * (root2 * std::sin(di * pi * x2))/(di * pi);
+        val += hScale_*xcomp*ycomp;
+      }
+    }
+    return val;
+  }
+
+  Real viscosityFunc(const std::vector<Real> &x) const {
+    Real ReynoldsNum = ReynoldsNumber();
+    return static_cast<Real>(1)/ReynoldsNum;
+  }
+
+  Real conductivityFunc(const std::vector<Real> &x) const {
+    Real ReynoldsNum = ReynoldsNumber();
+    Real PrandtlNum = PrandtlNumber();
+    return static_cast<Real>(1)/(ReynoldsNum*PrandtlNum);
+  }
+
+  Real gravityFunc(const std::vector<Real> &x) const {
+    Real ReynoldsNum = ReynoldsNumber();
+    Real GrashofNum = GrashofNumber();
+    return grav_*GrashofNum/(ReynoldsNum*ReynoldsNum);
+  }
+
+  Real thermalRobinFunc(const std::vector<Real> &x, const int sideset) const {
+    return ThicknessNumber(x, sideset) * conductivityFunc(x);
+  }
+
+  void computeCoefficients(scalar_view &nu, scalar_view &grav, scalar_view &kappa) const {
+    int c = feVel_->gradN().extent_int(0);
+    int p = feVel_->gradN().extent_int(2);
+    int d = feVel_->gradN().extent_int(3);
+    std::vector<Real> pt(d);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for ( int k = 0; k < d; ++k)
+          pt[k] = (feVel_->cubPts())(i,j,k);
+        // Compute spatially dependent coefficients
+        nu(i,j)    = viscosityFunc(pt);
+        grav(i,j)  = gravityFunc(pt);
+        kappa(i,j) = conductivityFunc(pt);
+      }
+    }
+  }
+
+  void computeThermalRobin(scalar_view &robin,
+                           const scalar_view u,
+                           const scalar_view z,
+                           int sideset,
+                           int locSideId,
+                           int deriv = 0,
+                           int component = 1) const {
+    const int c = u.extent_int(0);
+    const int p = u.extent_int(1);
+    const int d = feThr_->gradN().extent_int(3);
+    std::vector<Real> x(d);
+    Real h(0);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k)
+          x[k] = (feThrBdry_[sideset][locSideId]->cubPts())(i,j,k);
+        h = thermalRobinFunc(x, sideset);
+        if ( deriv == 0 )
+          robin(i,j) = h * (u(i,j) - z(i,j));
+        else if ( deriv == 1 )
+          robin(i,j) = ((component==0) ? h : -h);
+        else {
+          robin(i,j) = static_cast<Real>(0);
+        }
+      }
+    }
+  }
+
+  scalar_view getThermalBoundaryCoeff(const scalar_view cell_coeff, int sideSet, int cell) const {
+    std::vector<int> bdryCellLocId = bdryCellLocIds_[sideSet][cell];
+    const int numCellsSide = bdryCellLocId.size();
+    const int f = basisPtrThr_->getCardinality();
+    
+    scalar_view bdry_coeff("bdry_coeff", numCellsSide, f);
+    for (int i = 0; i < numCellsSide; ++i) {
+      for (int j = 0; j < f; ++j)
+        bdry_coeff(i, j) = cell_coeff(bdryCellLocId[i], j);
+    }
+    return bdry_coeff;
+  }
+
+public:
+  PDE_ThermalFluids_ex07(ROL::ParameterList &parlist) : grav_(-1) {
+    // Finite element fields -- NOT DIMENSION INDEPENDENT!
+    basisPtrVel_ = ROL::makePtr<Intrepid2::Basis_HGRAD_HEX_C2_FEM<DeviceType,Real,Real>>();
+    basisPtrPrs_ = ROL::makePtr<Intrepid2::Basis_HGRAD_HEX_C1_FEM<DeviceType,Real,Real>>();
+    basisPtrThr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_HEX_C2_FEM<DeviceType,Real,Real>>();
+    // Volume quadrature rules.
+    shards::CellTopology cellType = basisPtrVel_->getBaseCellTopology();     // get the cell type from any basis
+    Intrepid2::DefaultCubatureFactory cubFactory;                            // create cubature factory
+    int cubDegree = parlist.sublist("Problem").get("Cubature Degree", 4);    // set cubature degree, e.g., 4
+    cellCub_ = cubFactory.create<DeviceType,Real,Real>(cellType, cubDegree); // create default cubature
+    // Boundary quadrature rules.
+    int d = cellType.getDimension();
+    shards::CellTopology bdryCellType = cellType.getCellTopologyData(d-1, 0);
+    int bdryCubDegree = parlist.sublist("Problem").get("Boundary Cubature Degree",4); // set cubature degree, e.g., 4
+    bdryCub_ = cubFactory.create<DeviceType,Real,Real>(bdryCellType, bdryCubDegree);
+    // Fill finite element basis container
+    basisPtrs_.clear();
+    basisPtrs_.resize(d,basisPtrVel_);  // Velocity
+    basisPtrs_.push_back(basisPtrPrs_); // Pressure
+    basisPtrs_.push_back(basisPtrThr_); // Heat
+
+    // Other problem parameters.
+    Re_      = parlist.sublist("Problem").get("Reynolds Number",    200.0);
+    Pr_      = parlist.sublist("Problem").get("Prandtl Number",     0.72);
+    Gr_      = parlist.sublist("Problem").get("Grashof Number",     40000.0);
+    h_       = parlist.sublist("Problem").get("Robin Coefficient",  1.0);
+    // Stochastic Expansion Information.
+    Nbottom_ = parlist.sublist("Problem").get("Bottom KL Truncation Order",5);
+    N0_      = parlist.sublist("Problem").get("Side 0 KL Truncation Order",5);
+    N1_      = parlist.sublist("Problem").get("Side 1 KL Truncation Order",5);
+    N2_      = parlist.sublist("Problem").get("Side 2 KL Truncation Order",5);
+    N3_      = parlist.sublist("Problem").get("Side 3 KL Truncation Order",5);
+    // Stochastic scales
+    ReScale_ = parlist.sublist("Problem").get("Reynolds Number Noise Scale",0.05);
+    PrScale_ = parlist.sublist("Problem").get("Prandtl Number Noise Scale",0.05);
+    GrScale_ = parlist.sublist("Problem").get("Grashof Number Noise Scale",0.05);
+    hScale_  = parlist.sublist("Problem").get("Robin Noise Scale",0.2);
+    TScale_  = parlist.sublist("Problem").get("Bottom Temperature Noise Scale",0.2);
+    // Pin pressure
+    pinPressure_ = parlist.sublist("Problem").get("Pin Pressure",false);
+
+    numDofs_ = 0;
+    numFields_ = basisPtrs_.size();
+    offset_.resize(numFields_);
+    numFieldDofs_.resize(numFields_);
+    for (int i=0; i<numFields_; ++i) {
+      if (i==0)
+        offset_[i] = 0;
+      else
+        offset_[i] = offset_[i-1] + basisPtrs_[i-1]->getCardinality();
+      numFieldDofs_[i] = basisPtrs_[i]->getCardinality();
+      numDofs_ += numFieldDofs_[i];
+    }
+  }
+
+  void residual(scalar_view & res,
+                const scalar_view u_coeff,
+                const scalar_view z_coeff = scalar_view(),
+                const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    int c  = u_coeff.extent_int(0);
+    int p  = cellCub_->getNumPoints();
+    int fv = basisPtrVel_->getCardinality();
+    int fp = basisPtrPrs_->getCardinality();
+    int fh = basisPtrThr_->getCardinality();
+    int d  = cellCub_->getDimension();
+ 
+    // Initialize residuals.
+    std::vector<scalar_view> R(d+2);
+    for (int i = 0; i < d; ++i)
+      R[i] = scalar_view("res", c, fv);
+    R[d]   = scalar_view("res", c, fp);
+    R[d+1] = scalar_view("res", c, fh);
+
+    // Split u_coeff into components.
+    std::vector<scalar_view> U, Z;
+    fieldHelper_->splitFieldCoeff(U, u_coeff);
+    fieldHelper_->splitFieldCoeff(Z, z_coeff);
+
+    // Evaluate problem coefficients at quadrature points.
+    scalar_view nu("nu", c, p);
+    scalar_view grav("grav", c, p);
+    scalar_view kappa("kappa", c, p);
+    computeCoefficients(nu,grav,kappa);
+
+    // Evaluate/interpolate finite element fields on cells.
+    std::vector<scalar_view> valVel_vec(d);
+    for (int i = 0; i < d; ++i) {
+      valVel_vec[i] = scalar_view("valVel_vec", c, p);
+      feVel_->evaluateValue(valVel_vec[i], U[i]);
+    }
+    scalar_view valPres_eval("valPres_eval", c, p);
+    scalar_view valHeat_eval("valHeat_eval", c, p);
+    fePrs_->evaluateValue(valPres_eval, U[d]);
+    feThr_->evaluateValue(valHeat_eval, U[d+1]);
+    // Evaluate/interpolate gradient of finite element fields on cells.
+    std::vector<scalar_view> gradVel_vec(d);
+    for (int i = 0; i < d; ++i) {
+      gradVel_vec[i] = scalar_view("gradVel_vec", c, p, d);
+      feVel_->evaluateGradient(gradVel_vec[i], U[i]);
+    }
+    scalar_view gradHeat_eval("gradHeat_eval", c, p, d);
+    feThr_->evaluateGradient(gradHeat_eval, U[d+1]);
+
+    // Assemble the velocity vector and its divergence.
+    scalar_view valVel_eval("valVel_eval", c, p, d);
+    scalar_view divVel_eval("divVel_eval", c, p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        divVel_eval(i,j) = static_cast<Real>(0);
+        for (int k = 0; k < d; ++k) {
+          valVel_eval(i,j,k) = (valVel_vec[k])(i,j);
+          divVel_eval(i,j)  += (gradVel_vec[k])(i,j,k);
+        }
+      }
+    }
+
+    /**************************************************************************/
+    /*** NAVIER STOKES ********************************************************/
+    /**************************************************************************/
+    std::vector<scalar_view> nuGradVel_vec(d), valVelDotgradVel_vec(d);
+    for (int i = 0; i < d; ++i) {
+      // Multiply velocity gradients with viscosity.
+      nuGradVel_vec[i] = scalar_view("nuGradVel_vec", c, p, d);
+      fst::tensorMultiplyDataData(nuGradVel_vec[i], nu, gradVel_vec[i]);
+      // Compute nonlinear terms in the Navier-Stokes equations.
+      valVelDotgradVel_vec[i] = scalar_view("valVelDotgradVel_vec", c, p);
+      fst::dotMultiplyDataData(valVelDotgradVel_vec[i], valVel_eval, gradVel_vec[i]);
+    }
+    // Negative pressure
+    rst::scale(valPres_eval,static_cast<Real>(-1));
+    // Compute gravitational force.
+    scalar_view gravForce("gravForce", c, p);
+    fst::scalarMultiplyDataData(gravForce, grav, valHeat_eval);
+
+    /**************************************************************************/
+    /*** THERMAL **************************************************************/
+    /**************************************************************************/
+    scalar_view kappaGradHeat_eval("kappaGradHeat_eval", c, p, d);
+    scalar_view velDotGradHeat_eval("velDotGradHeat_eval", c, p);
+    // Multiply temperature gradient with conductivity
+    fst::tensorMultiplyDataData(kappaGradHeat_eval, kappa, gradHeat_eval);
+    // Dot multiply scaled velocity with temperature gradient
+    fst::dotMultiplyDataData(velDotGradHeat_eval, valVel_eval, gradHeat_eval);
+
+    /**************************************************************************/
+    /*** EVALUATE WEAK FORM OF RESIDUAL ***************************************/
+    /**************************************************************************/
+    // Velocity equation.
+    for (int i = 0; i < d; ++i) {
+      fst::integrate(R[i],nuGradVel_vec[i],feVel_->gradNdetJ(),false);
+      fst::integrate(R[i],valVelDotgradVel_vec[i],feVel_->NdetJ(),true);
+      fst::integrate(R[i],valPres_eval,feVel_->DNDdetJ(i),true);
+    }
+    // Apply gravitational force.
+    fst::integrate(R[d-1],gravForce,feVel_->NdetJ(),true);
+    // Pressure equation.
+    fst::integrate(R[d],divVel_eval,fePrs_->NdetJ(),false);
+    rst::scale(R[d],static_cast<Real>(-1));
+    // Heat equation.
+    fst::integrate(R[d+1],kappaGradHeat_eval,feThr_->gradNdetJ(),false);
+    fst::integrate(R[d+1],velDotGradHeat_eval,feThr_->NdetJ(),true);
+
+    /**************************************************************************/
+    /*** APPLY BOUNDARY CONDITIONS ********************************************/
+    /**************************************************************************/
+    // THERMAL BOUNDARIES
+    // -->        Control boundaries: i=0,1,2,3
+    // -->        Substrate boundary: i=4
+    // -->        Inflow boundary:    i=6
+    // -->        Outflow boundaries: i=5,7
+    // VELOCITY BOUNDARIES
+    // -->        No slip boundaries: i=0,1,2,3,4
+    // -->        Inflow Boundary:    i=6
+    // -->        Outflow Boundaries: i=5,7
+    // PRESSURE BOUNDARIES
+    // -->              Pressure Pin: i=?
+    int numSideSets = bdryCellLocIds_.size();
+    const int numCubPerSide = bdryCub_->getNumPoints();
+    if (numSideSets > 0) {
+      // ROBIN CONDITIONS
+      for (int i = 0; i < numSideSets; ++i) {
+        // Control boundaries
+        if (i==0 || i==1 || i==2 || i==3) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            if (numCellsSide) {
+              // Get U and Z coefficients on Robin boundary
+              scalar_view u_coeff_bdry = getThermalBoundaryCoeff(U[d+1], i, j);
+              scalar_view z_coeff_bdry = getThermalBoundaryCoeff(Z[d+1], i, j);
+              // Evaluate U and Z on FE basis
+              scalar_view valU_eval_bdry("valU_eval_bdry", numCellsSide, numCubPerSide);
+              scalar_view valZ_eval_bdry("valZ_eval_bdry", numCellsSide, numCubPerSide);
+              feThrBdry_[i][j]->evaluateValue(valU_eval_bdry, u_coeff_bdry);
+              feThrBdry_[i][j]->evaluateValue(valZ_eval_bdry, z_coeff_bdry);
+              // Compute Stefan-Boltzmann residual
+              scalar_view robinVal("robinVal", numCellsSide, numCubPerSide);
+              computeThermalRobin(robinVal,valU_eval_bdry,valZ_eval_bdry,i,j,0);
+              // Evaluate boundary integral
+              scalar_view robinRes("robinRes", numCellsSide, fh);
+              fst::integrate(robinRes,robinVal,feThrBdry_[i][j]->NdetJ(),false);
+              for (int k = 0; k < numCellsSide; ++k) {
+                int cidx = bdryCellLocIds_[i][j][k];
+                // Thermal boundary conditions.
+                for (int l = 0; l < fh; ++l) {
+                  R[d+1](cidx,l) += robinRes(k,l);
+                }
+              }
+            }
+          }
+        }
+      }
+      // DIRICHLET CONDITIONS
+      computeDirichlet();
+      // Velocity Boundary Conditions
+      for (int i = 0; i < numSideSets; ++i) {
+        if (i==0 || i==1 || i==2 || i==3 || i==4 || i==5 || i==6 || i==7) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numVBdryDofs = fvidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numVBdryDofs; ++l) {
+                for (int m = 0; m < d; ++m) {
+                  (R[m])(cidx,fvidx_[j][l]) = (U[m])(cidx,fvidx_[j][l]) - (bdryCellVDofValues_[i][j])(k,fvidx_[j][l],m);
+                }
+              }
+            }
+          }
+        }
+        // Thermal Boundary Conditions
+        if ( (i==4) || (i==6) ) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();   // Number of sides per cell
+          for (int j = 0; j < numLocalSideIds; ++j) {        // Loop over sides of cell: Quad = {0, 1, 2, 3}
+            int numCellsSide = bdryCellLocIds_[i][j].size(); // Number of cells with side j
+            int numHBdryDofs = fhidx_[j].size();             // Number of thermal boundary DOFs
+            for (int k = 0; k < numCellsSide; ++k) {         // Loop over cells with side j
+              int cidx = bdryCellLocIds_[i][j][k];           // Cell index
+              for (int l = 0; l < numHBdryDofs; ++l) {       // Loop over all fields of cell k on side j
+                (R[d+1])(cidx,fhidx_[j][l]) = (U[d+1])(cidx,fhidx_[j][l]) - (bdryCellTDofValues_[i][j])(k,fhidx_[j][l]);
+              }
+            }
+          }
+        }
+        // Pressure pinning
+        if (i==8 && pinPressure_) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int l = 0, cidx = bdryCellLocIds_[i][j][k];
+              (R[d])(cidx,fpidx_[j][l]) = (U[d])(cidx,fpidx_[j][l]) - static_cast<Real>(0);
+            }
+          }
+        }
+      }
+    }
+
+    // Combine the residuals.
+    fieldHelper_->combineFieldCoeff(res, R);
+  }
+
+  void Jacobian_1(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    int c  = u_coeff.extent_int(0);
+    int p  = cellCub_->getNumPoints();
+    int fv = basisPtrVel_->getCardinality();
+    int fp = basisPtrPrs_->getCardinality();
+    int fh = basisPtrThr_->getCardinality();
+    int d  = cellCub_->getDimension();
+ 
+    // Initialize jacobians.
+    std::vector<std::vector<scalar_view>> J(d+2);
+    for (int i = 0; i < d+2; ++i)
+      J[i].resize(d+2);
+    for (int i = 0; i < d; ++i) {
+      for (int j = 0; j < d; ++j)
+        J[i][j] = scalar_view("jac", c, fv, fv);
+      J[d][i]   = scalar_view("jac", c, fp, fv);
+      J[i][d]   = scalar_view("jac", c, fv, fp);
+      J[d+1][i] = scalar_view("jac", c, fh, fv);
+      J[i][d+1] = scalar_view("jac", c, fv, fh);
+    }
+    J[d][d]     = scalar_view("jac", c, fp, fp);
+    J[d+1][d]   = scalar_view("jac", c, fh, fp);
+    J[d][d+1]   = scalar_view("jac", c, fp, fh);
+    J[d+1][d+1] = scalar_view("jac", c, fh, fh);
+
+    // Split u_coeff into components.
+    std::vector<scalar_view> U, Z;
+    fieldHelper_->splitFieldCoeff(U, u_coeff);
+    fieldHelper_->splitFieldCoeff(Z, z_coeff);
+
+    // Evaluate problem coefficients at quadrature points.
+    scalar_view nu("nu", c, p);
+    scalar_view grav("grav", c, p);
+    scalar_view kappa("kappa", c, p);
+    computeCoefficients(nu,grav,kappa);
+
+    // Evaluate/interpolate finite element fields on cells.
+    std::vector<scalar_view> valVel_vec(d);
+    for (int i = 0; i < d; ++i) {
+      valVel_vec[i] = scalar_view("valVel_vec", c, p);
+      feVel_->evaluateValue(valVel_vec[i], U[i]);
+    }
+    // Evaluate/interpolate gradient of finite element fields on cells.
+    std::vector<scalar_view> gradVel_vec(d);
+    for (int i = 0; i < d; ++i) {
+      gradVel_vec[i] = scalar_view("gradVel_vec", c, p, d);
+      feVel_->evaluateGradient(gradVel_vec[i], U[i]);
+    }
+    scalar_view gradHeat_eval("gradHeat_eval", c, p, d);
+    feThr_->evaluateGradient(gradHeat_eval, U[d+1]);
+
+    // Assemble the velocity vector and its divergence.
+    scalar_view valVel_eval("valVel_eval", c, p, d);
+    std::vector<std::vector<scalar_view>> dVel_vec(d);
+    std::vector<scalar_view> gradHeat_vec(d);
+    for (int k = 0; k < d; ++k) {
+      dVel_vec[k].resize(d);
+      for (int l = 0; l < d; ++l) {
+        dVel_vec[k][l] = scalar_view("dVel_vec", c, p);
+        for (int i = 0; i < c; ++i) {
+          for (int j = 0; j < p; ++j) {
+            (dVel_vec[k][l])(i,j) = (gradVel_vec[k])(i,j,l);
+          }
+        }
+      }
+      gradHeat_vec[k] = scalar_view("gradHeat_vec", c, p);
+      for (int i = 0; i < c; ++i) {
+        for (int j = 0; j < p; ++j) {
+          valVel_eval(i,j,k)  = (valVel_vec[k])(i,j);
+          (gradHeat_vec[k])(i,j) = (gradHeat_eval)(i,j,k);
+        }
+      }
+    }
+
+    /**************************************************************************/
+    /*** NAVIER STOKES ********************************************************/
+    /**************************************************************************/
+    std::vector<std::vector<scalar_view>> dVelPhi_vec(d);
+    for (int i = 0; i < d; ++i) {
+      // Compute nonlinear terms in the Navier-Stokes equations.
+      dVelPhi_vec[i].resize(d);
+      for (int j = 0; j < d; ++j) {
+        dVelPhi_vec[i][j] = scalar_view("dVelPhi_vec", c, fv, p);
+        fst::scalarMultiplyDataField(dVelPhi_vec[i][j], dVel_vec[i][j], feVel_->N());
+      }
+    }
+    // Multiply velocity gradients with viscosity.
+    scalar_view nuGradPhi_eval("nuGradPhi_eval", c, fv, p, d);
+    fst::tensorMultiplyDataField(nuGradPhi_eval, nu, feVel_->gradN());
+    // Compute nonlinear terms in the Navier-Stokes equations.
+    scalar_view valVelDotgradPhi_eval("vaVelDotgradPhi_eval", c, fv, p);
+    fst::dotMultiplyDataField(valVelDotgradPhi_eval, valVel_eval, feVel_->gradN());
+    // Negative pressure basis.
+    scalar_view negPrsPhi("negPrsPhi", c, fp, p);
+    Kokkos::deep_copy(negPrsPhi, fePrs_->N());
+    rst::scale(negPrsPhi,static_cast<Real>(-1));
+    // Compute gravity Jacobian
+    scalar_view gravPhi("gravPhi", c, fh, p);
+    fst::scalarMultiplyDataField(gravPhi, grav, feThr_->N());
+
+    /**************************************************************************/
+    /*** THERMAL **************************************************************/
+    /**************************************************************************/
+    scalar_view kappaGradPhi("kappaGradPhi", c, fh, p, d);
+    scalar_view VelPhi("VelPhi", c, fh, p);
+    std::vector<scalar_view> dHeatPhi(d);
+    // Compute kappa times gradient of basis.
+    fst::tensorMultiplyDataField(kappaGradPhi, kappa, feThr_->gradN());
+    // Compute scaled velocity.
+    fst::dotMultiplyDataField(VelPhi, valVel_eval, feThr_->gradN());
+    // Thermal-velocity Jacobians.
+    for (int i = 0; i < d; ++i) {
+      dHeatPhi[i] = scalar_view("dHeatPhi", c, fh, p);
+      fst::scalarMultiplyDataField(dHeatPhi[i], gradHeat_vec[i], feThr_->N());
+    }
+
+    /*** Evaluate weak form of the Jacobian. ***/
+    // X component of velocity equation.
+    for (int i = 0; i < d; ++i) {
+      // Velocity components
+      for (int j = 0; j < d; ++j)
+        fst::integrate(J[i][j],feVel_->NdetJ(),dVelPhi_vec[i][j],false);
+      fst::integrate(J[i][i],nuGradPhi_eval,feVel_->gradNdetJ(),true);
+      fst::integrate(J[i][i],feVel_->NdetJ(),valVelDotgradPhi_eval,true);
+      // Pressure components
+      fst::integrate(J[i][d],feVel_->DNDdetJ(i),negPrsPhi,false);
+      fst::integrate(J[d][i],fePrs_->NdetJ(),feVel_->DND(i),false);
+      rst::scale(J[d][i],static_cast<Real>(-1));
+      // Thermal components
+      fst::integrate(J[d+1][i],dHeatPhi[i],feVel_->NdetJ(),false);
+    }
+    // Gravitational component
+    fst::integrate(J[d-1][d+1],feVel_->NdetJ(),gravPhi,false);
+    // Thermal components
+    fst::integrate(J[d+1][d+1],kappaGradPhi,feThr_->gradNdetJ(),false);
+    fst::integrate(J[d+1][d+1],feThr_->NdetJ(),VelPhi,true);
+
+    // APPLY BOUNDARY CONDITIONS
+    // THERMAL BOUNDARIES
+    // -->        Control boundaries: i=0,1,2,3
+    // -->        Substrate boundary: i=4
+    // -->        Inflow boundary:    i=6
+    // -->        Outflow boundaries: i=5,7
+    // VELOCITY BOUNDARIES
+    // -->        No slip boundaries: i=0,1,2,3,4
+    // -->        Inflow Boundary:    i=6
+    // -->        Outflow Boundaries: i=5,7
+    // PRESSURE BOUNDARIES
+    // -->              Pressure Pin: i=?
+    int numSideSets = bdryCellLocIds_.size();
+    const int numCubPerSide = bdryCub_->getNumPoints();
+    if (numSideSets > 0) {
+      // ROBIN CONDITIONS
+      for (int i = 0; i < numSideSets; ++i) {
+        // Control boundaries
+        if (i==0 || i==1 || i==2 || i==3) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            if (numCellsSide) {
+              // Get U and Z coefficients on Robin boundary
+              scalar_view u_coeff_bdry = getThermalBoundaryCoeff(U[d+1], i, j);
+              scalar_view z_coeff_bdry = getThermalBoundaryCoeff(Z[d+1], i, j);
+              // Evaluate U and Z on FE basis
+              scalar_view valU_eval_bdry("valU_eval_bdry", numCellsSide, numCubPerSide);
+              scalar_view valZ_eval_bdry("valZ_eval_bdry", numCellsSide, numCubPerSide);
+              feThrBdry_[i][j]->evaluateValue(valU_eval_bdry, u_coeff_bdry);
+              feThrBdry_[i][j]->evaluateValue(valZ_eval_bdry, z_coeff_bdry);
+              // Compute Stefan-Boltzmann residual
+              scalar_view robinVal1("robinVal1", numCellsSide, numCubPerSide);
+              scalar_view robinVal("robinVal", numCellsSide, fh, numCubPerSide);
+              computeThermalRobin(robinVal1,valU_eval_bdry,valZ_eval_bdry,i,j,1,0);
+              fst::scalarMultiplyDataField(robinVal,robinVal1,feThrBdry_[i][j]->N());
+              // Evaluate boundary integral
+              scalar_view robinJac("robinJac", numCellsSide, fh, fh);
+              fst::integrate(robinJac,robinVal,feThrBdry_[i][j]->NdetJ(),false);
+              for (int k = 0; k < numCellsSide; ++k) {
+                int cidx = bdryCellLocIds_[i][j][k];
+                for (int l = 0; l < fh; ++l) {
+                  for (int m = 0; m < fh; ++m) {
+                    (J[d+1][d+1])(cidx,l,m) += robinJac(k,l,m);
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+      // DIRICHLET CONDITIONS
+      for (int i = 0; i < numSideSets; ++i) {
+        // Velocity Boundary Conditions
+        if (i==0 || i==1 || i==2 || i==3 || i==4 || i==5 || i==6 || i==7) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numVBdryDofs = fvidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numVBdryDofs; ++l) {
+                for (int m = 0; m < fv; ++m) {
+                  for (int n = 0; n < d; ++n) {
+                    for (int q = 0; q < d; ++q) {
+                      (J[n][q])(cidx,fvidx_[j][l],m) = static_cast<Real>(0);
+                    }
+                    (J[n][n])(cidx,fvidx_[j][l],fvidx_[j][l]) = static_cast<Real>(1);
+                  }
+                }
+                for (int m = 0; m < d; ++m) {
+                  for (int n = 0; n < fp; ++n) {
+                    (J[m][d])(cidx,fvidx_[j][l],n) = static_cast<Real>(0);
+                  }
+                  for (int n = 0; n < fh; ++n) {
+                    (J[m][d+1])(cidx,fvidx_[j][l],n) = static_cast<Real>(0);
+                  }
+                }
+              }
+            }
+          }
+        }
+        // Thermal boundary conditions
+        if ( (i==4) || (i==6) ) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numHBdryDofs = fhidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numHBdryDofs; ++l) {
+                for (int m = 0; m < fv; ++m) {
+                  for (int n = 0; n < d; ++n) {
+                    (J[d+1][n])(cidx,fhidx_[j][l],m) = static_cast<Real>(0);
+                  }
+                }
+                for (int m = 0; m < fp; ++m) {
+                  (J[d+1][d])(cidx,fhidx_[j][l],m) = static_cast<Real>(0);
+                }
+                for (int m = 0; m < fh; ++m) {
+                  (J[d+1][d+1])(cidx,fhidx_[j][l],m) = static_cast<Real>(0);
+                }
+                (J[d+1][d+1])(cidx,fhidx_[j][l],fhidx_[j][l]) = static_cast<Real>(1);
+              }
+            }
+          }
+        }
+        // Pressure pinning
+        if (i==8 && pinPressure_) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int l = 0, cidx = bdryCellLocIds_[i][j][k];
+              for (int m = 0; m < fv; ++m) {
+                for (int n = 0; n < d; ++n) {
+                  (J[d][n])(cidx,fpidx_[j][l],m) = static_cast<Real>(0);
+                }
+              }
+              for (int m = 0; m < fp; ++m) {
+                (J[d][d])(cidx,fpidx_[j][l],m) = static_cast<Real>(0);
+              }
+              (J[d][d])(cidx,fpidx_[j][l],fpidx_[j][l]) = static_cast<Real>(1);
+              for (int m = 0; m < fh; ++m) {
+                (J[d][d+1])(cidx,fpidx_[j][l],m) = static_cast<Real>(0);
+              }
+            }
+          }
+        }
+      }
+    }
+
+    // Combine the jacobians.
+    fieldHelper_->combineFieldCoeff(jac, J);
+  }
+
+
+  void Jacobian_2(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    int c  = u_coeff.extent_int(0);
+    int fv = basisPtrVel_->getCardinality();
+    int fp = basisPtrPrs_->getCardinality();
+    int fh = basisPtrThr_->getCardinality();
+    int d  = cellCub_->getDimension();
+ 
+    // Initialize jacobians.
+    std::vector<std::vector<scalar_view>> J(d+2);
+    for (int i = 0; i < d+2; ++i)
+      J[i].resize(d+2);
+    for (int i = 0; i < d; ++i) {
+      for (int j = 0; j < d; ++j)
+        J[i][j] = scalar_view("jac", c, fv, fv);
+      J[d][i]   = scalar_view("jac", c, fp, fv);
+      J[i][d]   = scalar_view("jac", c, fv, fp);
+      J[d+1][i] = scalar_view("jac", c, fh, fv);
+      J[i][d+1] = scalar_view("jac", c, fv, fh);
+    }
+    J[d][d]     = scalar_view("jac", c, fp, fp);
+    J[d+1][d]   = scalar_view("jac", c, fh, fp);
+    J[d][d+1]   = scalar_view("jac", c, fp, fh);
+    J[d+1][d+1] = scalar_view("jac", c, fh, fh);
+
+    // Split u_coeff into components.
+    std::vector<scalar_view> U, Z;
+    fieldHelper_->splitFieldCoeff(U, u_coeff);
+    fieldHelper_->splitFieldCoeff(Z, z_coeff);
+
+    // APPLY BOUNDARY CONDITIONS
+    // THERMAL BOUNDARIES
+    // -->        Control boundaries: i=0,1,2,3
+    // -->        Substrate boundary: i=4
+    // -->        Inflow boundary:    i=6
+    // -->        Outflow boundaries: i=5,7
+    // VELOCITY BOUNDARIES
+    // -->        No slip boundaries: i=0,1,2,3,4
+    // -->        Inflow Boundary:    i=6
+    // -->        Outflow Boundaries: i=5,7
+    // PRESSURE BOUNDARIES
+    // -->              Pressure Pin: i=?
+    int numSideSets = bdryCellLocIds_.size();
+    const int numCubPerSide = bdryCub_->getNumPoints();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        // Control boundaries
+        if (i==0 || i==1 || i==2 || i==3) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            if (numCellsSide) {
+              // Get U and Z coefficients on Robin boundary
+              scalar_view u_coeff_bdry = getThermalBoundaryCoeff(U[d+1], i, j);
+              scalar_view z_coeff_bdry = getThermalBoundaryCoeff(Z[d+1], i, j);
+              // Evaluate U and Z on FE basis
+              scalar_view valU_eval_bdry("valU_eval_bdry", numCellsSide, numCubPerSide);
+              scalar_view valZ_eval_bdry("valZ_eval_bdry", numCellsSide, numCubPerSide);
+              feThrBdry_[i][j]->evaluateValue(valU_eval_bdry, u_coeff_bdry);
+              feThrBdry_[i][j]->evaluateValue(valZ_eval_bdry, z_coeff_bdry);
+              // Compute Stefan-Boltzmann residual
+              scalar_view robinVal1("robinVal1", numCellsSide, numCubPerSide);
+              scalar_view robinVal("robinVal", numCellsSide, fh, numCubPerSide);
+              computeThermalRobin(robinVal1,valU_eval_bdry,valZ_eval_bdry,i,j,1,1);
+              fst::scalarMultiplyDataField(robinVal,robinVal1,feThrBdry_[i][j]->N());
+              // Evaluate boundary integral
+              scalar_view robinJac("robinJac", numCellsSide, fh, fh);
+              fst::integrate(robinJac,robinVal,feThrBdry_[i][j]->NdetJ(),false);
+              for (int k = 0; k < numCellsSide; ++k) {
+                int cidx = bdryCellLocIds_[i][j][k];
+                for (int l = 0; l < fh; ++l) {
+                  for (int m = 0; m < fh; ++m) {
+                    (J[d+1][d+1])(cidx,l,m) += robinJac(k,l,m);
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+      // DIRICHLET CONDITIONS
+      for (int i = 0; i < numSideSets; ++i) {
+        // Velocity Boundary Conditions
+        if (i==0 || i==1 || i==2 || i==3 || i==4 || i==5 || i==6 || i==7) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numVBdryDofs = fvidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numVBdryDofs; ++l) {
+                for (int m = 0; m < fv; ++m) {
+                  for (int n = 0; n < d; ++n) {
+                    for (int p = 0; p < d; ++p) {
+                      (J[n][p])(cidx,fvidx_[j][l],m) = static_cast<Real>(0);
+                    }
+                  }
+                }
+                for (int m = 0; m < d; ++m) {
+                  for (int n = 0; n < fp; ++n) {
+                    (J[m][d])(cidx,fvidx_[j][l],n) = static_cast<Real>(0);
+                  }
+                  for (int n = 0; n < fh; ++n) {
+                    (J[m][d+1])(cidx,fvidx_[j][l],n) = static_cast<Real>(0);
+                  }
+                }
+              }
+            }
+          }
+        }
+        // Thermal boundary conditions
+        if ( (i==4) || (i==6) ) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numHBdryDofs = fhidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numHBdryDofs; ++l) {
+                for (int m = 0; m < fv; ++m) {
+                  for (int n = 0; n < d; ++n) {
+                    (J[d+1][n])(cidx,fhidx_[j][l],m) = static_cast<Real>(0);
+                  }
+                }
+                for (int m = 0; m < fp; ++m) {
+                  (J[d+1][d])(cidx,fhidx_[j][l],m) = static_cast<Real>(0);
+                }
+                for (int m = 0; m < fh; ++m) {
+                  (J[d+1][d+1])(cidx,fhidx_[j][l],m) = static_cast<Real>(0);
+                }
+              }
+            }
+          }
+        }
+        // Pressure pinning
+        if (i==8 && pinPressure_) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int l = 0, cidx = bdryCellLocIds_[i][j][k];
+              for (int m = 0; m < fv; ++m) {
+                for (int n = 0; n < d; ++n) {
+                  (J[d][n])(cidx,fpidx_[j][l],m) = static_cast<Real>(0);
+                }
+              }
+              for (int m = 0; m < fp; ++m) {
+                (J[d][d])(cidx,fpidx_[j][l],m) = static_cast<Real>(0);
+              }
+              for (int m = 0; m < fh; ++m) {
+                (J[d][d+1])(cidx,fpidx_[j][l],m) = static_cast<Real>(0);
+              }
+            }
+          }
+        }
+      }
+    }
+
+    // Combine the jacobians.
+    fieldHelper_->combineFieldCoeff(jac, J);
+  }
+
+  void Hessian_11(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+//    throw Exception::NotImplemented("");
+    // Retrieve dimensions.
+    int c  = u_coeff.extent_int(0);
+    int p  = cellCub_->getNumPoints();
+    int fv = basisPtrVel_->getCardinality();
+    int fp = basisPtrPrs_->getCardinality();
+    int fh = basisPtrThr_->getCardinality();
+    int d  = cellCub_->getDimension();
+ 
+    // Initialize jacobians.
+    std::vector<std::vector<scalar_view>> H(d+2);
+    for (int i = 0; i < d+2; ++i)
+      H[i].resize(d+2);
+    for (int i = 0; i < d; ++i) {
+      for (int j = 0; j < d; ++j)
+        H[i][j] = scalar_view("hess", c, fv, fv);
+    }
+    for (int i = 0; i < d; ++i) {
+      H[d][i]   = scalar_view("hess", c, fp, fv);
+      H[i][d]   = scalar_view("hess", c, fv, fp);
+      H[d+1][i] = scalar_view("hess", c, fh, fv);
+      H[i][d+1] = scalar_view("hess", c, fv, fh);
+    }
+    H[d][d]     = scalar_view("hess", c, fp, fp);
+    H[d+1][d]   = scalar_view("hess", c, fh, fp);
+    H[d][d+1]   = scalar_view("hess", c, fp, fh);
+    H[d+1][d+1] = scalar_view("hess", c, fh, fh);
+
+    // Split l_coeff into components.
+    std::vector<scalar_view> L;
+    fieldHelper_->splitFieldCoeff(L, l_coeff);
+
+    // APPLY DIRICHLET CONDITIONS
+    // THERMAL BOUNDARIES
+    // -->        Control boundaries: i=0,1,2,3
+    // -->        Substrate boundary: i=4
+    // -->        Inflow boundary:    i=6
+    // -->        Outflow boundaries: i=5,7
+    // VELOCITY BOUNDARIES
+    // -->        No slip boundaries: i=0,1,2,3,4
+    // -->        Inflow Boundary:    i=6
+    // -->        Outflow Boundaries: i=5,7
+    // PRESSURE BOUNDARIES
+    // -->              Pressure Pin: i=?
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      // DIRICHLET CONDITIONS
+      for (int i = 0; i < numSideSets; ++i) {
+        // Velocity boundary conditions
+        if (i==0 || i==1 || i==2 || i==3 || i==4 || i==5 || i==6 || i==7) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numVBdryDofs = fvidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numVBdryDofs; ++l) {
+                for (int m = 0; m < d; ++m) {
+                  (L[m])(cidx,fvidx_[j][l]) = static_cast<Real>(0);
+                }
+              }
+            }
+          }
+        }
+        // Thermal boundaries
+        if ( (i==4) || (i==6) ) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numHBdryDofs = fhidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numHBdryDofs; ++l) {
+                (L[d+1])(cidx,fhidx_[j][l]) = static_cast<Real>(0);
+              }
+            }
+          }
+        }
+        // Pressure pinning
+        if (i==8 && pinPressure_) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int l = 0, cidx = bdryCellLocIds_[i][j][k];
+              (L[d])(cidx,fpidx_[j][l]) = static_cast<Real>(0);
+            }
+          }
+        }
+      }
+    }
+
+    // Evaluate/interpolate finite element fields on cells.
+    std::vector<scalar_view> valVel_vec(d);
+    for (int i = 0; i < d; ++i) {
+      valVel_vec[i] = scalar_view("valVel_vec", c, p);
+      feVel_->evaluateValue(valVel_vec[i], L[i]);
+    }
+    scalar_view valHeat_eval("valHeat_eval", c, p);
+    feThr_->evaluateValue(valHeat_eval, L[d+1]);
+
+    // Compute nonlinear terms in the Navier-Stokes equations.
+    std::vector<scalar_view> valVelPhi_vec(d);
+    for (int i = 0; i < d; ++i) {
+      valVelPhi_vec[i] = scalar_view("valVelPhi_vec", c, fv, p);
+      fst::scalarMultiplyDataField(valVelPhi_vec[i], valVel_vec[i], feVel_->N());
+    }
+
+    // Compute nonlinear terms in the thermal equation.
+    scalar_view valHeatVelPhi("valHeatVelPhi", c, fv, p);
+    fst::scalarMultiplyDataField(valHeatVelPhi, valHeat_eval, feVel_->N());
+
+    /*** Evaluate weak form of the Hessian. ***/
+    for (int i = 0; i < d; ++i) {
+      // velocity equation.
+      for (int j = 0; j < d; ++j) {
+        fst::integrate(H[i][j],valVelPhi_vec[j],feVel_->DNDdetJ(i),false);
+        fst::integrate(H[i][j],feVel_->DNDdetJ(j),valVelPhi_vec[i],true);
+      }
+      fst::integrate(H[i][d+1],valHeatVelPhi,feThr_->DNDdetJ(i),false);
+      // Thermal equation.
+      fst::integrate(H[d+1][i],feThr_->DNDdetJ(i),valHeatVelPhi,false);
+    }
+
+    // Combine the Hessians.
+    fieldHelper_->combineFieldCoeff(hess, H);
+  }
+
+  void Hessian_12(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_ThermalFluids::Hessian_12): Hessian is zero.");
+  }
+
+  void Hessian_21(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_ThermalFluids::Hessian_21): Hessian is zero.");
+  }
+
+  void Hessian_22(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_ThermalFluids::Hessian_22): Hessian is zero.");
+  }
+
+  void RieszMap_1(scalar_view & riesz) override {
+    // Retrieve dimensions.
+    int c  = feVel_->N().extent_int(0);
+    int fv = basisPtrVel_->getCardinality();
+    int fp = basisPtrPrs_->getCardinality();
+    int fh = basisPtrThr_->getCardinality();
+    int d  = cellCub_->getDimension();
+ 
+    // Initialize jacobians.
+    std::vector<std::vector<scalar_view>> J(d+2);
+    for (int i = 0; i < d+2; ++i)
+      J[i].resize(d+2);
+    for (int i = 0; i < d; ++i) {
+      for (int j = 0; j < d; ++j)
+        J[i][j] = scalar_view("riesz1", c, fv, fv);
+      J[d][i]   = scalar_view("riesz1", c, fp, fv);
+      J[i][d]   = scalar_view("riesz1", c, fv, fp);
+      J[d+1][i] = scalar_view("riesz1", c, fh, fv);
+      J[i][d+1] = scalar_view("riesz1", c, fv, fh);
+    }
+    J[d][d]     = scalar_view("riesz1", c, fp, fp);
+    J[d+1][d]   = scalar_view("riesz1", c, fh, fp);
+    J[d][d+1]   = scalar_view("riesz1", c, fp, fh);
+    J[d+1][d+1] = scalar_view("riesz1", c, fh, fh);
+
+    for (int i = 0; i < d; ++i) {
+      Kokkos::deep_copy(J[i][i], feVel_->stiffMat());
+      rst::add(J[i][i],feVel_->massMat());
+    }
+    Kokkos::deep_copy(J[d][d], fePrs_->massMat());
+    Kokkos::deep_copy(J[d+1][d+1], feThr_->stiffMat());
+    rst::add(J[d+1][d+1], feThr_->massMat());
+
+    // Combine the jacobians.
+    fieldHelper_->combineFieldCoeff(riesz, J);
+  }
+
+  void RieszMap_2(scalar_view & riesz) override {
+    // Retrieve dimensions.
+    int c  = feVel_->N().extent_int(0);
+    int fv = basisPtrVel_->getCardinality();
+    int fp = basisPtrPrs_->getCardinality();
+    int fh = basisPtrThr_->getCardinality();
+    int d  = cellCub_->getDimension();
+ 
+    // Initialize jacobians.
+    std::vector<std::vector<scalar_view>> J(d+2);
+    for (int i = 0; i < d+2; ++i)
+      J[i].resize(d+2);
+    for (int i = 0; i < d; ++i) {
+      for (int j = 0; j < d; ++j)
+        J[i][j] = scalar_view("riesz2", c, fv, fv);
+      J[d][i]   = scalar_view("riesz2", c, fp, fv);
+      J[i][d]   = scalar_view("riesz2", c, fv, fp);
+      J[d+1][i] = scalar_view("riesz2", c, fh, fv);
+      J[i][d+1] = scalar_view("riesz2", c, fv, fh);
+    }
+    J[d][d]     = scalar_view("riesz2", c, fp, fp);
+    J[d+1][d]   = scalar_view("riesz2", c, fh, fp);
+    J[d][d+1]   = scalar_view("riesz2", c, fp, fh);
+    J[d+1][d+1] = scalar_view("riesz2", c, fh, fh);
+
+
+    for (int i = 0; i < d; ++i)
+      Kokkos::deep_copy(J[i][i], feVel_->massMat());
+    Kokkos::deep_copy(J[d][d], fePrs_->massMat());
+    Kokkos::deep_copy(J[d+1][d+1], feThr_->massMat());
+
+    // Combine the jacobians.
+    fieldHelper_->combineFieldCoeff(riesz, J);
+  }
+
+  std::vector<basis_ptr> getFields() override {
+    return basisPtrs_;
+  }
+
+  void setCellNodes(const scalar_view &volCellNodes,
+                    const std::vector<std::vector<scalar_view>> &bdryCellNodes,
+                    const std::vector<std::vector<std::vector<int>>> &bdryCellLocIds) override {
+    volCellNodes_ = volCellNodes;
+    bdryCellNodes_ = bdryCellNodes;
+    bdryCellLocIds_ = bdryCellLocIds;
+    // Finite element definition.
+    feVel_ = ROL::makePtr<fe_type>(volCellNodes_,basisPtrVel_,cellCub_);
+    fePrs_ = ROL::makePtr<fe_type>(volCellNodes_,basisPtrPrs_,cellCub_);
+    feThr_ = ROL::makePtr<fe_type>(volCellNodes_,basisPtrThr_,cellCub_);
+    // Get boundary degrees of freedom.
+    fvidx_ = feVel_->getBoundaryDofs();
+    fpidx_ = fePrs_->getBoundaryDofs();
+    fhidx_ = feThr_->getBoundaryDofs();
+    // Construct boundary FEs
+    const int numSideSets = bdryCellNodes.size();
+    feVelBdry_.resize(numSideSets);
+    fePrsBdry_.resize(numSideSets);
+    feThrBdry_.resize(numSideSets);
+    for (int i = 0; i < numSideSets; ++i) {
+      int numLocSides = bdryCellNodes[i].size();
+      feVelBdry_[i].resize(numLocSides);
+      fePrsBdry_[i].resize(numLocSides);
+      feThrBdry_[i].resize(numLocSides);
+      for (int j = 0; j < numLocSides; ++j) {
+        if (bdryCellNodes[i][j] != scalar_view()) {
+          feVelBdry_[i][j] = ROL::makePtr<fe_type>(bdryCellNodes[i][j],basisPtrVel_,bdryCub_,j);
+          fePrsBdry_[i][j] = ROL::makePtr<fe_type>(bdryCellNodes[i][j],basisPtrPrs_,bdryCub_,j);
+          feThrBdry_[i][j] = ROL::makePtr<fe_type>(bdryCellNodes[i][j],basisPtrThr_,bdryCub_,j);
+        }
+      }
+    }
+  }
+
+  void setFieldPattern(const std::vector<std::vector<int>> & fieldPattern) override {
+    fieldPattern_ = fieldPattern;
+    fieldHelper_ = ROL::makePtr<FieldHelper<Real,DeviceType>>(numFields_, numDofs_, numFieldDofs_, fieldPattern_);
+  }
+
+  const ROL::Ptr<fe_type> getVelocityFE(void) const {
+    return feVel_;
+  }
+
+  const ROL::Ptr<fe_type> getPressureFE(void) const {
+    return fePrs_;
+  }
+
+  const ROL::Ptr<fe_type> getThermalFE(void) const {
+    return feThr_;
+  }
+
+  const std::vector<std::vector<ROL::Ptr<fe_type>>> getVelocityBdryFE(void) const {
+    return feVelBdry_;
+  }
+
+  const std::vector<std::vector<ROL::Ptr<fe_type>>> getPressureBdryFE(void) const {
+    return fePrsBdry_;
+  }
+
+  const std::vector<std::vector<ROL::Ptr<fe_type>>> getThermalBdryFE(void) const {
+    return feThrBdry_;
+  }
+
+  const std::vector<std::vector<std::vector<int>>> getBdryCellLocIds(void) const {
+    return bdryCellLocIds_;
+  }
+
+  const ROL::Ptr<FieldHelper<Real,DeviceType>> getFieldHelper(void) const {
+    return fieldHelper_;
+  }
+
+}; // PDE_ThermalFluids
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/topo-opt/elasticity/CMakeLists.txt b/packages/rol/example/PDE-OPT/topo-opt/elasticity/CMakeLists.txt
index 55ae07c38807..5d43f0a087d7 100644
--- a/packages/rol/example/PDE-OPT/topo-opt/elasticity/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/topo-opt/elasticity/CMakeLists.txt
@@ -11,25 +11,25 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_01
     SOURCES example_01.cpp
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_02
     SOURCES example_02.cpp
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_03
     SOURCES example_03.cpp
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     TopoOptDataCopy
     SOURCE_FILES
       input_ex01.xml
@@ -50,5 +50,57 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
   )
 
+ENDIF()
+
+IF(${PROJECT_NAME}_ENABLE_Intrepid2 AND 
+   ${PROJECT_NAME}_ENABLE_Amesos2   AND
+   ${PROJECT_NAME}_ENABLE_Ifpack2   AND
+   ${PROJECT_NAME}_ENABLE_MueLu     AND
+   ${PROJECT_NAME}_ENABLE_Tpetra )
+
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/teuchos/src/sol)
+
+  # Need ROL_TpetraMultiVector.hpp
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
+
+  ROL_ADD_EXECUTABLE(
+    example_01_Kokkos
+    SOURCES example_01K.cpp
+    ADD_DIR_TO_NAME
+  )
+
+  ROL_ADD_EXECUTABLE(
+    example_02_Kokkos
+    SOURCES example_02K.cpp
+    ADD_DIR_TO_NAME
+  )
+
+  ROL_ADD_EXECUTABLE(
+    example_03_Kokkos
+    SOURCES example_03K.cpp
+    ADD_DIR_TO_NAME
+  )
+
+  ROL_COPY_FILES_TO_BINARY_DIR(
+    TopoOptDataCopyK
+    SOURCE_FILES
+      input_ex01.xml
+      input_ex02.xml
+      input_ex03.xml
+      plotresults.m
+      bwr.m
+      plotdensity.m
+      processpics.m
+      picme
+      process3D.sh
+      meshfiles/brick1.txt
+      meshfiles/brick1.e
+      meshfiles/brick2.txt
+      meshfiles/brick2.e
+      meshfiles/Lbeam.jou
+    SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}"
+    DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
+  )
 
 ENDIF()
diff --git a/packages/rol/example/PDE-OPT/topo-opt/elasticity/example_01.cpp b/packages/rol/example/PDE-OPT/topo-opt/elasticity/example_01.cpp
index 585a7b0da0c4..dbefc5d73d1f 100644
--- a/packages/rol/example/PDE-OPT/topo-opt/elasticity/example_01.cpp
+++ b/packages/rol/example/PDE-OPT/topo-opt/elasticity/example_01.cpp
@@ -15,8 +15,7 @@
 #include "Teuchos_Comm.hpp"
 #include "Teuchos_Time.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -52,7 +51,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int>> comm
     = Tpetra::getDefaultComm();
   const int myRank = comm->getRank();
diff --git a/packages/rol/example/PDE-OPT/topo-opt/elasticity/example_01K.cpp b/packages/rol/example/PDE-OPT/topo-opt/elasticity/example_01K.cpp
new file mode 100644
index 000000000000..5cee91ab2f1f
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/topo-opt/elasticity/example_01K.cpp
@@ -0,0 +1,369 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  example_01.cpp
+    \brief Shows how to solve the stuctural topology optimization problem.
+
+*/
+
+#include "Teuchos_Comm.hpp"
+#include "Teuchos_Time.hpp"
+#include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
+
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Version.hpp"
+
+#include <iostream>
+#include <algorithm>
+
+#include "ROL_Solver.hpp"
+#include "ROL_Bounds.hpp"
+#include "ROL_ConstraintFromObjective.hpp"
+#include "ROL_LinearCombinationObjective.hpp"
+
+#include "../../TOOLS/pdevectorK.hpp"
+#include "../../TOOLS/pdeobjectiveK.hpp"
+#include "../../TOOLS/meshreaderK.hpp"
+
+#include "src/mesh_topo-optK.hpp"
+#include "src/pde_elasticityK.hpp"
+
+#include "../src/pde_filterK.hpp"
+#include "../src/filtered_compliance_robjK.hpp"
+#include "../src/volume_conK.hpp"
+#include "../src/volume_objK.hpp"
+#include "../src/obj_volumeK.hpp"
+#include "../src/obj_phasefieldK.hpp"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+
+int main(int argc, char *argv[]) {
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+
+  /*** Initialize communicator. ***/
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  Kokkos::ScopeGuard kokkosScope (argc, argv);
+  ROL::Ptr<const Teuchos::Comm<int>> comm
+    = Tpetra::getDefaultComm();
+  const int myRank = comm->getRank();
+  if ((iprint > 0) && (myRank == 0))
+    outStream = ROL::makePtrFromRef(std::cout);
+  else
+    outStream = ROL::makePtrFromRef(bhs);
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+    RealT tol(1e-8);
+
+    /*** Read in XML input ***/
+    std::string filename = "input_ex01.xml";
+    auto parlist = ROL::getParametersFromXmlFile(filename);
+
+    // Retrieve parameters.
+    const std::string example = parlist->sublist("Problem").get("Example", "Default");
+    const RealT volFraction   = parlist->sublist("Problem").get("Volume Fraction", 0.4);
+    const RealT initDens      = parlist->sublist("Problem").get("Initial Density",volFraction);
+    const RealT cmpFactor     = parlist->sublist("Problem").get("Compliance Factor", 1.1);
+    int probDim               = parlist->sublist("Problem").get("Problem Dimension", 2);
+    const std::string minType = parlist->sublist("Problem").get("Minimization Type", "Volume");
+    const bool usePhaseField  = parlist->sublist("Problem").get("Use Phase Field", false);
+    bool useFilter            = parlist->sublist("Problem").get("Use Filter", true);
+    const bool normalizeObj   = parlist->sublist("Problem").get("Normalize Compliance", true);
+    const bool volEq          = parlist->sublist("Problem").get("Use Volume Equality Constraint",true);
+    std::string hessAppr      = parlist->sublist("Problem").get("Hessian Approximation","None");
+    hessAppr                  = (minType == "Compliance" ? hessAppr : "None");
+    if (example == "2D Wheel"                   ||
+        example == "2D Truss"                   ||
+        example == "2D Cantilever with 1 Load"  ||
+        example == "2D Cantilever with 3 Loads" ||
+        example == "2D Beams"                   ||
+        example == "2D Carrier Plate")
+      probDim = 2;
+    else if (example == "3D Cantilever" ||
+             example == "3D L Beam")
+      probDim = 3;
+    if (usePhaseField) useFilter = false;
+
+    /*** Initialize main data structure. ***/
+    TEUCHOS_TEST_FOR_EXCEPTION(probDim<2 || probDim>3, std::invalid_argument,
+      ">>> PDE-OPT/topo-opt/elasticity/example_01.cpp: Problem dim is not 2 or 3!");
+    ROL::Ptr<MeshManager<RealT,DeviceT>> meshMgr;
+    if (probDim == 2) meshMgr = ROL::makePtr<MeshManager_TopoOpt<RealT,DeviceT>>(*parlist);
+    else              meshMgr = ROL::makePtr<MeshReader<RealT,DeviceT>>(*parlist);
+    // Initialize PDE describing elasticity equations.
+    auto pde = ROL::makePtr<PDE_Elasticity<RealT,DeviceT>>(*parlist);
+
+    // Initialize the filter PDE.
+    ROL::Ptr<PDE<RealT,DeviceT>> pdeFilter = pde;
+
+    // Initialize reduced compliance objective function.
+    ROL::Ptr<ROL::Objective<RealT>> robj_com;
+    ROL::Ptr<Assembler<RealT,DeviceT>> assembler, assemblerFilter;
+    if (useFilter && !usePhaseField) {
+      pdeFilter = ROL::makePtr<PDE_Filter<RealT,DeviceT>>(*parlist);
+      pde->setDensityFields(pdeFilter->getFields());
+      robj_com = ROL::makePtr<TopOptFilteredComplianceObjective<RealT,DeviceT>>(
+                 pde,pdeFilter,meshMgr,comm,*parlist,*outStream);
+      assembler = ROL::dynamicPtrCast<TopOptFilteredComplianceObjective<RealT,DeviceT>>(robj_com)->getAssembler();
+      assemblerFilter = ROL::dynamicPtrCast<TopOptFilteredComplianceObjective<RealT,DeviceT>>(robj_com)->getFilterAssembler();
+    }
+    else {
+      robj_com = ROL::makePtr<TopOptComplianceObjective<RealT,DeviceT>>(
+                 pde,meshMgr,comm,*parlist,*outStream);
+      assembler = ROL::dynamicPtrCast<TopOptComplianceObjective<RealT,DeviceT>>(robj_com)->getAssembler();
+      assemblerFilter = assembler;
+    }
+
+    // Create vectors.
+    auto u_ptr = assembler->createStateVector();         u_ptr->putScalar(0.0);
+    auto p_ptr = assembler->createStateVector();         p_ptr->putScalar(0.0);
+    auto r_ptr = assembler->createResidualVector();      r_ptr->putScalar(0.0);
+    auto z_ptr = assemblerFilter->createControlVector(); z_ptr->putScalar(1.0);
+    auto up = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(u_ptr,pde,assembler,*parlist);
+    auto pp = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(p_ptr,pde,assembler,*parlist);
+    auto rp = ROL::makePtr<PDE_DualSimVector<RealT,DeviceT>>(r_ptr,pde,assembler,*parlist);
+    auto zp = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(z_ptr,pdeFilter,assemblerFilter,*parlist);
+
+    // Build volume objective function.
+    ROL::Ptr<QoI<RealT,DeviceT>> qoi_vol;
+    if (minType == "Compliance" && volEq) {
+      if (useFilter && !usePhaseField)
+        qoi_vol = ROL::makePtr<QoI_Volume_TopoOpt<RealT,DeviceT>>(
+                    ROL::dynamicPtrCast<PDE_Filter<RealT,DeviceT>>(pdeFilter)->getDensityFE(),volFraction);
+      else
+        qoi_vol = ROL::makePtr<QoI_Volume_TopoOpt<RealT,DeviceT>>(pde->getDensityFE(),volFraction);
+    }
+    else {
+      if (useFilter && !usePhaseField)
+        qoi_vol = ROL::makePtr<QoI_Volume_TopoOpt<RealT,DeviceT>>(ROL::dynamicPtrCast<PDE_Filter<RealT,DeviceT>>(pdeFilter)->getDensityFE());
+      else
+        qoi_vol = ROL::makePtr<QoI_Volume_TopoOpt<RealT,DeviceT>>(pde->getDensityFE());
+    }
+    auto obj_vol = ROL::makePtr<TopOptVolumeObjective<RealT,DeviceT>>(qoi_vol,assemblerFilter,zp);
+    auto con_vol = ROL::makePtr<TopOptVolumeConstraint<RealT,DeviceT>>(qoi_vol,assemblerFilter,zp);
+
+    // Compute volume
+    zp->setScalar(1.0);
+    RealT vol = obj_vol->value(*zp,tol);
+
+    // Normalize compliance objective function
+    RealT cs(1);
+    zp->setScalar(initDens);
+    if (normalizeObj) {
+      if (useFilter && !usePhaseField)
+        cs = ROL::dynamicPtrCast<TopOptFilteredComplianceObjective<RealT,DeviceT>>(robj_com)->normalize(*zp,tol);
+      else
+        cs = ROL::dynamicPtrCast<TopOptComplianceObjective<RealT,DeviceT>>(robj_com)->normalize(*zp,tol);
+    }
+
+    // Output problem details
+    *outStream << std::endl;
+    *outStream << "Problem Data"          << std::endl;
+    *outStream << "  Example:           " << example << std::endl;
+    *outStream << "  Dimension:         " << probDim << std::endl;
+    *outStream << "  Minimize Type:     " << minType << std::endl;
+    *outStream << "  Use Phase Field:   " << usePhaseField << std::endl;
+    if (!usePhaseField) {
+      *outStream << "  SIMP Power:        "
+                 << parlist->sublist("Problem").get("SIMP Power",3.0) << std::endl;
+      *outStream << "  Use Filter:        " << useFilter << std::endl;
+    }
+    if (minType == "Volume") {
+      *outStream << "  Compliance Factor: " << cmpFactor << std::endl;
+    }
+    else if (minType == "Compliance") {
+      *outStream << "  Volume Fraction:   " << volFraction << std::endl;
+      *outStream << "  Initial Density:   " << initDens    << std::endl;
+      *outStream << "  Use Equality:      " << volEq       << std::endl;
+      *outStream << "  Hessian Approx:    " << hessAppr    << std::endl;
+    }
+    *outStream << "  Domain Volume:     " << vol << std::endl;
+    *outStream << "  Compliance Scale:  " << cs  << std::endl;
+    *outStream << std::endl;
+
+    // Create objective, constraint, multiplier and bounds
+    ROL::Ptr<ROL::Objective<RealT>>       obj;
+    ROL::Ptr<ROL::Constraint<RealT>>      icon;
+    ROL::Ptr<ROL::Vector<RealT>>          iup, ilp;
+    ROL::Ptr<ROL::Vector<RealT>>          imul;
+    ROL::Ptr<ROL::BoundConstraint<RealT>> ibnd;
+    if (!usePhaseField) {
+      if (minType == "Volume") {
+        obj  = obj_vol;
+        icon = ROL::makePtr<ROL::ConstraintFromObjective<RealT>>(robj_com);
+        // Set upper bound to compliance for solid beam.
+        zp->setScalar(1.0);
+        RealT comp = robj_com->value(*zp,tol);
+        ilp  = ROL::makePtr<ROL::SingletonVector<RealT>>(0);
+        iup  = ROL::makePtr<ROL::SingletonVector<RealT>>(cmpFactor*comp);
+        imul = ROL::makePtr<ROL::SingletonVector<RealT>>(0);
+        ibnd = ROL::makePtr<ROL::Bounds<RealT>>(ilp,iup);
+      }
+      else if (minType == "Compliance") {
+        obj  = robj_com;
+        icon = con_vol;
+        imul = ROL::makePtr<ROL::SingletonVector<RealT>>(0);
+        if (volEq) {
+          ilp  = ROL::nullPtr;
+          iup  = ROL::nullPtr;
+          ibnd = ROL::nullPtr;
+        }
+        else {
+          // Set upper bound to fraction of total volume.
+          ilp  = ROL::makePtr<ROL::SingletonVector<RealT>>(0);
+          iup  = ROL::makePtr<ROL::SingletonVector<RealT>>(volFraction*vol);
+          ibnd = ROL::makePtr<ROL::Bounds<RealT>>(ilp,iup);
+        }
+      }
+      else if (minType == "Total") {
+        std::vector<ROL::Ptr<ROL::Objective<RealT>>> obj_vec(2,ROL::nullPtr);
+        obj_vec[0] = robj_com;
+        obj_vec[1] = obj_vol;
+        std::vector<RealT> weights(2,0.0);
+        weights[0] = static_cast<RealT>(1);
+        weights[1] = parlist->sublist("Problem").get("Volume Objective Scale",0.04096);
+        obj  = ROL::makePtr<ROL::LinearCombinationObjective<RealT>>(weights,obj_vec);
+        icon = ROL::nullPtr;
+        ilp  = ROL::nullPtr;
+        iup  = ROL::nullPtr;
+        imul = ROL::nullPtr;
+        ibnd = ROL::nullPtr;
+      }
+      else {
+        throw ROL::Exception::NotImplemented("Unknown minimization type!");
+      }
+    }
+    else {
+      // Build Modica-Mortola Energy objective function.
+      RealT penParam = parlist->sublist("Problem").get("Phase Field Penalty Parameter",1e-1);
+      ROL::Ptr<QoI<RealT,DeviceT>>
+        qoi_pfe = ROL::makePtr<QoI_ModicaMortolaEnergy_PhaseField<RealT,DeviceT>>(pde->getDensityFE(),
+                                                                          penParam);
+      ROL::Ptr<IntegralOptObjective<RealT,DeviceT>>
+        obj_pfe = ROL::makePtr<IntegralOptObjective<RealT,DeviceT>>(qoi_pfe,assembler);
+      // Get weights for linear combination objective.
+      std::vector<RealT> weights(1,0.0);
+      weights[0] = parlist->sublist("Problem").get("Phase Field Energy Objective Scale",0.00064);
+      std::vector<ROL::Ptr<ROL::Objective<RealT>>> obj_vec(1,ROL::nullPtr);
+      obj_vec[0] = obj_pfe;
+      if (minType == "Volume") {
+        weights.push_back(parlist->sublist("Problem").get("Volume Objective Scale",0.04096));
+        obj_vec.push_back(obj_vol);
+        obj  = ROL::makePtr<ROL::LinearCombinationObjective<RealT>>(weights,obj_vec);
+        icon = ROL::makePtr<ROL::ConstraintFromObjective<RealT>>(robj_com);
+        // Set upper bound to compliance for solid beam.
+        RealT comp = robj_com->value(*zp,tol);
+        ilp  = ROL::makePtr<ROL::SingletonVector<RealT>>(0);
+        iup  = ROL::makePtr<ROL::SingletonVector<RealT>>(cmpFactor*comp);
+        imul = ROL::makePtr<ROL::SingletonVector<RealT>>(0);
+        ibnd = ROL::makePtr<ROL::Bounds<RealT>>(ilp,iup);
+      }
+      else if (minType == "Compliance") {
+        weights.push_back(static_cast<RealT>(1));
+        obj_vec.push_back(robj_com);
+        obj  = ROL::makePtr<ROL::LinearCombinationObjective<RealT>>(weights,obj_vec);
+        icon = con_vol;
+        imul = ROL::makePtr<ROL::SingletonVector<RealT>>(0);
+        if (volEq) {
+          ilp  = ROL::nullPtr;
+          iup  = ROL::nullPtr;
+          ibnd = ROL::nullPtr;
+        }
+        else {
+          // Set upper bound to fraction of total volume.
+          ilp  = ROL::makePtr<ROL::SingletonVector<RealT>>(0);
+          iup  = ROL::makePtr<ROL::SingletonVector<RealT>>(volFraction*vol);
+          ibnd = ROL::makePtr<ROL::Bounds<RealT>>(ilp,iup);
+        }
+      }
+      else if (minType == "Total") {
+        weights.push_back(static_cast<RealT>(1));
+        obj_vec.push_back(robj_com);
+        weights.push_back(parlist->sublist("Problem").get("Volume Objective Scale",0.04096));
+        obj_vec.push_back(obj_vol);
+        obj  = ROL::makePtr<ROL::LinearCombinationObjective<RealT>>(weights,obj_vec);
+        icon = ROL::nullPtr;
+        ilp  = ROL::nullPtr;
+        iup  = ROL::nullPtr;
+        imul = ROL::nullPtr;
+        ibnd = ROL::nullPtr;
+      }
+      else {
+        throw ROL::Exception::NotImplemented("Unknown minimization type!");
+      }
+    }
+
+    // Initialize bound constraints.
+    RealT lval = (usePhaseField ? -1.0 : 0.0), uval = 1.0;
+    ROL::Ptr<ROL::Vector<RealT>> lop = zp->clone(); lop->setScalar(lval);
+    ROL::Ptr<ROL::Vector<RealT>> hip = zp->clone(); hip->setScalar(uval);
+    ROL::Ptr<ROL::BoundConstraint<RealT>>
+      bnd = ROL::makePtr<ROL::Bounds<RealT>>(lop,hip);
+    if (usePhaseField) bnd->deactivate();
+
+    // Set up optimization problem.
+    ROL::Ptr<ROL::Problem<RealT>>
+      prob = ROL::makePtr<ROL::Problem<RealT>>(obj,zp);
+    if (bnd->isActivated()) prob->addBoundConstraint(bnd);
+    if ( minType == "Compliance" ) {
+      if ( volEq ) prob->addLinearConstraint("Volume",icon,imul);
+      else         prob->addLinearConstraint("Volume",icon,imul,ibnd);
+    }
+    else if ( minType == "Volume" )
+      prob->addConstraint("Compliance",icon,imul,ibnd);
+
+    bool expLin = parlist->sublist("Problem").get("Project Linear Constraints",true);
+    prob->setProjectionAlgorithm(*parlist);
+    prob->finalize(!expLin,true,*outStream);
+
+    // Check derivatives.
+    bool derivCheck = parlist->sublist("Problem").get("Check derivatives",false);
+    if (derivCheck) prob->check(true,*outStream);
+
+    // Solve optimization problem.
+    Teuchos::Time algoTimer("Algorithm Time", true);
+    ROL::Solver<RealT> solver(prob,*parlist);
+    solver.solve(*outStream);
+    algoTimer.stop();
+    *outStream << "Total optimization time = " << algoTimer.totalElapsedTime() << " seconds.\n";
+    // Output.
+    if (useFilter && !usePhaseField) {
+      ROL::dynamicPtrCast<TopOptFilteredComplianceObjective<RealT,DeviceT>>(robj_com)->summarize(*outStream);
+      ROL::dynamicPtrCast<TopOptFilteredComplianceObjective<RealT,DeviceT>>(robj_com)->printToFile(*zp,*outStream);
+    }
+    else {
+      ROL::dynamicPtrCast<TopOptComplianceObjective<RealT,DeviceT>>(robj_com)->summarize(*outStream);
+      ROL::dynamicPtrCast<TopOptComplianceObjective<RealT,DeviceT>>(robj_com)->printToFile(*zp,*outStream);
+    }
+    if (minType == "Compliance")
+      con_vol->summarize(*outStream);
+    else if (minType == "Volume")
+      obj_vol->summarize(*outStream);
+
+    // Get a summary from the time monitor.
+    Teuchos::TimeMonitor::summarize();
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/topo-opt/elasticity/example_02.cpp b/packages/rol/example/PDE-OPT/topo-opt/elasticity/example_02.cpp
index 1bdb7d0dfac2..89a2d192eea6 100644
--- a/packages/rol/example/PDE-OPT/topo-opt/elasticity/example_02.cpp
+++ b/packages/rol/example/PDE-OPT/topo-opt/elasticity/example_02.cpp
@@ -14,8 +14,7 @@
 #include "Teuchos_Comm.hpp"
 #include "Teuchos_Time.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -55,7 +54,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int> > comm
     = Tpetra::getDefaultComm();
   ROL::Ptr<const Teuchos::Comm<int> > serial_comm
diff --git a/packages/rol/example/PDE-OPT/topo-opt/elasticity/example_02K.cpp b/packages/rol/example/PDE-OPT/topo-opt/elasticity/example_02K.cpp
new file mode 100644
index 000000000000..cd292c57725a
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/topo-opt/elasticity/example_02K.cpp
@@ -0,0 +1,406 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  example_02.cpp
+    \brief Shows how to solve the stochastic stuctural topology optimization problem.
+*/
+
+#include "Teuchos_Comm.hpp"
+#include "Teuchos_Time.hpp"
+#include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
+
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Version.hpp"
+
+#include <iostream>
+#include <algorithm>
+
+#include "ROL_Solver.hpp"
+#include "ROL_StochasticProblem.hpp"
+#include "ROL_Bounds.hpp"
+#include "ROL_LinearCombinationObjective.hpp"
+#include "ROL_TpetraTeuchosBatchManager.hpp"
+#include "ROL_SingletonTeuchosBatchManager.hpp"
+
+#include "../../TOOLS/pdeobjectiveK.hpp"
+#include "../../TOOLS/pdevectorK.hpp"
+#include "../../TOOLS/meshreaderK.hpp"
+
+#include "src/mesh_topo-optK.hpp"
+#include "src/pde_elasticityK.hpp"
+#include "src/sampler.hpp"
+
+#include "../src/pde_filterK.hpp"
+#include "../src/filtered_compliance_robjK.hpp"
+#include "../src/volume_conK.hpp"
+#include "../src/volume_objK.hpp"
+#include "../src/obj_volumeK.hpp"
+#include "../src/obj_phasefieldK.hpp"
+#include "../src/printCDF.hpp"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+
+int main(int argc, char *argv[]) {
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+
+  /*** Initialize communicator. ***/
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  Kokkos::ScopeGuard kokkosScope (argc, argv);
+  ROL::Ptr<const Teuchos::Comm<int> > comm
+    = Tpetra::getDefaultComm();
+  ROL::Ptr<const Teuchos::Comm<int> > serial_comm
+    = ROL::makePtr<Teuchos::SerialComm<int>>();
+  const int myRank = comm->getRank();
+  if ((iprint > 0) && (myRank == 0))
+    outStream = ROL::makePtrFromRef(std::cout);
+  else
+    outStream = ROL::makePtrFromRef(bhs);
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+    RealT tol(1e-8);
+
+    /*** Read in XML input ***/
+    std::string filename = "input_ex02.xml";
+    auto parlist = ROL::getParametersFromXmlFile(filename);
+
+    // Retrieve parameters.
+    const std::string example = parlist->sublist("Problem").get("Example", "Default");
+    const RealT volFraction   = parlist->sublist("Problem").get("Volume Fraction", 0.4);
+    const RealT initDens      = parlist->sublist("Problem").get("Initial Density",volFraction);
+    const RealT cmpFactor     = parlist->sublist("Problem").get("Compliance Factor", 1.1);
+    int probDim               = parlist->sublist("Problem").get("Problem Dimension", 2);
+    const std::string minType = parlist->sublist("Problem").get("Minimization Type", "Volume");
+    const bool usePhaseField  = parlist->sublist("Problem").get("Use Phase Field", false);
+    bool useFilter            = parlist->sublist("Problem").get("Use Filter", true);
+    const bool normalizeObj   = parlist->sublist("Problem").get("Normalize Compliance", true);
+    const bool volEq          = parlist->sublist("Problem").get("Use Volume Equality Constraint",true);
+    std::string hessAppr      = parlist->sublist("Problem").get("Hessian Approximation","None");
+    hessAppr                  = (minType == "Compliance" ? hessAppr : "None");
+    if (example == "2D Wheel"                   ||
+        example == "2D Truss"                   ||
+        example == "2D Cantilever with 1 Load"  ||
+        example == "2D Cantilever with 3 Loads" ||
+        example == "2D Beams"                   ||
+        example == "2D Carrier Plate")
+      probDim = 2;
+    else if (example == "3D Cantilever" ||
+             example == "3D L Beam")
+      probDim = 3;
+    if (usePhaseField) useFilter = false;
+
+    /*** Initialize main data structure. ***/
+    TEUCHOS_TEST_FOR_EXCEPTION(probDim<2 || probDim>3, std::invalid_argument,
+      ">>> PDE-OPT/topo-opt/elasticity/example_01.cpp: Problem dim is not 2 or 3!");
+    ROL::Ptr<MeshManager<RealT,DeviceT>> meshMgr;
+    if (probDim == 2) meshMgr = ROL::makePtr<MeshManager_TopoOpt<RealT,DeviceT>>(*parlist);
+    else              meshMgr = ROL::makePtr<MeshReader<RealT,DeviceT>>(*parlist);
+    // Initialize PDE describing elasticity equations.
+    auto pde = ROL::makePtr<PDE_Elasticity<RealT,DeviceT>>(*parlist);
+
+    // Initialize the filter PDE.
+    ROL::Ptr<PDE<RealT,DeviceT>> pdeFilter = pde;
+
+    // Initialize reduced compliance objective function.
+    ROL::Ptr<ROL::Objective<RealT>> robj_com;
+    ROL::Ptr<Assembler<RealT,DeviceT>> assembler, assemblerFilter;
+    if (useFilter && !usePhaseField) {
+      pdeFilter = ROL::makePtr<PDE_Filter<RealT,DeviceT>>(*parlist);
+      pde->setDensityFields(pdeFilter->getFields());
+      robj_com = ROL::makePtr<TopOptFilteredComplianceObjective<RealT,DeviceT>>(
+                 pde,pdeFilter,meshMgr,comm,*parlist,*outStream);
+      assembler = ROL::dynamicPtrCast<TopOptFilteredComplianceObjective<RealT,DeviceT>>(robj_com)->getAssembler();
+      assemblerFilter = ROL::dynamicPtrCast<TopOptFilteredComplianceObjective<RealT,DeviceT>>(robj_com)->getFilterAssembler();
+    }
+    else {
+      robj_com = ROL::makePtr<TopOptComplianceObjective<RealT,DeviceT>>(
+                 pde,meshMgr,comm,*parlist,*outStream);
+      assembler = ROL::dynamicPtrCast<TopOptComplianceObjective<RealT,DeviceT>>(robj_com)->getAssembler();
+      assemblerFilter = assembler;
+    }
+
+    // Create vectors.
+    auto u_ptr = assembler->createStateVector();         u_ptr->putScalar(0.0);
+    auto p_ptr = assembler->createStateVector();         p_ptr->putScalar(0.0);
+    auto r_ptr = assembler->createResidualVector();      r_ptr->putScalar(0.0);
+    auto z_ptr = assemblerFilter->createControlVector(); z_ptr->putScalar(1.0);
+    auto up = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(u_ptr,pde,assembler,*parlist);
+    auto pp = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(p_ptr,pde,assembler,*parlist);
+    auto rp = ROL::makePtr<PDE_DualSimVector<RealT,DeviceT>>(r_ptr,pde,assembler,*parlist);
+    auto zp = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(z_ptr,pdeFilter,assemblerFilter,*parlist);
+
+    // Build volume objective function.
+    ROL::Ptr<QoI<RealT,DeviceT>> qoi_vol;
+    if (minType == "Compliance" && volEq) {
+      if (useFilter && !usePhaseField)
+        qoi_vol = ROL::makePtr<QoI_Volume_TopoOpt<RealT,DeviceT>>(
+                    ROL::dynamicPtrCast<PDE_Filter<RealT,DeviceT>>(pdeFilter)->getDensityFE(),volFraction);
+      else
+        qoi_vol = ROL::makePtr<QoI_Volume_TopoOpt<RealT,DeviceT>>(pde->getDensityFE(),volFraction);
+    }
+    else {
+      if (useFilter && !usePhaseField)
+        qoi_vol = ROL::makePtr<QoI_Volume_TopoOpt<RealT,DeviceT>>(ROL::dynamicPtrCast<PDE_Filter<RealT,DeviceT>>(pdeFilter)->getDensityFE());
+      else
+        qoi_vol = ROL::makePtr<QoI_Volume_TopoOpt<RealT,DeviceT>>(pde->getDensityFE());
+    }
+    auto obj_vol = ROL::makePtr<TopOptVolumeObjective<RealT,DeviceT>>(qoi_vol,assemblerFilter,zp);
+    auto con_vol = ROL::makePtr<TopOptVolumeConstraint<RealT,DeviceT>>(qoi_vol,assemblerFilter,zp);
+
+    // Compute volume
+    zp->setScalar(1.0);
+    RealT vol = obj_vol->value(*zp,tol);
+
+    // Normalize compliance objective function
+    RealT cs(1);
+    zp->setScalar(initDens);
+    if (normalizeObj) {
+      if (useFilter && !usePhaseField)
+        cs = ROL::dynamicPtrCast<TopOptFilteredComplianceObjective<RealT,DeviceT>>(robj_com)->normalize(*zp,tol);
+      else
+        cs = ROL::dynamicPtrCast<TopOptComplianceObjective<RealT,DeviceT>>(robj_com)->normalize(*zp,tol);
+    }
+
+    // Output problem details
+    *outStream << std::endl;
+    *outStream << "Problem Data"          << std::endl;
+    *outStream << "  Example:           " << example << std::endl;
+    *outStream << "  Dimension:         " << probDim << std::endl;
+    *outStream << "  Minimize Type:     " << minType << std::endl;
+    *outStream << "  Use Phase Field:   " << usePhaseField << std::endl;
+    if (!usePhaseField) {
+      *outStream << "  SIMP Power:        "
+                 << parlist->sublist("Problem").get("SIMP Power",3.0) << std::endl;
+      *outStream << "  Use Filter:        " << useFilter << std::endl;
+    }
+    if (minType == "Volume") {
+      *outStream << "  Compliance Factor: " << cmpFactor << std::endl;
+    }
+    else if (minType == "Compliance") {
+      *outStream << "  Volume Fraction:   " << volFraction << std::endl;
+      *outStream << "  Initial Density:   " << initDens    << std::endl;
+      *outStream << "  Use Equality:      " << volEq       << std::endl;
+      *outStream << "  Hessian Approx:    " << hessAppr    << std::endl;
+    }
+    *outStream << "  Domain Volume:     " << vol << std::endl;
+    *outStream << "  Compliance Scale:  " << cs  << std::endl;
+    *outStream << std::endl;
+
+    // Create objective, constraint, multiplier and bounds
+    ROL::Ptr<ROL::Objective<RealT>>       obj;
+    ROL::Ptr<ROL::Constraint<RealT>>      icon;
+    ROL::Ptr<ROL::Vector<RealT>>          iup, ilp;
+    ROL::Ptr<ROL::Vector<RealT>>          imul;
+    ROL::Ptr<ROL::BoundConstraint<RealT>> ibnd;
+    if (!usePhaseField) {
+      if (minType == "Volume") {
+        obj  = obj_vol;
+        icon = ROL::makePtr<ROL::ConstraintFromObjective<RealT>>(robj_com);
+        // Set upper bound to compliance for solid beam.
+        zp->setScalar(1.0);
+        RealT comp = robj_com->value(*zp,tol);
+        ilp  = ROL::makePtr<ROL::SingletonVector<RealT>>(0);
+        iup  = ROL::makePtr<ROL::SingletonVector<RealT>>(cmpFactor*comp);
+        imul = ROL::makePtr<ROL::SingletonVector<RealT>>(0);
+        ibnd = ROL::makePtr<ROL::Bounds<RealT>>(ilp,iup);
+      }
+      else if (minType == "Compliance") {
+        obj  = robj_com;
+        icon = con_vol;
+        imul = ROL::makePtr<ROL::SingletonVector<RealT>>(0);
+        if (volEq) {
+          ilp  = ROL::nullPtr;
+          iup  = ROL::nullPtr;
+          ibnd = ROL::nullPtr;
+        }
+        else {
+          // Set upper bound to fraction of total volume.
+          ilp  = ROL::makePtr<ROL::SingletonVector<RealT>>(0);
+          iup  = ROL::makePtr<ROL::SingletonVector<RealT>>(volFraction*vol);
+          ibnd = ROL::makePtr<ROL::Bounds<RealT>>(ilp,iup);
+        }
+      }
+      else if (minType == "Total") {
+        std::vector<ROL::Ptr<ROL::Objective<RealT>>> obj_vec(2,ROL::nullPtr);
+        obj_vec[0] = robj_com;
+        obj_vec[1] = obj_vol;
+        std::vector<RealT> weights(2,0.0);
+        weights[0] = static_cast<RealT>(1);
+        weights[1] = parlist->sublist("Problem").get("Volume Objective Scale",0.04096);
+        obj  = ROL::makePtr<ROL::LinearCombinationObjective<RealT>>(weights,obj_vec);
+        icon = ROL::nullPtr;
+        ilp  = ROL::nullPtr;
+        iup  = ROL::nullPtr;
+        imul = ROL::nullPtr;
+        ibnd = ROL::nullPtr;
+      }
+      else {
+        throw ROL::Exception::NotImplemented("Unknown minimization type!");
+      }
+    }
+    else {
+      // Build Modica-Mortola Energy objective function.
+      RealT penParam = parlist->sublist("Problem").get("Phase Field Penalty Parameter",1e-1);
+      ROL::Ptr<QoI<RealT,DeviceT>>
+        qoi_pfe = ROL::makePtr<QoI_ModicaMortolaEnergy_PhaseField<RealT,DeviceT>>(pde->getDensityFE(),
+                                                                          penParam);
+      ROL::Ptr<IntegralOptObjective<RealT,DeviceT>>
+        obj_pfe = ROL::makePtr<IntegralOptObjective<RealT,DeviceT>>(qoi_pfe,assembler);
+      // Get weights for linear combination objective.
+      std::vector<RealT> weights(1,0.0);
+      weights[0] = parlist->sublist("Problem").get("Phase Field Energy Objective Scale",0.00064);
+      std::vector<ROL::Ptr<ROL::Objective<RealT>>> obj_vec(1,ROL::nullPtr);
+      obj_vec[0] = obj_pfe;
+      if (minType == "Volume") {
+        weights.push_back(parlist->sublist("Problem").get("Volume Objective Scale",0.04096));
+        obj_vec.push_back(obj_vol);
+        obj  = ROL::makePtr<ROL::LinearCombinationObjective<RealT>>(weights,obj_vec);
+        icon = ROL::makePtr<ROL::ConstraintFromObjective<RealT>>(robj_com);
+        // Set upper bound to compliance for solid beam.
+        RealT comp = robj_com->value(*zp,tol);
+        ilp  = ROL::makePtr<ROL::SingletonVector<RealT>>(0);
+        iup  = ROL::makePtr<ROL::SingletonVector<RealT>>(cmpFactor*comp);
+        imul = ROL::makePtr<ROL::SingletonVector<RealT>>(0);
+        ibnd = ROL::makePtr<ROL::Bounds<RealT>>(ilp,iup);
+      }
+      else if (minType == "Compliance") {
+        weights.push_back(static_cast<RealT>(1));
+        obj_vec.push_back(robj_com);
+        obj  = ROL::makePtr<ROL::LinearCombinationObjective<RealT>>(weights,obj_vec);
+        icon = con_vol;
+        imul = ROL::makePtr<ROL::SingletonVector<RealT>>(0);
+        if (volEq) {
+          ilp  = ROL::nullPtr;
+          iup  = ROL::nullPtr;
+          ibnd = ROL::nullPtr;
+        }
+        else {
+          // Set upper bound to fraction of total volume.
+          ilp  = ROL::makePtr<ROL::SingletonVector<RealT>>(0);
+          iup  = ROL::makePtr<ROL::SingletonVector<RealT>>(volFraction*vol);
+          ibnd = ROL::makePtr<ROL::Bounds<RealT>>(ilp,iup);
+        }
+      }
+      else if (minType == "Total") {
+        weights.push_back(static_cast<RealT>(1));
+        obj_vec.push_back(robj_com);
+        weights.push_back(parlist->sublist("Problem").get("Volume Objective Scale",0.04096));
+        obj_vec.push_back(obj_vol);
+        obj  = ROL::makePtr<ROL::LinearCombinationObjective<RealT>>(weights,obj_vec);
+        icon = ROL::nullPtr;
+        ilp  = ROL::nullPtr;
+        iup  = ROL::nullPtr;
+        imul = ROL::nullPtr;
+        ibnd = ROL::nullPtr;
+      }
+      else {
+        throw ROL::Exception::NotImplemented("Unknown minimization type!");
+      }
+    }
+
+    // Initialize bound constraints.
+    RealT lval = (usePhaseField ? -1.0 : 0.0), uval = 1.0;
+    ROL::Ptr<ROL::Vector<RealT>> lop = zp->clone(); lop->setScalar(lval);
+    ROL::Ptr<ROL::Vector<RealT>> hip = zp->clone(); hip->setScalar(uval);
+    ROL::Ptr<ROL::BoundConstraint<RealT>>
+      bnd = ROL::makePtr<ROL::Bounds<RealT>>(lop,hip);
+    if (usePhaseField) bnd->deactivate();
+
+    // Build sampler.
+    BuildSampler<RealT> buildSampler(parlist->sublist("Problem"),probDim,example);
+    //int stochDim = buildSampler.getDimension();
+    int nsamp = parlist->sublist("Problem").get("Number of samples", 1);
+    ROL::Ptr<ROL::BatchManager<RealT>>
+      bman = ROL::makePtr<ROL::TpetraTeuchosBatchManager<RealT>>(comm);
+    ROL::Ptr<ROL::SampleGenerator<RealT>>
+      sampler = buildSampler.get(nsamp,bman);
+
+    // Set up optimization problem.
+    ROL::Ptr<ROL::StochasticProblem<RealT>>
+      prob = ROL::makePtr<ROL::StochasticProblem<RealT>>(obj,zp);
+    if (bnd->isActivated()) prob->addBoundConstraint(bnd);
+    if ( minType == "Compliance" ) {
+      if ( volEq ) prob->addLinearConstraint("Volume",icon,imul);
+      else         prob->addLinearConstraint("Volume",icon,imul,ibnd);
+    }
+    else if ( minType == "Volume" )
+      prob->addConstraint("Compliance",icon,imul,ibnd);
+    if ( minType == "Volume" ) {
+      ROL::Ptr<ROL::BatchManager<RealT>>
+        cbman = ROL::makePtr<ROL::SingletonTeuchosBatchManager<RealT,int>>(comm);
+      parlist->sublist("SOL").sublist("Compliance") = parlist->sublist("SOL");
+      prob->makeConstraintStochastic("Compliance",*parlist,sampler,cbman);
+    }
+    else {
+      parlist->sublist("SOL").sublist("Objective") = parlist->sublist("SOL");
+      prob->makeObjectiveStochastic(*parlist,sampler);
+    }
+    prob->setProjectionAlgorithm(*parlist);
+    prob->finalize(false,true,*outStream);
+
+    // Check derivatives.
+    bool derivCheck = parlist->sublist("Problem").get("Check derivatives",false);
+    if (derivCheck) prob->check(true,*outStream);
+
+    // Solve optimization problem.
+    Teuchos::Time algoTimer("Algorithm Time", true);
+    ROL::Solver<RealT> solver(prob,*parlist);
+    solver.solve(*outStream);
+    algoTimer.stop();
+    *outStream << "Total optimization time = " << algoTimer.totalElapsedTime() << " seconds.\n";
+
+    // Output.
+    if (useFilter && !usePhaseField)
+      ROL::dynamicPtrCast<TopOptFilteredComplianceObjective<RealT,DeviceT>>(robj_com)->summarize(*outStream);
+    else
+      ROL::dynamicPtrCast<TopOptComplianceObjective<RealT,DeviceT>>(robj_com)->summarize(*outStream);
+    if (minType == "Compliance") con_vol->summarize(*outStream);
+    else                         obj_vol->summarize(*outStream);
+    assembler->printMeshData(*outStream);
+    up->zero(); pp->zero();
+    for (int i = 0; i < sampler->numMySamples(); ++i) {
+      robj_com->setParameter(sampler->getMyPoint(i));
+      if (useFilter && !usePhaseField)
+        ROL::dynamicPtrCast<TopOptFilteredComplianceObjective<RealT,DeviceT>>(robj_com)->solveState(*up,*zp);
+      else
+        ROL::dynamicPtrCast<TopOptComplianceObjective<RealT,DeviceT>>(robj_com)->solveState(*up,*zp);
+      pp->axpy(sampler->getMyWeight(i),*up);
+    }
+    sampler->sumAll(*pp,*up);
+    assembler->outputTpetraVector(u_ptr,"mean_state.txt");
+    assembler->outputTpetraVector(z_ptr,"density.txt");
+    int nsamp_dist = parlist->sublist("Problem").get("Number of output samples",1000);
+    ROL::Ptr<ROL::SampleGenerator<RealT>>
+      sampler_dist = buildSampler.get(nsamp_dist,bman);
+    if (minType != "Volume")
+      TopOptPrintCDF<RealT>(*obj,*zp,*sampler_dist,nsamp_dist,comm,"obj_samples.txt");
+    else
+      TopOptPrintCDF<RealT>(*robj_com,*zp,*sampler_dist,nsamp_dist,comm,"obj_samples.txt");
+
+    // Get a summary from the time monitor.
+    Teuchos::TimeMonitor::summarize();
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/topo-opt/elasticity/example_03.cpp b/packages/rol/example/PDE-OPT/topo-opt/elasticity/example_03.cpp
index 616554eb9fca..fc03dbd24777 100644
--- a/packages/rol/example/PDE-OPT/topo-opt/elasticity/example_03.cpp
+++ b/packages/rol/example/PDE-OPT/topo-opt/elasticity/example_03.cpp
@@ -16,8 +16,7 @@
 #include "Teuchos_Comm.hpp"
 #include "Teuchos_Time.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -57,7 +56,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int> > comm
     = Tpetra::getDefaultComm();
   ROL::Ptr<const Teuchos::Comm<int> > serial_comm
diff --git a/packages/rol/example/PDE-OPT/topo-opt/elasticity/example_03K.cpp b/packages/rol/example/PDE-OPT/topo-opt/elasticity/example_03K.cpp
new file mode 100644
index 000000000000..62ae9e7773d6
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/topo-opt/elasticity/example_03K.cpp
@@ -0,0 +1,441 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  example_03.cpp
+    \brief Solve the stochastic stuctural topology optimization problem
+           using Mean-Plus-CVaR for a variety of confidence levels and
+           convex-combination parameters.
+*/
+
+#include "Teuchos_Comm.hpp"
+#include "Teuchos_Time.hpp"
+#include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
+
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Version.hpp"
+
+#include <iostream>
+#include <algorithm>
+
+#include "ROL_Solver.hpp"
+#include "ROL_StochasticProblem.hpp"
+#include "ROL_Bounds.hpp"
+#include "ROL_LinearCombinationObjective.hpp"
+#include "ROL_TpetraTeuchosBatchManager.hpp"
+#include "ROL_SingletonTeuchosBatchManager.hpp"
+
+#include "../../TOOLS/pdeobjectiveK.hpp"
+#include "../../TOOLS/pdevectorK.hpp"
+#include "../../TOOLS/meshreaderK.hpp"
+
+#include "src/mesh_topo-optK.hpp"
+#include "src/pde_elasticityK.hpp"
+#include "src/sampler.hpp"
+
+#include "../src/pde_filterK.hpp"
+#include "../src/filtered_compliance_robjK.hpp"
+#include "../src/volume_conK.hpp"
+#include "../src/volume_objK.hpp"
+#include "../src/obj_volumeK.hpp"
+#include "../src/obj_phasefieldK.hpp"
+#include "../src/printCDF.hpp"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+
+int main(int argc, char *argv[]) {
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+
+  /*** Initialize communicator. ***/
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  Kokkos::ScopeGuard kokkosScope (argc, argv);
+  ROL::Ptr<const Teuchos::Comm<int> > comm
+    = Tpetra::getDefaultComm();
+  ROL::Ptr<const Teuchos::Comm<int> > serial_comm
+    = ROL::makePtr<Teuchos::SerialComm<int>>();
+  const int myRank = comm->getRank();
+  if ((iprint > 0) && (myRank == 0))
+    outStream = ROL::makePtrFromRef(std::cout);
+  else
+    outStream = ROL::makePtrFromRef(bhs);
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+    RealT tol(1e-8);
+
+    /*** Read in XML input ***/
+    std::string filename = "input_ex03.xml";
+    auto parlist = ROL::getParametersFromXmlFile(filename);
+
+    // Retrieve parameters.
+    const std::string example = parlist->sublist("Problem").get("Example", "Default");
+    const RealT volFraction   = parlist->sublist("Problem").get("Volume Fraction", 0.4);
+    const RealT initDens      = parlist->sublist("Problem").get("Initial Density",volFraction);
+    const RealT cmpFactor     = parlist->sublist("Problem").get("Compliance Factor", 1.1);
+    int probDim               = parlist->sublist("Problem").get("Problem Dimension", 2);
+    const std::string minType = parlist->sublist("Problem").get("Minimization Type", "Volume");
+    const bool usePhaseField  = parlist->sublist("Problem").get("Use Phase Field", false);
+    bool useFilter            = parlist->sublist("Problem").get("Use Filter", true);
+    const bool normalizeObj   = parlist->sublist("Problem").get("Normalize Compliance", true);
+    const bool volEq          = parlist->sublist("Problem").get("Use Volume Equality Constraint",true);
+    std::string hessAppr      = parlist->sublist("Problem").get("Hessian Approximation","None");
+    hessAppr                  = (minType == "Compliance" ? hessAppr : "None");
+    if (example == "2D Wheel"                   ||
+        example == "2D Truss"                   ||
+        example == "2D Cantilever with 1 Load"  ||
+        example == "2D Cantilever with 3 Loads" ||
+        example == "2D Beams"                   ||
+        example == "2D Carrier Plate")
+      probDim = 2;
+    else if (example == "3D Cantilever" ||
+             example == "3D L Beam")
+      probDim = 3;
+    if (usePhaseField) useFilter = false;
+
+    /*** Initialize main data structure. ***/
+    TEUCHOS_TEST_FOR_EXCEPTION(probDim<2 || probDim>3, std::invalid_argument,
+      ">>> PDE-OPT/topo-opt/elasticity/example_01.cpp: Problem dim is not 2 or 3!");
+    ROL::Ptr<MeshManager<RealT,DeviceT>> meshMgr;
+    if (probDim == 2) meshMgr = ROL::makePtr<MeshManager_TopoOpt<RealT,DeviceT>>(*parlist);
+    else              meshMgr = ROL::makePtr<MeshReader<RealT,DeviceT>>(*parlist);
+    // Initialize PDE describing elasticity equations.
+    auto pde = ROL::makePtr<PDE_Elasticity<RealT,DeviceT>>(*parlist);
+
+    // Initialize the filter PDE.
+    ROL::Ptr<PDE<RealT,DeviceT>> pdeFilter = pde;
+
+    // Initialize reduced compliance objective function.
+    ROL::Ptr<ROL::Objective<RealT>> robj_com;
+    ROL::Ptr<Assembler<RealT,DeviceT>> assembler, assemblerFilter;
+    if (useFilter && !usePhaseField) {
+      pdeFilter = ROL::makePtr<PDE_Filter<RealT,DeviceT>>(*parlist);
+      pde->setDensityFields(pdeFilter->getFields());
+      robj_com = ROL::makePtr<TopOptFilteredComplianceObjective<RealT,DeviceT>>(
+                 pde,pdeFilter,meshMgr,comm,*parlist,*outStream);
+      assembler = ROL::dynamicPtrCast<TopOptFilteredComplianceObjective<RealT,DeviceT>>(robj_com)->getAssembler();
+      assemblerFilter = ROL::dynamicPtrCast<TopOptFilteredComplianceObjective<RealT,DeviceT>>(robj_com)->getFilterAssembler();
+    }
+    else {
+      robj_com = ROL::makePtr<TopOptComplianceObjective<RealT,DeviceT>>(
+                 pde,meshMgr,comm,*parlist,*outStream);
+      assembler = ROL::dynamicPtrCast<TopOptComplianceObjective<RealT,DeviceT>>(robj_com)->getAssembler();
+      assemblerFilter = assembler;
+    }
+
+    // Create vectors.
+    auto u_ptr = assembler->createStateVector();         u_ptr->putScalar(0.0);
+    auto p_ptr = assembler->createStateVector();         p_ptr->putScalar(0.0);
+    auto r_ptr = assembler->createResidualVector();      r_ptr->putScalar(0.0);
+    auto z_ptr = assemblerFilter->createControlVector(); z_ptr->putScalar(1.0);
+    auto up = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(u_ptr,pde,assembler,*parlist);
+    auto pp = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(p_ptr,pde,assembler,*parlist);
+    auto rp = ROL::makePtr<PDE_DualSimVector<RealT,DeviceT>>(r_ptr,pde,assembler,*parlist);
+    auto zp = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(z_ptr,pdeFilter,assemblerFilter,*parlist);
+
+    // Build volume objective function.
+    ROL::Ptr<QoI<RealT,DeviceT>> qoi_vol;
+    if (minType == "Compliance" && volEq) {
+      if (useFilter && !usePhaseField)
+        qoi_vol = ROL::makePtr<QoI_Volume_TopoOpt<RealT,DeviceT>>(ROL::dynamicPtrCast<PDE_Filter<RealT,DeviceT>>(pdeFilter)->getDensityFE(),
+                                                          volFraction);
+      else
+        qoi_vol = ROL::makePtr<QoI_Volume_TopoOpt<RealT,DeviceT>>(pde->getDensityFE(),
+                                                          volFraction);
+    }
+    else {
+      if (useFilter && !usePhaseField)
+        qoi_vol = ROL::makePtr<QoI_Volume_TopoOpt<RealT,DeviceT>>(ROL::dynamicPtrCast<PDE_Filter<RealT,DeviceT>>(pdeFilter)->getDensityFE());
+      else
+        qoi_vol = ROL::makePtr<QoI_Volume_TopoOpt<RealT,DeviceT>>(pde->getDensityFE());
+    }
+    auto obj_vol = ROL::makePtr<TopOptVolumeObjective<RealT,DeviceT>>(qoi_vol,assemblerFilter,zp);
+    auto con_vol = ROL::makePtr<TopOptVolumeConstraint<RealT,DeviceT>>(qoi_vol,assemblerFilter,zp);
+
+    // Compute volume
+    zp->setScalar(1.0);
+    RealT vol = obj_vol->value(*zp,tol);
+
+    // Normalize compliance objective function
+    RealT cs(1);
+    zp->setScalar(initDens);
+    if (normalizeObj) {
+      if (useFilter && !usePhaseField)
+        cs = ROL::dynamicPtrCast<TopOptFilteredComplianceObjective<RealT,DeviceT>>(robj_com)->normalize(*zp,tol);
+      else
+        cs = ROL::dynamicPtrCast<TopOptComplianceObjective<RealT,DeviceT>>(robj_com)->normalize(*zp,tol);
+    }
+
+    // Output problem details
+    *outStream << std::endl;
+    *outStream << "Problem Data"          << std::endl;
+    *outStream << "  Example:           " << example << std::endl;
+    *outStream << "  Dimension:         " << probDim << std::endl;
+    *outStream << "  Minimize Type:     " << minType << std::endl;
+    *outStream << "  Use Phase Field:   " << usePhaseField << std::endl;
+    if (!usePhaseField) {
+      *outStream << "  SIMP Power:        "
+                 << parlist->sublist("Problem").get("SIMP Power",3.0) << std::endl;
+      *outStream << "  Use Filter:        " << useFilter << std::endl;
+    }
+    if (minType == "Volume") {
+      *outStream << "  Compliance Factor: " << cmpFactor << std::endl;
+    }
+    else if (minType == "Compliance") {
+      *outStream << "  Volume Fraction:   " << volFraction << std::endl;
+      *outStream << "  Initial Density:   " << initDens    << std::endl;
+      *outStream << "  Use Equality:      " << volEq       << std::endl;
+      *outStream << "  Hessian Approx:    " << hessAppr    << std::endl;
+    }
+    *outStream << "  Domain Volume:     " << vol << std::endl;
+    *outStream << "  Compliance Scale:  " << cs  << std::endl;
+    *outStream << std::endl;
+
+    // Create objective, constraint, multiplier and bounds
+    ROL::Ptr<ROL::Objective<RealT>>       obj;
+    ROL::Ptr<ROL::Constraint<RealT>>      icon;
+    ROL::Ptr<ROL::Vector<RealT>>          iup, ilp;
+    ROL::Ptr<ROL::Vector<RealT>>          imul;
+    ROL::Ptr<ROL::BoundConstraint<RealT>> ibnd;
+    if (!usePhaseField) {
+      if (minType == "Volume") {
+        obj  = obj_vol;
+        icon = ROL::makePtr<ROL::ConstraintFromObjective<RealT>>(robj_com);
+        // Set upper bound to compliance for solid beam.
+        zp->setScalar(1.0);
+        RealT comp = robj_com->value(*zp,tol);
+        ilp  = ROL::makePtr<ROL::SingletonVector<RealT>>(0);
+        iup  = ROL::makePtr<ROL::SingletonVector<RealT>>(cmpFactor*comp);
+        imul = ROL::makePtr<ROL::SingletonVector<RealT>>(0);
+        ibnd = ROL::makePtr<ROL::Bounds<RealT>>(ilp,iup);
+      }
+      else if (minType == "Compliance") {
+        obj  = robj_com;
+        icon = con_vol;
+        imul = ROL::makePtr<ROL::SingletonVector<RealT>>(0);
+        if (volEq) {
+          ilp  = ROL::nullPtr;
+          iup  = ROL::nullPtr;
+          ibnd = ROL::nullPtr;
+        }
+        else {
+          // Set upper bound to fraction of total volume.
+          ilp  = ROL::makePtr<ROL::SingletonVector<RealT>>(0);
+          iup  = ROL::makePtr<ROL::SingletonVector<RealT>>(volFraction*vol);
+          ibnd = ROL::makePtr<ROL::Bounds<RealT>>(ilp,iup);
+        }
+      }
+      else if (minType == "Total") {
+        std::vector<ROL::Ptr<ROL::Objective<RealT>>> obj_vec(2,ROL::nullPtr);
+        obj_vec[0] = robj_com;
+        obj_vec[1] = obj_vol;
+        std::vector<RealT> weights(2,0.0);
+        weights[0] = static_cast<RealT>(1);
+        weights[1] = parlist->sublist("Problem").get("Volume Objective Scale",0.04096);
+        obj  = ROL::makePtr<ROL::LinearCombinationObjective<RealT>>(weights,obj_vec);
+        icon = ROL::nullPtr;
+        ilp  = ROL::nullPtr;
+        iup  = ROL::nullPtr;
+        imul = ROL::nullPtr;
+        ibnd = ROL::nullPtr;
+      }
+      else {
+        throw ROL::Exception::NotImplemented("Unknown minimization type!");
+      }
+    }
+    else {
+      // Build Modica-Mortola Energy objective function.
+      RealT penParam = parlist->sublist("Problem").get("Phase Field Penalty Parameter",1e-1);
+      ROL::Ptr<QoI<RealT,DeviceT>>
+        qoi_pfe = ROL::makePtr<QoI_ModicaMortolaEnergy_PhaseField<RealT,DeviceT>>(pde->getDensityFE(),
+                                                                          penParam);
+      ROL::Ptr<IntegralOptObjective<RealT,DeviceT>>
+        obj_pfe = ROL::makePtr<IntegralOptObjective<RealT,DeviceT>>(qoi_pfe,assembler);
+      // Get weights for linear combination objective.
+      std::vector<RealT> weights(1,0.0);
+      weights[0] = parlist->sublist("Problem").get("Phase Field Energy Objective Scale",0.00064);
+      std::vector<ROL::Ptr<ROL::Objective<RealT>>> obj_vec(1,ROL::nullPtr);
+      obj_vec[0] = obj_pfe;
+      if (minType == "Volume") {
+        weights.push_back(parlist->sublist("Problem").get("Volume Objective Scale",0.04096));
+        obj_vec.push_back(obj_vol);
+        obj  = ROL::makePtr<ROL::LinearCombinationObjective<RealT>>(weights,obj_vec);
+        icon = ROL::makePtr<ROL::ConstraintFromObjective<RealT>>(robj_com);
+        // Set upper bound to compliance for solid beam.
+        RealT comp = robj_com->value(*zp,tol);
+        ilp  = ROL::makePtr<ROL::SingletonVector<RealT>>(0);
+        iup  = ROL::makePtr<ROL::SingletonVector<RealT>>(cmpFactor*comp);
+        imul = ROL::makePtr<ROL::SingletonVector<RealT>>(0);
+        ibnd = ROL::makePtr<ROL::Bounds<RealT>>(ilp,iup);
+      }
+      else if (minType == "Compliance") {
+        weights.push_back(static_cast<RealT>(1));
+        obj_vec.push_back(robj_com);
+        obj  = ROL::makePtr<ROL::LinearCombinationObjective<RealT>>(weights,obj_vec);
+        icon = con_vol;
+        imul = ROL::makePtr<ROL::SingletonVector<RealT>>(0);
+        if (volEq) {
+          ilp  = ROL::nullPtr;
+          iup  = ROL::nullPtr;
+          ibnd = ROL::nullPtr;
+        }
+        else {
+          // Set upper bound to fraction of total volume.
+          ilp  = ROL::makePtr<ROL::SingletonVector<RealT>>(0);
+          iup  = ROL::makePtr<ROL::SingletonVector<RealT>>(volFraction*vol);
+          ibnd = ROL::makePtr<ROL::Bounds<RealT>>(ilp,iup);
+        }
+      }
+      else if (minType == "Total") {
+        weights.push_back(static_cast<RealT>(1));
+        obj_vec.push_back(robj_com);
+        weights.push_back(parlist->sublist("Problem").get("Volume Objective Scale",0.04096));
+        obj_vec.push_back(obj_vol);
+        obj  = ROL::makePtr<ROL::LinearCombinationObjective<RealT>>(weights,obj_vec);
+        icon = ROL::nullPtr;
+        ilp  = ROL::nullPtr;
+        iup  = ROL::nullPtr;
+        imul = ROL::nullPtr;
+        ibnd = ROL::nullPtr;
+      }
+      else {
+        throw ROL::Exception::NotImplemented("Unknown minimization type!");
+      }
+    }
+
+    // Initialize bound constraints.
+    RealT lval = (usePhaseField ? -1.0 : 0.0), uval = 1.0;
+    ROL::Ptr<ROL::Vector<RealT>> lop = zp->clone(); lop->setScalar(lval);
+    ROL::Ptr<ROL::Vector<RealT>> hip = zp->clone(); hip->setScalar(uval);
+    ROL::Ptr<ROL::BoundConstraint<RealT>>
+      bnd = ROL::makePtr<ROL::Bounds<RealT>>(lop,hip);
+    if (usePhaseField) bnd->deactivate();
+
+    // Build sampler.
+    BuildSampler<RealT> buildSampler(parlist->sublist("Problem"),probDim,example);
+    //int stochDim = buildSampler.getDimension();
+    int nsamp      = parlist->sublist("Problem").get("Number of samples", 1);
+    int nsamp_dist = parlist->sublist("Problem").get("Number of output samples",1000);
+    ROL::Ptr<ROL::BatchManager<RealT>>
+      bman = ROL::makePtr<ROL::TpetraTeuchosBatchManager<RealT>>(comm);
+    ROL::Ptr<ROL::SampleGenerator<RealT>>
+      sampler = buildSampler.get(nsamp,bman);
+    ROL::Ptr<ROL::SampleGenerator<RealT>>
+      sampler_dist = buildSampler.get(nsamp_dist,bman);
+
+    // Set up and solve.
+    int cnt = 0;
+    std::vector<RealT> volume, quantile;
+    bool derivCheck = parlist->sublist("Problem").get("Check derivatives",false);
+    parlist->sublist("SOL").set("Type","Risk Averse");
+    parlist->sublist("SOL").sublist("Risk Measure").set("Name","CVaR");
+    parlist->sublist("SOL").sublist("Risk Measure").sublist("CVaR").set("Smoothing Parameter",1e-4);
+    parlist->sublist("SOL").sublist("Risk Measure").sublist("CVaR").sublist("Distribution").set("Name","Parabolic");
+    parlist->sublist("SOL").sublist("Risk Measure").sublist("CVaR").sublist("Distribution").sublist("Parabolic").set("Lower Bound",0.0);
+    parlist->sublist("SOL").sublist("Risk Measure").sublist("CVaR").sublist("Distribution").sublist("Parabolic").set("Upper Bound",1.0);
+    std::vector<RealT> lambda
+      = ROL::getArrayFromStringParameter<RealT>(parlist->sublist("Problem"), "Convex Combination Parameters");
+    std::vector<RealT> beta
+      = ROL::getArrayFromStringParameter<RealT>(parlist->sublist("Problem"), "Confidence Levels");
+    ROL::Ptr<ROL::StochasticProblem<RealT>> prob;
+    for (unsigned i = 0; i < lambda.size(); ++i) {
+      parlist->sublist("SOL").sublist("Risk Measure").sublist("CVaR").set("Convex Combination Parameter",lambda[i]);
+      for (unsigned j = 0; j < beta.size(); ++j) {
+        if (cnt!=0) parlist->sublist("SOL").set("Initial Statistic",quantile[cnt]);
+        *outStream << std::endl << "  lambda = " << lambda[i]
+                   << "  beta = " << beta[j] << std::endl << std::endl;
+        parlist->sublist("SOL").sublist("Risk Measure").sublist("CVaR").set("Confidence Level",beta[j]);
+        // Set up optimization problem
+        prob = ROL::makePtr<ROL::StochasticProblem<RealT>>(obj,zp);
+        if (bnd->isActivated()) prob->addBoundConstraint(bnd);
+        if ( minType == "Compliance" ) {
+          if ( volEq ) prob->addLinearConstraint("Volume",icon,imul);
+          else         prob->addLinearConstraint("Volume",icon,imul,ibnd);
+        }
+        else if ( minType == "Volume" )
+          prob->addConstraint("Compliance",icon,imul,ibnd);
+        if ( minType == "Volume" ) {
+          ROL::Ptr<ROL::BatchManager<RealT>>
+            cbman = ROL::makePtr<ROL::SingletonTeuchosBatchManager<RealT,int>>(comm);
+          parlist->sublist("SOL").sublist("Compliance") = parlist->sublist("SOL");
+          prob->makeConstraintStochastic("Compliance",*parlist,sampler,cbman);
+        }
+        else {
+          parlist->sublist("SOL").sublist("Objective") = parlist->sublist("SOL");
+          prob->makeObjectiveStochastic(*parlist,sampler);
+        }
+        prob->setProjectionAlgorithm(*parlist);
+        prob->finalize(false,true,*outStream);
+
+        // Check derivatives.
+        if (derivCheck) prob->check(true,*outStream);
+
+        // Solve optimization problem
+        Teuchos::Time algoTimer("Algorithm Time", true);
+        ROL::Solver<RealT> solver(prob,*parlist);
+        solver.solve(*outStream);
+        algoTimer.stop();
+        *outStream << "Total optimization time = " << algoTimer.totalElapsedTime() << " seconds.\n";
+
+        // Output.
+        volume.push_back(obj_vol->value(*zp,tol));
+        quantile.push_back(prob->getSolutionStatistic((minType=="Volume") ? 1 : 0));
+        std::stringstream nameDens;
+        nameDens << "density_CVaR_lambda" << lambda[i] << "_beta" << beta[j] << ".txt";
+        assembler->outputTpetraVector(z_ptr,nameDens.str().c_str());
+        std::stringstream nameObj;
+        nameObj << "obj_samples_CVaR_lambda" << lambda[i] << "_beta" << beta[j] << ".txt";
+        if (minType != "Volume")
+          TopOptPrintCDF<RealT>(*obj,*zp,*sampler_dist,nsamp_dist,comm,nameObj.str());
+        else
+          TopOptPrintCDF<RealT>(*robj_com,*zp,*sampler_dist,nsamp_dist,comm,nameObj.str());
+        cnt++;
+      }
+    }
+
+    // Output.
+    assembler->printMeshData(*outStream);
+    const int rank = Teuchos::rank<int>(*comm);
+    if ( rank==0 ) {
+      std::stringstream nameVOL, nameVAR;
+      nameVOL << "vol.txt";
+      nameVAR << "var.txt";
+      std::ofstream fileVOL, fileVAR;
+      fileVOL.open(nameVOL.str());
+      fileVAR.open(nameVAR.str());
+      fileVOL << std::scientific << std::setprecision(15);
+      fileVAR << std::scientific << std::setprecision(15);
+      int size = quantile.size();
+      for (int i = 0; i < size; ++i) {
+        fileVOL << std::setw(25) << std::left << volume[i] << std::endl;
+        fileVAR << std::setw(25) << std::left << quantile[i] << std::endl;
+      }
+      fileVOL.close();
+      fileVAR.close();
+    }
+
+    // Get a summary from the time monitor.
+    Teuchos::TimeMonitor::summarize();
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/topo-opt/elasticity/src/dirichletK.hpp b/packages/rol/example/PDE-OPT/topo-opt/elasticity/src/dirichletK.hpp
new file mode 100644
index 000000000000..987c1df98df0
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/topo-opt/elasticity/src/dirichletK.hpp
@@ -0,0 +1,215 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  dirichlet.hpp
+    \brief Implements Dirichlet boundary conditions for the structural
+           topology optimization problem.
+*/
+
+#ifndef ROL_PDEOPT_ELASTICITY_DIRICHLETK_HPP
+#define ROL_PDEOPT_ELASTICITY_DIRICHLETK_HPP
+
+#include "ROL_Ptr.hpp"
+#include "ROL_ParameterList.hpp"
+#include "../../../TOOLS/feK.hpp"
+#include <vector>
+
+template<class Real, class DeviceType>
+class Dirichlet {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+
+private:
+  std::vector<int> sidesets_, types_;
+  std::vector<std::vector<scalar_view>> bdryCellNodes_;
+  std::vector<std::vector<std::vector<int>>> bdryCellLocIds_;
+  std::vector<std::vector<int>> fidx_;
+
+  std::vector<int> getConstrainedDimensions(int type, int dim) const {
+    std::vector<int> conDim;
+    if ( type == 1 ) {
+      conDim.push_back(0);
+    }
+    else if ( type == 2 ) {
+      conDim.push_back(1);
+    }
+    else if ( type == 3 && dim > 2 ) {
+      conDim.push_back(2);
+    }
+    else if ( type == 4 && dim > 2 ) {
+      conDim.push_back(0);
+      conDim.push_back(1);
+    }
+    else if ( type == 5 && dim > 2 ) {
+      conDim.push_back(0);
+      conDim.push_back(2);
+    }
+    else if ( type == 6 && dim > 2 ) {
+      conDim.push_back(1);
+      conDim.push_back(2);
+    }
+    else {
+      for (int i = 0; i < dim; ++i) {
+        conDim.push_back(i);
+      }
+    }
+    return conDim;
+  }
+
+public:
+  Dirichlet(ROL::ParameterList &parlist, std::string ex = "Default") {
+    if (ex == "2D Cantilever with 1 Load"  ||
+        ex == "3D Cantilever"              ||
+        ex == "2D Truss"                   ||
+        ex == "2D Beams") {
+      sidesets_.push_back(3);
+      types_.push_back(0);
+    }
+    else if (ex == "2D Cantilever with 3 Loads") {
+      sidesets_.push_back(8);
+      types_.push_back(0);
+    }
+    else if (ex == "2D Wheel") {
+      sidesets_.push_back(0);
+      sidesets_.push_back(4);
+      types_.push_back(0);
+      types_.push_back(2);
+    }
+    else if (ex == "2D Carrier Plate") {
+      sidesets_.push_back(0);
+      types_.push_back(0);
+    }
+    else if (ex == "3D L Beam") {
+      sidesets_.push_back(0);
+      types_.push_back(0);
+    }
+    else {
+      // Grab sidesets
+      sidesets_ = ROL::getArrayFromStringParameter<int>(parlist.sublist("Dirichlet"), "Sidesets");
+      // Grab type
+      types_ = ROL::getArrayFromStringParameter<int>(parlist.sublist("Dirichlet"), "Types");
+    }
+  }
+
+  void setCellNodes(const std::vector<std::vector<scalar_view>> &bdryCellNodes,
+                    const std::vector<std::vector<std::vector<int>>> &bdryCellLocIds,
+                    const std::vector<std::vector<int>> &fidx) {
+    bdryCellNodes_  = bdryCellNodes;
+    bdryCellLocIds_ = bdryCellLocIds;
+    fidx_           = fidx;
+  }
+
+  void applyResidual(std::vector<scalar_view> &R,
+                     const std::vector<scalar_view> &U) const {
+    const int d = R.size();
+    const int numSideSets = sidesets_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        const std::vector<int> conDim = getConstrainedDimensions(types_[i],d);
+        const int numConDim = conDim.size();
+        const int numLocalSideIds = bdryCellLocIds_[sidesets_[i]].size();
+        for (int j = 0; j < numLocalSideIds; ++j) {
+          const int numCellsSide = bdryCellLocIds_[sidesets_[i]][j].size();
+          const int numBdryDofs = fidx_[j].size();
+          for (int k = 0; k < numCellsSide; ++k) {
+            const int cidx = bdryCellLocIds_[sidesets_[i]][j][k];
+            for (int l = 0; l < numBdryDofs; ++l) {
+              for (int m=0; m < numConDim; ++m) {
+                (R[conDim[m]])(cidx,fidx_[j][l]) = (U[conDim[m]])(cidx,fidx_[j][l]);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  void applyJacobian1(std::vector<std::vector<scalar_view>> &J) const {
+    const int d = J.size();
+    const int f = J[0][0].extent_int(1);
+    const int numSideSets = sidesets_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        const std::vector<int> conDim = getConstrainedDimensions(types_[i],d);
+        const int numConDim = conDim.size();
+        const int numLocalSideIds = bdryCellLocIds_[sidesets_[i]].size();
+        for (int j = 0; j < numLocalSideIds; ++j) {
+          const int numCellsSide = bdryCellLocIds_[sidesets_[i]][j].size();
+          const int numBdryDofs = fidx_[j].size();
+          for (int k = 0; k < numCellsSide; ++k) {
+            const int cidx = bdryCellLocIds_[sidesets_[i]][j][k];
+            for (int l = 0; l < numBdryDofs; ++l) {
+              for (int m=0; m < f; ++m) {
+                for (int n=0; n < numConDim; ++n) {
+                  for (int p=0; p < d; ++p) {
+                    (J[conDim[n]][p])(cidx,fidx_[j][l],m) = static_cast<Real>(0);
+                  }
+                  (J[conDim[n]][conDim[n]])(cidx,fidx_[j][l],fidx_[j][l]) = static_cast<Real>(1);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  void applyJacobian2(std::vector<std::vector<scalar_view>> &J) const {
+    const int d = J.size();
+    const int f = J[0][0].extent_int(2);
+    const int numSideSets = sidesets_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        const std::vector<int> conDim = getConstrainedDimensions(types_[i],d);
+        const int numConDim = conDim.size();
+        const int numLocalSideIds = bdryCellLocIds_[sidesets_[i]].size();
+        for (int j = 0; j < numLocalSideIds; ++j) {
+          const int numCellsSide = bdryCellLocIds_[sidesets_[i]][j].size();
+          const int numBdryDofs = fidx_[j].size();
+          for (int k = 0; k < numCellsSide; ++k) {
+            const int cidx = bdryCellLocIds_[sidesets_[i]][j][k];
+            for (int l = 0; l < numBdryDofs; ++l) {
+              for (int m=0; m < f; ++m) {
+                for (int n=0; n < numConDim; ++n) {
+                  (J[conDim[n]][0])(cidx,fidx_[j][l],m) = static_cast<Real>(0);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  void applyMultiplier(std::vector<scalar_view> &L) const {
+    const int d = L.size();
+    const int numSideSets = sidesets_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        const std::vector<int> conDim = getConstrainedDimensions(types_[i],d);
+        const int numConDim = conDim.size();
+        const int numLocalSideIds = bdryCellLocIds_[sidesets_[i]].size();
+        for (int j = 0; j < numLocalSideIds; ++j) {
+          const int numCellsSide = bdryCellLocIds_[sidesets_[i]][j].size();
+          const int numBdryDofs = fidx_[j].size();
+          for (int k = 0; k < numCellsSide; ++k) {
+            const int cidx = bdryCellLocIds_[sidesets_[i]][j][k];
+            for (int l = 0; l < numBdryDofs; ++l) {
+              for (int m=0; m < numConDim; ++m) {
+                (L[conDim[m]])(cidx,fidx_[j][l]) = static_cast<Real>(0);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/topo-opt/elasticity/src/loadK.hpp b/packages/rol/example/PDE-OPT/topo-opt/elasticity/src/loadK.hpp
new file mode 100644
index 000000000000..ae2316f42951
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/topo-opt/elasticity/src/loadK.hpp
@@ -0,0 +1,213 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  load.hpp
+    \brief Implements points loads for the structural topology
+           optimization problem.
+*/
+
+#ifndef ROL_PDEOPT_ELASTICITY_LOADK_HPP
+#define ROL_PDEOPT_ELASTICITY_LOADK_HPP
+
+#include "ROL_Ptr.hpp"
+#include "ROL_ParameterList.hpp"
+#include "../../../TOOLS/feK.hpp"
+#include <vector>
+
+template<class Real, class DeviceType>
+class Load {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+
+private:
+  std::vector<Real> loadMagnitude_;
+  std::vector<Real> loadPolar_, loadAzimuth_;
+  std::vector<Real> loadX_, loadY_, loadZ_;
+  std::vector<Real> loadWidthX_, loadWidthY_, loadWidthZ_;
+
+protected:
+  Real DegreesToRadians(Real deg) const {
+    return deg * static_cast<Real>(M_PI) / static_cast<Real>(180);
+  }
+
+public:
+  virtual ~Load() {}
+
+  Load(void) {}
+
+  Load(ROL::ParameterList & parlist, std::string ex = "Default") {
+    if (ex == "2D Cantilever with 1 Load") {
+      loadMagnitude_.push_back(static_cast<Real>(1));
+      loadPolar_.push_back(static_cast<Real>(270));
+      loadAzimuth_.push_back(static_cast<Real>(0));
+      loadX_.push_back(static_cast<Real>(1));
+      loadY_.push_back(static_cast<Real>(0));
+      loadZ_.push_back(static_cast<Real>(0));
+      loadWidthX_.push_back(static_cast<Real>(0.125));
+      loadWidthY_.push_back(static_cast<Real>(0.125));
+      loadWidthZ_.push_back(static_cast<Real>(0.125));
+    }
+    if (ex == "3D Cantilever") {
+      loadMagnitude_.push_back(static_cast<Real>(1));
+      loadPolar_.push_back(static_cast<Real>(270));
+      loadAzimuth_.push_back(static_cast<Real>(90));
+      loadX_.push_back(static_cast<Real>(1));
+      loadY_.push_back(static_cast<Real>(0));
+      loadZ_.push_back(static_cast<Real>(0));
+      loadWidthX_.push_back(static_cast<Real>(0.125));
+      loadWidthY_.push_back(static_cast<Real>(0.125));
+      loadWidthZ_.push_back(static_cast<Real>(0.125));
+    }
+    else if (ex == "2D Truss") {
+      loadMagnitude_.push_back(static_cast<Real>(1));
+      loadPolar_.push_back(static_cast<Real>(270));
+      loadAzimuth_.push_back(static_cast<Real>(0));
+      loadX_.push_back(static_cast<Real>(2));
+      loadY_.push_back(static_cast<Real>(0.5));
+      loadZ_.push_back(static_cast<Real>(0));
+      loadWidthX_.push_back(static_cast<Real>(1e-2));
+      loadWidthY_.push_back(static_cast<Real>(1e-2));
+      loadWidthZ_.push_back(static_cast<Real>(1e-2));
+    }
+    else if (ex == "2D Beams") {
+      loadMagnitude_.push_back(static_cast<Real>(1));
+      loadMagnitude_.push_back(static_cast<Real>(1));
+      loadPolar_.push_back(static_cast<Real>(180));
+      loadPolar_.push_back(static_cast<Real>(180));
+      loadAzimuth_.push_back(static_cast<Real>(0));
+      loadAzimuth_.push_back(static_cast<Real>(0));
+      loadX_.push_back(static_cast<Real>(2));
+      loadX_.push_back(static_cast<Real>(2));
+      loadY_.push_back(static_cast<Real>(0));
+      loadY_.push_back(static_cast<Real>(1));
+      loadZ_.push_back(static_cast<Real>(0));
+      loadZ_.push_back(static_cast<Real>(0));
+      loadWidthX_.push_back(static_cast<Real>(1e-2));
+      loadWidthX_.push_back(static_cast<Real>(1e-2));
+      loadWidthY_.push_back(static_cast<Real>(1e-2));
+      loadWidthY_.push_back(static_cast<Real>(1e-2));
+      loadWidthZ_.push_back(static_cast<Real>(1e-2));
+      loadWidthZ_.push_back(static_cast<Real>(1e-2));
+    }
+    else if (ex == "2D Cantilever with 3 Loads" ||
+             ex == "2D Carrier Plate"           ||
+             ex == "2D Wheel"                   ||
+             ex == "3D L Beam") {
+      loadMagnitude_.clear();
+      loadPolar_.clear();
+      loadAzimuth_.clear();
+      loadX_.clear();
+      loadY_.clear();
+      loadZ_.clear();
+      loadWidthX_.clear();
+      loadWidthY_.clear();
+      loadWidthZ_.clear();
+    }
+    else {
+      if (parlist.isSublist("Load")) {
+        // Grab Magnitudes
+        loadMagnitude_ = ROL::getArrayFromStringParameter<double>(parlist.sublist("Load"), "Magnitude");
+        // Grab Polar Angle
+        loadPolar_ = ROL::getArrayFromStringParameter<double>(parlist.sublist("Load"), "Polar Angle");
+        // Grab Azimuth Angle
+        loadAzimuth_ = ROL::getArrayFromStringParameter<double>(parlist.sublist("Load"), "Azimuth Angle");
+        // Grab X Location
+        loadX_ = ROL::getArrayFromStringParameter<double>(parlist.sublist("Load"), "X Location");
+        // Grab Y Location
+        loadY_ = ROL::getArrayFromStringParameter<double>(parlist.sublist("Load"), "Y Location");
+        // Grab Z Location
+        loadZ_ = ROL::getArrayFromStringParameter<double>(parlist.sublist("Load"), "Z Location");
+        // Grab X Width
+        loadWidthX_ = ROL::getArrayFromStringParameter<double>(parlist.sublist("Load"), "X Width");
+        // Grab Y Width
+        loadWidthY_ = ROL::getArrayFromStringParameter<double>(parlist.sublist("Load"), "Y Width");
+        // Grab Z Width
+        loadWidthZ_ = ROL::getArrayFromStringParameter<double>(parlist.sublist("Load"), "Z Width");
+      }
+    }
+
+    int polarSize = loadPolar_.size();
+    for (int i=0; i<polarSize; ++i)
+      loadPolar_[i] = DegreesToRadians(loadPolar_[i]);
+
+    int azimuthSize = loadAzimuth_.size();
+    for (int i=0; i<azimuthSize; ++i)
+      loadAzimuth_[i] = DegreesToRadians(loadAzimuth_[i]);
+  }
+
+  bool isNull(void) const {
+    return (loadMagnitude_.size()==0);
+  }
+
+  virtual Real loadFunc(const std::vector<Real> &coords,
+                        const int                dir,
+                        const std::vector<Real> &param) const {
+    Real val(0);
+    const Real half(0.5);
+    const int numLoads  = loadMagnitude_.size();
+    const int paramSize = param.size();
+    const int d         = coords.size();
+    for (int i = 0; i < numLoads; ++i) {
+      Real loadMagNoise(0), loadAngNoise0(0);
+      if (paramSize > 0 + d*i)
+        loadMagNoise  = param[0 + d*i];
+      if (paramSize > 1 + d*i)
+        loadAngNoise0 = DegreesToRadians(param[1 + d*i]);
+      const Real loadMagnitude = loadMagnitude_[i] + loadMagNoise;
+      const Real loadAngle0    = loadPolar_[i] + loadAngNoise0;
+      const Real Gx = std::exp(-half*std::pow(coords[0]-loadX_[i],2)/std::pow(loadWidthX_[i],2));
+      const Real Gy = std::exp(-half*std::pow(coords[1]-loadY_[i],2)/std::pow(loadWidthY_[i],2));
+
+      if (d==2) {
+        if (dir==0)
+          val += loadMagnitude*std::cos(loadAngle0)*Gx*Gy;
+        if (dir==1)
+          val += loadMagnitude*std::sin(loadAngle0)*Gx*Gy;
+      }
+      if (d==3) {
+        Real loadAngNoise1(0);
+        if (paramSize > 2 + d*i)
+          loadAngNoise1 = DegreesToRadians(param[2 + d*i]);
+
+        const Real loadAngle1 = loadAzimuth_[i] + loadAngNoise1;
+        const Real Gz = std::exp(-half*std::pow(coords[2]-loadZ_[i],2)/std::pow(loadWidthZ_[i],2));
+
+        if (dir==0)
+          val += loadMagnitude*std::sin(loadAngle0)*std::cos(loadAngle1)*Gx*Gy*Gz;
+        if (dir==1)
+          val += loadMagnitude*std::sin(loadAngle0)*std::sin(loadAngle1)*Gx*Gy*Gz;
+        if (dir==2)
+          val += loadMagnitude*std::cos(loadAngle0)*Gx*Gy*Gz;
+      }
+    }
+    return val;
+  }
+
+  void compute(std::vector<scalar_view> &load,
+               const ROL::Ptr<FE<Real,DeviceType>> &fe,
+               const std::vector<Real> &param,
+               const Real scale = 1) const {
+    // Retrieve dimensions.
+    int c = fe->gradN().extent_int(0);
+    int p = fe->gradN().extent_int(2);
+    int d = fe->gradN().extent_int(3);
+    std::vector<Real> coord(d);
+
+    for (int i=0; i<c; ++i) {
+      for (int j=0; j<p; ++j) {
+        for (int k=0; k<d; ++k)
+          coord[k] = (fe->cubPts())(i,j,k);
+        for (int k=0; k<d; ++k)
+          (load[k])(i,j) = loadFunc(coord, k, param)*scale;
+      }
+    }
+  }
+};
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/topo-opt/elasticity/src/materialtensorK.hpp b/packages/rol/example/PDE-OPT/topo-opt/elasticity/src/materialtensorK.hpp
new file mode 100644
index 000000000000..42ab7eb28635
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/topo-opt/elasticity/src/materialtensorK.hpp
@@ -0,0 +1,343 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  obj.hpp
+    \brief Provides the interface for local (cell-based) objective function computations.
+*/
+
+#ifndef PDEOPT_MATERIALTENSORK_HPP
+#define PDEOPT_MATERIALTENSORK_HPP
+
+#include "../../../TOOLS/feK.hpp"
+
+template <class Real, class DeviceType>
+class MaterialTensor {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+
+private:
+  ROL::Ptr<FE<Real,DeviceType>> fe_;
+
+  // Problem parameters.
+  Real youngsModulus_;
+  Real poissonRatio_;
+  bool isPlainStress_;
+  Real minDensity_;
+  Real maxDensity_;
+  Real powerSIMP_;
+  bool usePhaseField_;
+  bool useProjection_;
+  Real beta_, eta_;
+  std::vector<std::vector<Real>> materialMat_;
+
+  // Precomputed quantities.
+  std::vector<scalar_view> BMat_;
+  std::vector<scalar_view> BdetJMat_;
+  std::vector<scalar_view> NMat_;
+  std::vector<scalar_view> NdetJMat_;
+  std::vector<scalar_view> CBdetJMat_;
+
+  void computeMaterialTensor(const int d) {
+    int matd = (d*(d+1))/2;
+    std::vector<Real> tmpvec(matd);
+    materialMat_.resize(matd, tmpvec);
+    if ((d==2) && (isPlainStress_)) {
+      Real one(1), half(0.5);
+      Real factor1 = youngsModulus_ /(one-poissonRatio_*poissonRatio_);
+      materialMat_[0][0] = factor1;
+      materialMat_[0][1] = factor1 * poissonRatio_;
+      materialMat_[1][0] = factor1 * poissonRatio_;
+      materialMat_[1][1] = factor1;
+      materialMat_[2][2] = factor1 * half * (one-poissonRatio_);
+    }
+    else if ((d==2) && (!isPlainStress_)) {
+      Real one(1), two(2), half(0.5);
+      Real factor2 = youngsModulus_ /(one+poissonRatio_)/(one-two*poissonRatio_);
+      materialMat_[0][0] = factor2 * (one-poissonRatio_);
+      materialMat_[0][1] = factor2 * poissonRatio_;
+      materialMat_[1][0] = factor2 * poissonRatio_;
+      materialMat_[1][1] = factor2 * (one-poissonRatio_);
+      materialMat_[2][2] = factor2 * half * (one-two*poissonRatio_);
+    }
+    else {
+      Real one(1), two(2), half(0.5);
+      Real lam = youngsModulus_*poissonRatio_/(one+poissonRatio_)/(one-two*poissonRatio_);
+      Real mu = half*youngsModulus_/(one+poissonRatio_);
+      materialMat_[0][0] = lam + two*mu;
+      materialMat_[0][1] = lam;
+      materialMat_[0][2] = lam;
+      materialMat_[1][0] = lam;
+      materialMat_[1][1] = lam + two*mu;
+      materialMat_[1][2] = lam;
+      materialMat_[2][0] = lam;
+      materialMat_[2][1] = lam;
+      materialMat_[2][2] = lam + two*mu;
+      materialMat_[3][3] = mu;
+      materialMat_[4][4] = mu;
+      materialMat_[5][5] = mu;
+    }
+  }
+
+  void computeNBmats(void) {
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    int p = fe_->gradN().extent_int(2);
+    int d = fe_->gradN().extent_int(3);
+    int matd = materialMat_.size();
+
+    for (int i=0; i<d; ++i) {
+      BMat_.push_back(scalar_view("BMat", c,f,p,matd));
+      BdetJMat_.push_back(scalar_view("BdetJMat", c,f,p,matd));
+      CBdetJMat_.push_back(scalar_view("CBdetJMat", c,f,p,matd));
+      NMat_.push_back(scalar_view("NMat", c,f,p,d));
+      NdetJMat_.push_back(scalar_view("NdetJMat", c,f,p,d));
+    }
+
+    if (d==2) {
+      for (int i=0; i<c; ++i) {
+        for (int j=0; j<f; ++j) {
+      	  for (int k=0; k<p; ++k) {
+            (NMat_[0])(i, j, k, 0) = (fe_->N())(i, j, k);
+            (NMat_[1])(i, j, k, 1) = (fe_->N())(i, j, k);
+            (NdetJMat_[0])(i, j, k, 0) = (fe_->NdetJ())(i, j, k);
+            (NdetJMat_[1])(i, j, k, 1) = (fe_->NdetJ())(i, j, k);
+                
+            (BMat_[0])(i, j, k, 0) = (fe_->gradN())(i, j, k, 0);
+            (BMat_[1])(i, j, k, 1) = (fe_->gradN())(i, j, k, 1);
+            (BMat_[0])(i, j, k, 2) = (fe_->gradN())(i, j, k, 1);
+            (BMat_[1])(i, j, k, 2) = (fe_->gradN())(i, j, k, 0);
+
+            (BdetJMat_[0])(i, j, k, 0) = (fe_->gradNdetJ())(i, j, k, 0);
+            (BdetJMat_[1])(i, j, k, 1) = (fe_->gradNdetJ())(i, j, k, 1);
+            (BdetJMat_[0])(i, j, k, 2) = (fe_->gradNdetJ())(i, j, k, 1);
+            (BdetJMat_[1])(i, j, k, 2) = (fe_->gradNdetJ())(i, j, k, 0);
+          }
+        }
+      }
+    }
+
+    if(d==3) {
+      for (int i=0; i<c; ++i) {
+        for (int j=0; j<f; ++j) {
+          for (int k=0; k<p; ++k) {
+            (NMat_[0])(i, j, k, 0) = (fe_->N())(i, j, k);
+            (NMat_[1])(i, j, k, 1) = (fe_->N())(i, j, k);
+            (NMat_[2])(i, j, k, 2) = (fe_->N())(i, j, k);
+            (NdetJMat_[0])(i, j, k, 0) = (fe_->NdetJ())(i, j, k);
+            (NdetJMat_[1])(i, j, k, 1) = (fe_->NdetJ())(i, j, k);
+            (NdetJMat_[2])(i, j, k, 2) = (fe_->NdetJ())(i, j, k);
+            
+            (BMat_[0])(i, j, k, 0) = (fe_->gradN())(i, j, k, 0);
+            (BMat_[1])(i, j, k, 1) = (fe_->gradN())(i, j, k, 1);
+            (BMat_[2])(i, j, k, 2) = (fe_->gradN())(i, j, k, 2);
+            (BMat_[1])(i, j, k, 3) = (fe_->gradN())(i, j, k, 2);
+            (BMat_[2])(i, j, k, 3) = (fe_->gradN())(i, j, k, 1);
+            (BMat_[0])(i, j, k, 4) = (fe_->gradN())(i, j, k, 2);
+            (BMat_[2])(i, j, k, 4) = (fe_->gradN())(i, j, k, 0);
+            (BMat_[0])(i, j, k, 5) = (fe_->gradN())(i, j, k, 1);
+            (BMat_[1])(i, j, k, 5) = (fe_->gradN())(i, j, k, 0);
+                
+            (BdetJMat_[0])(i, j, k, 0) = (fe_->gradNdetJ())(i, j, k, 0);
+            (BdetJMat_[1])(i, j, k, 1) = (fe_->gradNdetJ())(i, j, k, 1);
+            (BdetJMat_[2])(i, j, k, 2) = (fe_->gradNdetJ())(i, j, k, 2);
+            (BdetJMat_[1])(i, j, k, 3) = (fe_->gradNdetJ())(i, j, k, 2);
+            (BdetJMat_[2])(i, j, k, 3) = (fe_->gradNdetJ())(i, j, k, 1);
+            (BdetJMat_[0])(i, j, k, 4) = (fe_->gradNdetJ())(i, j, k, 2);
+            (BdetJMat_[2])(i, j, k, 4) = (fe_->gradNdetJ())(i, j, k, 0);
+            (BdetJMat_[0])(i, j, k, 5) = (fe_->gradNdetJ())(i, j, k, 1);
+            (BdetJMat_[1])(i, j, k, 5) = (fe_->gradNdetJ())(i, j, k, 0);
+          }
+        }
+      }
+    } 
+
+    for (int i=0; i<c; ++i) {
+      for (int j=0; j<f; ++j) {
+        for (int k=0; k<p; ++k) {
+          for (int l=0; l<d; ++l) {
+            for (int m=0; m<matd; ++m) {
+              for (int n=0; n<matd; ++n) {
+                (CBdetJMat_[l])(i,j,k,m) += materialMat_[m][n] * (BdetJMat_[l])(i,j,k,n);
+              }
+            }
+          }
+        }
+      }
+    }
+
+  }
+
+public:
+  MaterialTensor(ROL::ParameterList &parlist) {
+    // Problem parameters.
+    youngsModulus_  = parlist.get("Young's Modulus",      1.0);
+    poissonRatio_   = parlist.get("Poisson Ratio",        0.3);
+    isPlainStress_  = parlist.get("Use Plain Stress",     true);
+    minDensity_     = parlist.get("Minimum Density",      1e-4);
+    maxDensity_     = parlist.get("Maximum Density",      1.0);
+    powerSIMP_      = parlist.get("SIMP Power",           3.0);
+    usePhaseField_  = parlist.get("Use Phase Field",      false);
+    useProjection_  = parlist.get("Use Projection",       false);
+    beta_           = parlist.get("Projection Parameter", 1.0);
+    eta_            = parlist.get("Projection Threshold", 0.5);
+  }
+
+  void setFE(ROL::Ptr<FE<Real,DeviceType>> &fe) {
+    fe_ = fe;
+    int d = fe_->gradN().extent_int(3);
+    computeMaterialTensor(d);
+    computeNBmats();
+  }
+
+  void applyTensor(scalar_view & out, const scalar_view inData) const {
+    int c = fe_->gradN().extent_int(0);
+    int p = fe_->gradN().extent_int(2);
+    int matd = materialMat_.size();
+
+    for (int i=0; i<c; ++i) {
+      for (int j=0; j<p; ++j) {
+        for (int k=0; k<matd; ++k) {
+          for (int l=0; l<matd; ++l) {
+            out(i,j,k) += materialMat_[k][l] * inData(i,j,l);
+          }
+        }
+      }
+    }
+  }
+
+  void computeUmat(scalar_view & UMat, std::vector<scalar_view> & gradU) const {
+    int c = fe_->gradN().extent_int(0);
+    int p = fe_->gradN().extent_int(2);
+    int d = fe_->gradN().extent_int(3);
+    int matd = materialMat_.size();
+
+    UMat = scalar_view("Umat", c,p,matd);
+
+    if (d==2) {
+      for (int i=0; i<c; ++i) {
+        for (int j=0; j<p; ++j) {
+          UMat(i, j, 0) = (gradU[0])(i, j, 0);
+          UMat(i, j, 1) = (gradU[1])(i, j, 1);
+          UMat(i, j, 2) = (gradU[0])(i, j, 1) + (gradU[1])(i, j, 0);
+        }
+      }
+    }
+
+    if(d==3) {
+      for (int i=0; i<c; ++i) {
+        for (int j=0; j<p; ++j) {
+          UMat(i, j, 0) = (gradU[0])(i, j, 0);
+          UMat(i, j, 1) = (gradU[1])(i, j, 1);
+          UMat(i, j, 2) = (gradU[2])(i, j, 2);
+          UMat(i, j, 3) = (gradU[1])(i, j, 2) + (gradU[2])(i, j, 1);
+          UMat(i, j, 4) = (gradU[0])(i, j, 2) + (gradU[2])(i, j, 0);
+          UMat(i, j, 5) = (gradU[0])(i, j, 1) + (gradU[1])(i, j, 0);
+        }
+      }
+    } 
+
+  }
+
+  void computeDensity(scalar_view & rho,
+                      const scalar_view & Z,
+                      const int deriv = 0) const {
+    // Retrieve dimensions.
+    int c = rho.extent_int(0);
+    int p = rho.extent_int(1);
+
+    Real z(0);
+    const Real half(0.5), one(1), two(2), diff = maxDensity_-minDensity_;
+    for (int i=0; i<c; ++i) {
+      for (int j=0; j<p; ++j) {
+        z = Z(i,j);
+        if (!usePhaseField_) {
+          if (!useProjection_) {
+            if (deriv==0) {
+              rho(i,j) = minDensity_ + diff*std::pow(z, powerSIMP_);
+            }
+            else if (deriv==1) {
+              rho(i,j) = powerSIMP_*diff*std::pow(z, powerSIMP_-one);
+            }
+            else if (deriv==2) {
+              rho(i,j) = powerSIMP_*(powerSIMP_-one)*diff*std::pow(z, powerSIMP_-two);
+            }
+          }
+          else {
+            if (deriv==0) {
+              Real pz = project(z,0);
+              rho(i,j) = minDensity_ + diff*std::pow(pz, powerSIMP_);
+            }
+            else if (deriv==1) {
+              Real pz = project(z,0), pz1 = project(z,1);
+              rho(i,j) = powerSIMP_*diff*std::pow(pz, powerSIMP_-one)*pz1;
+            }
+            else if (deriv==2) {
+              Real pz = project(z,0), pz1 = project(z,1), pz2 = project(z,2);
+              rho(i,j) = powerSIMP_*(powerSIMP_-one)*diff*std::pow(pz, powerSIMP_-two)*pz1*pz1
+                         +powerSIMP_*diff*std::pow(pz, powerSIMP_-one)*pz2;
+            }
+          }
+        }
+        else {
+          z += one;
+          z *= half;
+          if (deriv==0) {
+            rho(i,j) = minDensity_ + diff*std::pow(z, 2);
+          }
+          else if (deriv==1) { 
+            rho(i,j) = diff*z;
+          }
+          else if (deriv==2) {
+            rho(i,j) = half*diff;
+          }
+        }
+      }
+    }
+  }
+
+  const scalar_view B(const int i) const {
+    return BMat_[i];
+  }
+
+  const scalar_view BdetJ(const int i) const {
+    return BdetJMat_[i];
+  }
+
+  const scalar_view N(const int i) const {
+    return NMat_[i];
+  }
+
+  const scalar_view NdetJ(const int i) const{
+    return NdetJMat_[i];
+  }
+
+  const scalar_view CBdetJ(const int i) const {
+    return CBdetJMat_[i];
+  }
+
+  int getMatrixDim(void) const {
+    return materialMat_.size();
+  }
+
+private:
+  Real project(Real rho, int deriv) const {
+    const Real one(1), tbe(std::tanh(beta_*eta_)), tbe1(std::tanh(beta_*(one-eta_)));
+    if (deriv==1) {
+      const Real tber(std::tanh(beta_*(rho-eta_)));
+      return (one-tber*tber)/(tbe+tbe1)*beta_;
+    }
+    else if (deriv==2) {
+      const Real tber(std::tanh(beta_*(rho-eta_))), two(2);
+      return -two*tber*(one-tber*tber)/(tbe+tbe1)*beta_*beta_;
+    }
+    return (tbe+std::tanh(beta_*(rho-eta_)))/(tbe+tbe1);
+  }
+};
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/topo-opt/elasticity/src/mesh_topo-optK.hpp b/packages/rol/example/PDE-OPT/topo-opt/elasticity/src/mesh_topo-optK.hpp
new file mode 100644
index 000000000000..78fad3cd7830
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/topo-opt/elasticity/src/mesh_topo-optK.hpp
@@ -0,0 +1,341 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+#ifndef MESHMANAGER_ELASTICITYK_HPP
+#define MESHMANAGER_ELASTICITYK_HPP
+
+#include "../../../TOOLS/meshmanagerK.hpp"
+
+template <class Real, class DeviceType>
+class MeshManager_TopoOpt : public MeshManager_Rectangle<Real,DeviceType> {
+private:
+  int nx_;
+  int ny_;
+  Real height_;
+  Real width_;
+  std::string name_;
+  ROL::Ptr<std::vector<std::vector<std::vector<int>>>>  meshSideSets_;
+
+public: 
+
+  MeshManager_TopoOpt(ROL::ParameterList &parlist) : MeshManager_Rectangle<Real,DeviceType>(parlist)
+  {
+    nx_     = parlist.sublist("Geometry").get("NX", 3);
+    ny_     = parlist.sublist("Geometry").get("NY", 3);
+    height_ = parlist.sublist("Geometry").get("Height", 1.0);
+    width_  = parlist.sublist("Geometry").get("Width", 2.0);
+    name_   = parlist.sublist("Problem").get("Example","Default");
+    computeSideSets();
+  }
+
+  void computeSideSets() {
+    if (name_ == "2D Cantilever with 3 Loads") {
+      cantilever_computeSideSets();
+    }
+    else if (name_ == "2D Carrier Plate") {
+      carrierplate_computeSideSets();
+    }
+    else if (name_ == "2D Wheel") {
+      wheel_computeSideSets();
+    }
+    else {
+      default_computeSideSets();
+    }
+  }
+
+private:
+
+  void cantilever_computeSideSets() {
+    int numSideSets = 9;
+    meshSideSets_ = ROL::makePtr<std::vector<std::vector<std::vector<int>>>>(numSideSets);
+
+    Real pf(1.0/6.0);
+    int np = static_cast<int>(pf * static_cast<Real>(nx_));
+    int nf = nx_-5*np;
+
+    // Bottom
+    (*meshSideSets_)[0].resize(4);
+    (*meshSideSets_)[0][0].resize(nf);
+    (*meshSideSets_)[0][1].resize(0);
+    (*meshSideSets_)[0][2].resize(0);
+    (*meshSideSets_)[0][3].resize(0);
+    (*meshSideSets_)[1].resize(4);
+    (*meshSideSets_)[1][0].resize(np);
+    (*meshSideSets_)[1][1].resize(0);
+    (*meshSideSets_)[1][2].resize(0);
+    (*meshSideSets_)[1][3].resize(0);
+    (*meshSideSets_)[2].resize(4);
+    (*meshSideSets_)[2][0].resize(np);
+    (*meshSideSets_)[2][1].resize(0);
+    (*meshSideSets_)[2][2].resize(0);
+    (*meshSideSets_)[2][3].resize(0);
+    (*meshSideSets_)[3].resize(4);
+    (*meshSideSets_)[3][0].resize(np);
+    (*meshSideSets_)[3][1].resize(0);
+    (*meshSideSets_)[3][2].resize(0);
+    (*meshSideSets_)[3][3].resize(0);
+    (*meshSideSets_)[4].resize(4);
+    (*meshSideSets_)[4][0].resize(np);
+    (*meshSideSets_)[4][1].resize(0);
+    (*meshSideSets_)[4][2].resize(0);
+    (*meshSideSets_)[4][3].resize(0);
+    (*meshSideSets_)[5].resize(4);
+    (*meshSideSets_)[5][0].resize(np);
+    (*meshSideSets_)[5][1].resize(0);
+    (*meshSideSets_)[5][2].resize(0);
+    (*meshSideSets_)[5][3].resize(0);
+    // Right
+    (*meshSideSets_)[6].resize(4);
+    (*meshSideSets_)[6][0].resize(0);
+    (*meshSideSets_)[6][1].resize(ny_);
+    (*meshSideSets_)[6][2].resize(0);
+    (*meshSideSets_)[6][3].resize(0);
+    // Top
+    (*meshSideSets_)[7].resize(4);
+    (*meshSideSets_)[7][0].resize(0);
+    (*meshSideSets_)[7][1].resize(0);
+    (*meshSideSets_)[7][2].resize(nx_);
+    (*meshSideSets_)[7][3].resize(0);
+    // Left
+    (*meshSideSets_)[8].resize(4);
+    (*meshSideSets_)[8][0].resize(0);
+    (*meshSideSets_)[8][1].resize(0);
+    (*meshSideSets_)[8][2].resize(0);
+    (*meshSideSets_)[8][3].resize(ny_);
+    
+    for (int i=0; i<nf; ++i) {
+      (*meshSideSets_)[0][0][i] = i;
+    }
+    for (int i=0; i<np; ++i) {
+      (*meshSideSets_)[1][0][i] = i + nf;
+    }
+    for (int i=0; i<np; ++i) {
+      (*meshSideSets_)[2][0][i] = i + (nf+np);
+    }
+    for (int i=0; i<np; ++i) {
+      (*meshSideSets_)[3][0][i] = i + (nf+2*np);
+    }
+    for (int i=0; i<np; ++i) {
+      (*meshSideSets_)[4][0][i] = i + (nf+3*np);
+    }
+    for (int i=0; i<np; ++i) {
+      (*meshSideSets_)[5][0][i] = i + (nf+4*np);
+    }
+    for (int i=0; i<ny_; ++i) {
+      (*meshSideSets_)[6][1][i] = (i+1)*nx_-1;
+    }
+    for (int i=0; i<nx_; ++i) {
+      (*meshSideSets_)[7][2][i] = i + nx_*(ny_-1);
+    }
+    for (int i=0; i<ny_; ++i) {
+      (*meshSideSets_)[8][3][i] = i*nx_;
+    }
+  } // cantilever_computeSideSets
+
+  void carrierplate_computeSideSets() {
+    int numSideSets = 6;
+    meshSideSets_ = ROL::makePtr<std::vector<std::vector<std::vector<int>>>>(numSideSets);
+
+    Real pf(0.25);
+    int np = static_cast<int>(pf * static_cast<Real>(nx_));
+    int nc = nx_-2*np;
+
+    // Bottom
+    (*meshSideSets_)[0].resize(4);
+    (*meshSideSets_)[0][0].resize(nx_);
+    (*meshSideSets_)[0][1].resize(0);
+    (*meshSideSets_)[0][2].resize(0);
+    (*meshSideSets_)[0][3].resize(0);
+    // Right
+    (*meshSideSets_)[1].resize(4);
+    (*meshSideSets_)[1][0].resize(0);
+    (*meshSideSets_)[1][1].resize(ny_);
+    (*meshSideSets_)[1][2].resize(0);
+    (*meshSideSets_)[1][3].resize(0);
+    // Top
+    (*meshSideSets_)[2].resize(4);
+    (*meshSideSets_)[2][0].resize(0);
+    (*meshSideSets_)[2][1].resize(0);
+    (*meshSideSets_)[2][2].resize(np);
+    (*meshSideSets_)[2][3].resize(0);
+    (*meshSideSets_)[3].resize(4);
+    (*meshSideSets_)[3][0].resize(0);
+    (*meshSideSets_)[3][1].resize(0);
+    (*meshSideSets_)[3][2].resize(nc);
+    (*meshSideSets_)[3][3].resize(0);
+    (*meshSideSets_)[4].resize(4);
+    (*meshSideSets_)[4][0].resize(0);
+    (*meshSideSets_)[4][1].resize(0);
+    (*meshSideSets_)[4][2].resize(np);
+    (*meshSideSets_)[4][3].resize(0);
+    // Left
+    (*meshSideSets_)[5].resize(4);
+    (*meshSideSets_)[5][0].resize(0);
+    (*meshSideSets_)[5][1].resize(0);
+    (*meshSideSets_)[5][2].resize(0);
+    (*meshSideSets_)[5][3].resize(ny_);
+    
+    for (int i=0; i<nx_; ++i) {
+      (*meshSideSets_)[0][0][i] = i;
+    }
+    for (int i=0; i<ny_; ++i) {
+      (*meshSideSets_)[1][1][i] = (i+1)*nx_-1;
+    }
+    for (int i=0; i<np; ++i) {
+      (*meshSideSets_)[2][2][i] = i + nx_*(ny_-1);
+    }
+    for (int i=0; i<nc; ++i) {
+      (*meshSideSets_)[3][2][i] = i + nx_*(ny_-1) + np;
+    }
+    for (int i=0; i<np; ++i) {
+      (*meshSideSets_)[4][2][i] = i + nx_*(ny_-1) + (np+nc);
+    }
+    for (int i=0; i<ny_; ++i) {
+      (*meshSideSets_)[5][3][i] = i*nx_;
+    }
+  } // carrierplate_computeSideSets
+
+  void wheel_computeSideSets() {
+    int numSideSets = 8;
+    meshSideSets_ = ROL::makePtr<std::vector<std::vector<std::vector<int>>>>(numSideSets);
+
+    Real pf(0.125);
+    Real patchFrac = (pf < width_) ? pf/width_ : pf;
+    int np = static_cast<int>(patchFrac * static_cast<Real>(nx_));
+    int nz = static_cast<int>(0.5*static_cast<Real>(nx_-4*np));
+    int nc = nx_-2*(np+nz);
+
+    // Bottom
+    (*meshSideSets_)[0].resize(4);
+    (*meshSideSets_)[0][0].resize(np);
+    (*meshSideSets_)[0][1].resize(0);
+    (*meshSideSets_)[0][2].resize(0);
+    (*meshSideSets_)[0][3].resize(0);
+    (*meshSideSets_)[1].resize(4);
+    (*meshSideSets_)[1][0].resize(nz);
+    (*meshSideSets_)[1][1].resize(0);
+    (*meshSideSets_)[1][2].resize(0);
+    (*meshSideSets_)[1][3].resize(0);
+    (*meshSideSets_)[2].resize(4);
+    (*meshSideSets_)[2][0].resize(nc);
+    (*meshSideSets_)[2][1].resize(0);
+    (*meshSideSets_)[2][2].resize(0);
+    (*meshSideSets_)[2][3].resize(0);
+    (*meshSideSets_)[3].resize(4);
+    (*meshSideSets_)[3][0].resize(nz);
+    (*meshSideSets_)[3][1].resize(0);
+    (*meshSideSets_)[3][2].resize(0);
+    (*meshSideSets_)[3][3].resize(0);
+    (*meshSideSets_)[4].resize(4);
+    (*meshSideSets_)[4][0].resize(np);
+    (*meshSideSets_)[4][1].resize(0);
+    (*meshSideSets_)[4][2].resize(0);
+    (*meshSideSets_)[4][3].resize(0);
+    // Right
+    (*meshSideSets_)[5].resize(4);
+    (*meshSideSets_)[5][0].resize(0);
+    (*meshSideSets_)[5][1].resize(ny_);
+    (*meshSideSets_)[5][2].resize(0);
+    (*meshSideSets_)[5][3].resize(0);
+    // Top
+    (*meshSideSets_)[6].resize(4);
+    (*meshSideSets_)[6][0].resize(0);
+    (*meshSideSets_)[6][1].resize(0);
+    (*meshSideSets_)[6][2].resize(nx_);
+    (*meshSideSets_)[6][3].resize(0);
+    // Left
+    (*meshSideSets_)[7].resize(4);
+    (*meshSideSets_)[7][0].resize(0);
+    (*meshSideSets_)[7][1].resize(0);
+    (*meshSideSets_)[7][2].resize(0);
+    (*meshSideSets_)[7][3].resize(ny_);
+    
+    for (int i=0; i<np; ++i) {
+      (*meshSideSets_)[0][0][i] = i;
+    }
+    for (int i=0; i<nz; ++i) {
+      (*meshSideSets_)[1][0][i] = i + np;
+    }
+    for (int i=0; i<nc; ++i) {
+      (*meshSideSets_)[2][0][i] = i + (np+nz);
+    }
+    for (int i=0; i<nz; ++i) {
+      (*meshSideSets_)[3][0][i] = i + (np+nz+nc);
+    }
+    for (int i=0; i<np; ++i) {
+      (*meshSideSets_)[4][0][i] = i + (np+nz+nc+nz);
+    }
+    for (int i=0; i<ny_; ++i) {
+      (*meshSideSets_)[5][1][i] = (i+1)*nx_-1;
+    }
+    for (int i=0; i<nx_; ++i) {
+      (*meshSideSets_)[6][2][i] = i + nx_*(ny_-1);
+    }
+    for (int i=0; i<ny_; ++i) {
+      (*meshSideSets_)[7][3][i] = i*nx_;
+    }
+  } // wheel_computeSideSets
+
+  void default_computeSideSets() {
+
+    int numSideSets = 4;
+    meshSideSets_ = ROL::makePtr<std::vector<std::vector<std::vector<int>>>>(numSideSets);
+
+    //
+    (*meshSideSets_)[0].resize(4);
+    (*meshSideSets_)[0][0].resize(nx_);
+    (*meshSideSets_)[0][1].resize(0);
+    (*meshSideSets_)[0][2].resize(0);
+    (*meshSideSets_)[0][3].resize(0);
+    //
+    (*meshSideSets_)[1].resize(4);
+    (*meshSideSets_)[1][0].resize(0);
+    (*meshSideSets_)[1][1].resize(ny_);
+    (*meshSideSets_)[1][2].resize(0);
+    (*meshSideSets_)[1][3].resize(0);
+    //
+    (*meshSideSets_)[2].resize(4);
+    (*meshSideSets_)[2][0].resize(0);
+    (*meshSideSets_)[2][1].resize(0);
+    (*meshSideSets_)[2][2].resize(nx_);
+    (*meshSideSets_)[2][3].resize(0);
+    //
+    (*meshSideSets_)[3].resize(4);
+    (*meshSideSets_)[3][0].resize(0);
+    (*meshSideSets_)[3][1].resize(0);
+    (*meshSideSets_)[3][2].resize(0);
+    (*meshSideSets_)[3][3].resize(ny_);
+    
+    for (int i=0; i<nx_; ++i) {
+      (*meshSideSets_)[0][0][i] = i;
+    }
+    for (int i=0; i<ny_; ++i) {
+      (*meshSideSets_)[1][1][i] = (i+1)*nx_-1;
+    }
+    for (int i=0; i<nx_; ++i) {
+      (*meshSideSets_)[2][2][i] = i + nx_*(ny_-1);
+    }
+    for (int i=0; i<ny_; ++i) {
+      (*meshSideSets_)[3][3][i] = i*nx_;
+    }
+  } // default_computeSideSets
+
+  ROL::Ptr<std::vector<std::vector<std::vector<int>>>> getSideSets(
+              const bool verbose = false,
+              std::ostream & outStream = std::cout) const { 
+    if ( verbose ) {
+      outStream << "Mesh_TopoOpt: getSideSets called" << std::endl;
+      outStream << "Mesh_TopoOpt: numSideSets = " << meshSideSets_->size() << std::endl;
+    }
+    return meshSideSets_;
+  }
+  
+}; // MeshManager_TopoOpt
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/topo-opt/elasticity/src/pde_elasticityK.hpp b/packages/rol/example/PDE-OPT/topo-opt/elasticity/src/pde_elasticityK.hpp
new file mode 100644
index 000000000000..9ea3c0e3a47f
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/topo-opt/elasticity/src/pde_elasticityK.hpp
@@ -0,0 +1,639 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  pde_elasticity.hpp
+    \brief Implements the local PDE interface for the structural topology
+           optimization problem.
+*/
+
+#ifndef PDE_TOPO_OPT_ELASTICITY_HPP
+#define PDE_TOPO_OPT_ELASTICITY_HPP
+
+#include "../../../TOOLS/pdeK.hpp"
+#include "../../../TOOLS/feK.hpp"
+#include "../../../TOOLS/fieldhelperK.hpp"
+
+#include "dirichletK.hpp"
+#include "tractionK.hpp"
+#include "loadK.hpp"
+#include "materialtensorK.hpp"
+
+#include "Intrepid2_HVOL_C0_FEM.hpp"
+#include "Intrepid2_HGRAD_QUAD_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_QUAD_C2_FEM.hpp"
+#include "Intrepid2_HGRAD_HEX_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_HEX_C2_FEM.hpp"
+#include "Intrepid2_DefaultCubatureFactory.hpp"
+#include "Intrepid2_FunctionSpaceTools.hpp"
+#include "Intrepid2_CellTools.hpp"
+
+#include "ROL_Ptr.hpp"
+
+template <class Real, class DeviceType>
+class PDE_Elasticity : public PDE<Real, DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  // Finite element basis information
+  basis_ptr basisPtr_, basisPtrDens_;
+  std::vector<basis_ptr> basisPtrs_, basisPtrsDens_;
+  // Cell cubature information
+  ROL::Ptr<Intrepid2::Cubature<DeviceType,Real,Real>> cellCub_, bdryCub_;
+  // Cell node information
+  scalar_view volCellNodes_;
+  std::vector<std::vector<scalar_view>> bdryCellNodes_;
+  std::vector<std::vector<std::vector<int>>> bdryCellLocIds_;
+  // Finite element definition
+  ROL::Ptr<fe_type> fe_, feDens_;
+  std::vector<std::vector<ROL::Ptr<fe_type>>> feBdry_;
+  // Local degrees of freedom on boundary, for each side of the reference cell (first index).
+  std::vector<std::vector<int>> fidx_;
+  // Coordinates of degrees freedom on boundary cells.
+  // Indexing:  [sideset number][local side id](cell number, value at dof)
+  std::vector<std::vector<scalar_view>> bdryCellDofValues_;
+  // Field pattern, offsets, etc.
+  std::vector<std::vector<int>> fieldPattern_;   // local Field/DOF pattern; set from DOF manager 
+  int numFields_;                                // number of fields (equations in the PDE)
+  int numDofs_;                                  // total number of degrees of freedom for all (local) fields
+  std::vector<int> offset_;                      // for each field, a counting offset
+  std::vector<int> numFieldDofs_;                // for each field, number of degrees of freedom
+  std::vector<std::vector<int>> fieldPatternDens_;   // local Field/DOF pattern; set from DOF manager 
+  int numFieldsDens_;                                // number of fields (equations in the PDE)
+  int numDofsDens_;                                  // total number of degrees of freedom for all (local) fields
+  std::vector<int> offsetDens_;                      // for each field, a counting offset
+  std::vector<int> numFieldDofsDens_;                // for each field, number of degrees of freedom
+
+  ROL::Ptr<Load<Real,DeviceType>>            load_; 
+  ROL::Ptr<MaterialTensor<Real,DeviceType>>  matTensor_;
+  ROL::Ptr<Dirichlet<Real,DeviceType>>       dirichlet_;
+  ROL::Ptr<Traction<Real,DeviceType>>        traction_;
+  ROL::Ptr<FieldUtils::FieldInfo> fieldInfo_, fieldInfoDens_;
+
+  bool getFields2called_;
+
+public:
+  PDE_Elasticity(ROL::ParameterList &parlist) : getFields2called_(false) {
+    // Finite element fields.
+    int basisOrder     = parlist.sublist("Problem").get("Basis Order",1);
+    int basisOrderDens = parlist.sublist("Problem").get("Density Basis Order",0);
+    int cubDegree      = parlist.sublist("Problem").get("Cubature Degree",4);
+    int bdryCubDegree  = parlist.sublist("Problem").get("Boundary Cubature Degree",2);
+    int probDim        = parlist.sublist("Problem").get("Problem Dimension",2);
+    if (probDim > 3 || probDim < 2) {
+      TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument,
+        ">>> PDE-OPT/poisson/pde_poisson.hpp: Problem dimension is not 2 or 3!");
+    }
+    if (basisOrder > 2 || basisOrder < 1) {
+      TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument,
+        ">>> PDE-OPT/poisson/pde_poisson.hpp: Basis order is not 1 or 2!");
+    }
+    if (probDim == 2) {
+      if (basisOrder == 1)
+        basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C1_FEM<DeviceType,Real,Real>>();
+      else if (basisOrder == 2)
+        basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C2_FEM<DeviceType,Real,Real>>();
+    }
+    else if (probDim == 3) {
+      if (basisOrder == 1)
+        basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_HEX_C1_FEM<DeviceType,Real,Real>>();
+      else if (basisOrder == 2)
+        basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_HEX_C2_FEM<DeviceType,Real,Real>>();
+    }
+    basisPtrs_.clear(); basisPtrs_.resize(probDim,basisPtr_);
+    shards::CellTopology cellType = basisPtr_->getBaseCellTopology(); // get cell type from basis
+    if (basisOrderDens == 1)
+      basisPtrDens_ = ROL::makePtr<Intrepid2::Basis_HVOL_C0_FEM<DeviceType,Real,Real>>(cellType);
+    else {
+      if (probDim == 2)
+        basisPtrDens_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C1_FEM<DeviceType,Real,Real>>();
+      else if (probDim == 3)
+        basisPtrDens_ = ROL::makePtr<Intrepid2::Basis_HGRAD_HEX_C1_FEM<DeviceType,Real,Real>>();
+    }
+    basisPtrsDens_.clear(); basisPtrsDens_.push_back(basisPtrDens_); // Density component
+
+    // Quadrature rules.
+    Intrepid2::DefaultCubatureFactory cubFactory;                            // create cubature factory
+    cellCub_ = cubFactory.create<DeviceType,Real,Real>(cellType, cubDegree); // create default cubature
+
+    shards::CellTopology bdryCellType = cellType.getCellTopologyData(probDim-1, 0);
+    bdryCub_ = cubFactory.create<DeviceType,Real,Real>(bdryCellType, bdryCubDegree);
+
+    matTensor_ = ROL::makePtr<MaterialTensor<Real,DeviceType>>(parlist.sublist("Problem"));
+    std::string example = parlist.sublist("Problem").get("Example","Default");
+    load_    = ROL::makePtr<Load<Real,DeviceType>>(parlist.sublist("Problem"),example);
+    traction_= ROL::makePtr<Traction<Real,DeviceType>>(parlist.sublist("Problem"),example);
+    dirichlet_ = ROL::makePtr<Dirichlet<Real,DeviceType>>(parlist.sublist("Problem"),example);
+
+    numDofs_ = 0;
+    numFields_ = basisPtrs_.size();
+    offset_.resize(numFields_);
+    numFieldDofs_.resize(numFields_);
+    for (int i=0; i<numFields_; ++i) {
+      if (i==0)
+        offset_[i]  = 0;
+      else
+        offset_[i]  = offset_[i-1] + basisPtrs_[i-1]->getCardinality();
+      numFieldDofs_[i] = basisPtrs_[i]->getCardinality();
+      numDofs_ += numFieldDofs_[i];
+    }
+
+    numDofsDens_ = 0;
+    numFieldsDens_ = basisPtrsDens_.size();
+    offsetDens_.resize(numFieldsDens_);
+    numFieldDofsDens_.resize(numFieldsDens_);
+    for (int i=0; i<numFieldsDens_; ++i) {
+      if (i==0)
+        offsetDens_[i]  = 0;
+      else
+        offsetDens_[i]  = offsetDens_[i-1] + basisPtrsDens_[i-1]->getCardinality();
+      numFieldDofsDens_[i] = basisPtrsDens_[i]->getCardinality();
+      numDofsDens_ += numFieldDofsDens_[i];
+    }
+  }
+
+  void residual(scalar_view &res,
+                const scalar_view u_coeff,
+                const scalar_view z_coeff = scalar_view(),
+                const ROL::Ptr<const std::vector<Real>> &z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    int p = fe_->gradN().extent_int(2);
+    int d = fe_->gradN().extent_int(3);
+    int matd = matTensor_->getMatrixDim();
+ 
+    // Initialize residuals.
+    std::vector<scalar_view> R(d);
+    for (int i=0; i<d; ++i) R[i] = scalar_view("res",c,f);
+
+    // Split u_coeff into components.
+    std::vector<scalar_view> U, Z;
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoDens_);
+
+    // Evaluate/interpolate finite element fields on cells.
+    std::vector<scalar_view> gradDisp_eval(d);
+    scalar_view UMat;
+    scalar_view rho("rho", c, p);
+    scalar_view valZ_eval("valZ_eval", c, p);
+    scalar_view rhoUMat("rhoUMat", c, p, matd);
+    for (int i=0; i<d; ++i) {
+      gradDisp_eval[i] =  scalar_view("gradDisp_eval", c, p, d);
+      fe_->evaluateGradient(gradDisp_eval[i], U[i]);
+    }
+    feDens_->evaluateValue(valZ_eval, Z[0]);
+
+    // EVALUATE MATERIAL TENSOR
+    matTensor_->computeUmat(UMat, gradDisp_eval);
+    matTensor_->computeDensity(rho, valZ_eval);
+    fst::scalarMultiplyDataData(rhoUMat, rho, UMat);
+    for (int i=0; i<d; ++i)
+      fst::integrate(R[i],rhoUMat,matTensor_->CBdetJ(i),false);
+
+    // EVALUATE LOAD
+    if (!load_->isNull()) {
+      std::vector<scalar_view> load(d);
+      for (int i=0; i<d; ++i)
+        load[i] = scalar_view("load", c, p);
+      load_->compute(load, fe_, PDE<Real,DeviceType>::getParameter(), static_cast<Real>(-1));
+      for (int i=0; i<d; ++i)
+        fst::integrate(R[i],load[i],fe_->NdetJ(),true);
+    }
+
+    // APPLY TRACTION CONDITIONS
+    if (!traction_->isNull()) {
+      traction_->apply(R, feBdry_, PDE<Real,DeviceType>::getParameter(), static_cast<Real>(-1));
+    }
+
+    // APPLY DIRICHLET CONDITIONS
+    dirichlet_->applyResidual(R,U);
+
+    // Combine the residuals.
+    FieldUtils::combineFieldCoeff<Real>(res, R, fieldInfo_);
+  }
+
+  void Jacobian_1(scalar_view &jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> &z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    int p = fe_->gradN().extent_int(2);
+    int d = fe_->gradN().extent_int(3);
+    int matd = matTensor_->getMatrixDim();
+ 
+    // Initialize Jacobians.
+    std::vector<std::vector<scalar_view>> J(d);
+    for (int i=0; i<d; ++i) {
+      for (int j=0; j<d; ++j)
+        J[i].push_back(scalar_view("jac1",c,f,f));
+    }
+
+    // Split z_coeff into components.
+    std::vector<scalar_view> Z;
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoDens_);
+
+    std::vector<scalar_view> rhoBMat(d);
+    scalar_view rho("rho", c, p);
+    scalar_view valZ_eval("valZ_eval", c, p);
+
+    feDens_->evaluateValue(valZ_eval, Z[0]);
+    matTensor_->computeDensity(rho, valZ_eval);
+    for (int i=0; i<d; ++i) {
+      rhoBMat[i] =  scalar_view("rhoBMat", c, f, p, matd);
+      fst::scalarMultiplyDataField(rhoBMat[i], rho, matTensor_->B(i));
+    }
+
+    /*** Evaluate weak form of the Jacobian. ***/
+    for (int i=0; i<d; ++i) {
+      for (int j=0; j<d; ++j)
+        fst::integrate(J[i][j],rhoBMat[i],matTensor_->CBdetJ(j),false);
+    }
+
+    // APPLY DIRICHLET CONDITIONS
+    dirichlet_->applyJacobian1(J);
+
+    // Combine the jacobians.
+    FieldUtils::combineFieldCoeff<Real>(jac, J, fieldInfo_, fieldInfo_);
+  }
+
+  void Jacobian_2(scalar_view &jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> &z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    int c  = fe_->gradN().extent_int(0);
+    int f  = fe_->gradN().extent_int(1);
+    int fd = feDens_->gradN().extent_int(1);
+    int p  = fe_->gradN().extent_int(2);
+    int d  = fe_->gradN().extent_int(3);
+    int matd = matTensor_->getMatrixDim();
+
+    // Initialize Jacobians.
+    std::vector<std::vector<scalar_view>> J(d);
+    for (int i=0; i<d; ++i)
+      J[i].push_back(scalar_view("jac1",c,f,fd));
+
+    // Split u_coeff into components.
+    std::vector<scalar_view> U, Z;
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoDens_);
+
+    // Evaluate/interpolate finite element fields on cells.
+    std::vector<scalar_view> gradDisp_eval(d), CBrhoUMat(d);
+    scalar_view UMat;
+    scalar_view rho("rho", c, p);
+    scalar_view valZ_eval("valZ_eval", c, p);
+    scalar_view rhoUMat("rhoUMat", c, p, matd);
+    for (int i=0; i<d; ++i) {
+      CBrhoUMat[i]     = scalar_view("CBrhoUMat", c, f, p);
+      gradDisp_eval[i] = scalar_view("gradDisp_eval", c, p, d);
+      fe_->evaluateGradient(gradDisp_eval[i], U[i]);
+    }
+    feDens_->evaluateValue(valZ_eval, Z[0]);
+    matTensor_->computeUmat(UMat, gradDisp_eval);
+    matTensor_->computeDensity(rho, valZ_eval, 1);  // first derivative
+    fst::scalarMultiplyDataData(rhoUMat, rho, UMat);
+
+    for (int i=0; i<d; ++i)
+      fst::dotMultiplyDataField(CBrhoUMat[i], rhoUMat, matTensor_->CBdetJ(i));
+
+    /*** Evaluate weak form of the residual. ***/
+    for (int i=0; i<d; ++i)
+      fst::integrate(J[i][0],CBrhoUMat[i],feDens_->N(),false);
+
+    // APPLY DIRICHLET CONDITIONS
+    dirichlet_->applyJacobian2(J);
+
+    // Combine the jacobians.
+    FieldUtils::combineFieldCoeff<Real>(jac, J, fieldInfo_, fieldInfoDens_);
+  }
+
+  void Hessian_11(scalar_view &hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> &z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_TopoOpt::Hessian_11): Hessian is zero.");
+  }
+
+  void Hessian_12(scalar_view &hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> &z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    int c  = fe_->gradN().extent_int(0);
+    int f  = fe_->gradN().extent_int(1);
+    int fd = feDens_->gradN().extent_int(1);
+    int p  = fe_->gradN().extent_int(2);
+    int d  = fe_->gradN().extent_int(3);
+    int matd = matTensor_->getMatrixDim();
+
+    // Initialize Hessians.
+    std::vector<std::vector<scalar_view>> J(1);
+    for (int i=0; i<d; ++i)
+      J[0].push_back(scalar_view("hes12",c,fd,f));
+
+    // Split u_coeff into components.
+    std::vector<scalar_view> L, Z;
+    FieldUtils::splitFieldCoeff<Real>(L, l_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoDens_);
+
+    // Apply Dirichlet conditions to the multipliers.
+    dirichlet_->applyMultiplier(L);
+
+    // Evaluate/interpolate finite element fields on cells.
+    std::vector<scalar_view> gradDisp_eval(d), CBrhoLMat(d);
+    scalar_view LMat;
+    scalar_view rho("rho", c, p);
+    scalar_view valZ_eval("valZ_eval", c, p);
+    scalar_view rhoLMat("rhoLMat", c, p, matd);
+    for (int i=0; i<d; ++i) {
+      CBrhoLMat[i]     = scalar_view("CBrhoLMat", c, f, p);
+      gradDisp_eval[i] = scalar_view("gradDisp_eval", c, p, d);
+      fe_->evaluateGradient(gradDisp_eval[i], L[i]);
+    }
+    feDens_->evaluateValue(valZ_eval, Z[0]);
+    matTensor_->computeUmat(LMat, gradDisp_eval);
+    matTensor_->computeDensity(rho, valZ_eval, 1);  // first derivative
+    fst::scalarMultiplyDataData(rhoLMat, rho, LMat);
+
+    for (int i=0; i<d; ++i)
+      fst::dotMultiplyDataField(CBrhoLMat[i], rhoLMat, matTensor_->CBdetJ(i));
+
+    /*** Evaluate weak form of the residual. ***/
+    for (int i=0; i<d; ++i)
+      fst::integrate(J[0][i],feDens_->N(),CBrhoLMat[i],false);
+
+    // Combine the Hessians.
+    FieldUtils::combineFieldCoeff<Real>(hess, J, fieldInfoDens_, fieldInfo_);
+
+  }
+
+  void Hessian_21(scalar_view &hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> &z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    int c  = fe_->gradN().extent_int(0);
+    int f  = fe_->gradN().extent_int(1);
+    int fd = feDens_->gradN().extent_int(1);
+    int p  = fe_->gradN().extent_int(2);
+    int d  = fe_->gradN().extent_int(3);
+    int matd = matTensor_->getMatrixDim();
+
+    // Initialize Hessians.
+    std::vector<std::vector<scalar_view>> J(d);
+    for (int i=0; i<d; ++i)
+      J[i].push_back(scalar_view("hes21",c,f,fd));
+
+    // Split u_coeff into components.
+    std::vector<scalar_view> L, Z;
+    FieldUtils::splitFieldCoeff<Real>(L, l_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoDens_);
+
+    // Apply Dirichlet conditions to the multipliers.
+    dirichlet_->applyMultiplier(L);
+
+    // Evaluate/interpolate finite element fields on cells.
+    std::vector<scalar_view> gradDisp_eval(d), CBrhoLMat(d);
+    scalar_view LMat;
+    scalar_view rho("rho", c, p);
+    scalar_view valZ_eval("valZ_eval", c, p);
+    scalar_view rhoLMat("rhoLMat", c, p, matd);
+    for (int i=0; i<d; ++i) {
+      CBrhoLMat[i]     = scalar_view("CBrhoLMat", c, f, p);
+      gradDisp_eval[i] = scalar_view("gradDisp_eval", c, p, d);
+      fe_->evaluateGradient(gradDisp_eval[i], L[i]);
+    }
+    feDens_->evaluateValue(valZ_eval, Z[0]);
+    matTensor_->computeUmat(LMat, gradDisp_eval);
+    matTensor_->computeDensity(rho, valZ_eval, 1);  // first derivative
+    fst::scalarMultiplyDataData(rhoLMat, rho, LMat);
+
+    for (int i=0; i<d; ++i)
+      fst::dotMultiplyDataField(CBrhoLMat[i], rhoLMat, matTensor_->CBdetJ(i));
+
+    /*** Evaluate weak form of the residual. ***/
+    for (int i=0; i<d; ++i)
+      fst::integrate(J[i][0],CBrhoLMat[i],feDens_->N(),false);
+
+    // Combine the Hessians.
+    FieldUtils::combineFieldCoeff(hess, J, fieldInfo_, fieldInfoDens_);
+  }
+
+  void Hessian_22(scalar_view &hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> &z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    int c  = fe_->gradN().extent_int(0);
+    int fd = feDens_->gradN().extent_int(1);
+    int p  = fe_->gradN().extent_int(2);
+    int d  = fe_->gradN().extent_int(3);
+    int matd = matTensor_->getMatrixDim();
+
+    // Initialize Hessians.
+    std::vector<std::vector<scalar_view>> J(1);
+    J[0].push_back(scalar_view("hes22",c,fd,fd));
+
+    // Split u_coeff into components.
+    std::vector<scalar_view> U, L, Z;
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(L, l_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoDens_);
+
+    // Apply Dirichlet conditions to the multipliers.
+    dirichlet_->applyMultiplier(L);
+
+    // Evaluate/interpolate finite element fields on cells.
+    std::vector<scalar_view> gradDispU_eval(d), gradDispL_eval(d);
+    scalar_view UMat, LMat;
+    scalar_view rho("rho", c, p);
+    scalar_view valZ_eval("valZ_eval", c, p);
+    scalar_view rhoLMat("rhoLMat", c, p, matd);
+    scalar_view CUMat("CUMat", c, p, matd);
+    scalar_view CUrhoLMat("CUrhoLMat", c, p);
+    scalar_view NCUrhoLMat("NCUrhoLMat", c, fd, p);
+    for (int i=0; i<d; ++i) {
+      gradDispU_eval[i] = scalar_view("gradDispU_eval", c, p, d);
+      fe_->evaluateGradient(gradDispU_eval[i], U[i]);
+      gradDispL_eval[i] = scalar_view("gradDispL_eval", c, p, d);
+      fe_->evaluateGradient(gradDispL_eval[i], L[i]);
+    }
+    feDens_->evaluateValue(valZ_eval, Z[0]);
+    matTensor_->computeUmat(UMat, gradDispU_eval);
+    matTensor_->computeUmat(LMat, gradDispL_eval);
+    matTensor_->computeDensity(rho, valZ_eval, 2);  // second derivative
+    fst::scalarMultiplyDataData(rhoLMat, rho, LMat);
+    matTensor_->applyTensor(CUMat, UMat);
+
+    fst::dotMultiplyDataData(CUrhoLMat, rhoLMat, CUMat);
+    fst::scalarMultiplyDataField(NCUrhoLMat, CUrhoLMat, feDens_->N());
+
+    /*** Evaluate weak form of the residual. ***/
+    fst::integrate(J[0][0],NCUrhoLMat,feDens_->NdetJ(),false);
+
+    // Combine the Hessians.
+    FieldUtils::combineFieldCoeff(hess, J, fieldInfoDens_, fieldInfoDens_);
+  }
+
+  void RieszMap_1(scalar_view & riesz) override {
+    // Retrieve dimensions.
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    int d = fe_->gradN().extent_int(3);
+ 
+    // Initialize Jacobians.
+    std::vector<std::vector<scalar_view>> J(d);
+    for (int i=0; i<d; ++i) {
+      for (int j=0; j<d; ++j)
+        J[i].push_back(scalar_view("riesz1",c,f,f));
+    }
+
+    for (int i=0; i<d; ++i) {
+      Kokkos::deep_copy(J[i][i],fe_->stiffMat());
+      rst::add(J[i][i],fe_->massMat());
+    }
+
+    // Combine the jacobians.
+    FieldUtils::combineFieldCoeff<Real>(riesz, J, fieldInfo_, fieldInfo_);
+  }
+
+  void RieszMap_2(scalar_view & riesz) override {
+    // Retrieve dimensions.
+    int c = feDens_->gradN().extent_int(0);
+    int f = feDens_->gradN().extent_int(1);
+
+    riesz = scalar_view("riesz2",c,f,f);
+    Kokkos::deep_copy(riesz,feDens_->massMat());
+  }
+
+  // This must be called before getFields2
+  void setDensityFields(const std::vector<basis_ptr> &basisPtrs) {
+    if (getFields2called_) {
+      TEUCHOS_TEST_FOR_EXCEPTION(getFields2called_, std::invalid_argument,
+        ">>> PDE-OPT/topo-opt/elasticity/src/pde_elasticity.hpp: Must call before getFields2!");
+    }
+    else {
+      basisPtrDens_  = basisPtrs[0];
+      basisPtrsDens_ = basisPtrs;
+
+      numDofsDens_ = 0;
+      numFieldsDens_ = basisPtrsDens_.size();
+      offsetDens_.resize(numFieldsDens_);
+      numFieldDofsDens_.resize(numFieldsDens_);
+      for (int i=0; i<numFieldsDens_; ++i) {
+        if (i==0)
+          offsetDens_[i]  = 0;
+        else
+          offsetDens_[i]  = offsetDens_[i-1] + basisPtrsDens_[i-1]->getCardinality();
+        numFieldDofsDens_[i] = basisPtrsDens_[i]->getCardinality();
+        numDofsDens_ += numFieldDofsDens_[i];
+      }
+    }
+  }
+
+  std::vector<basis_ptr> getFields() override {
+    return basisPtrs_;
+  }
+
+  std::vector<basis_ptr> getFields2() override {
+    getFields2called_ = true;
+    return basisPtrsDens_;
+  }
+
+  void setCellNodes(const scalar_view &volCellNodes,
+                    const std::vector<std::vector<scalar_view>> &bdryCellNodes,
+                    const std::vector<std::vector<std::vector<int>>> &bdryCellLocIds) override {
+    volCellNodes_ = volCellNodes;
+    bdryCellNodes_ = bdryCellNodes;
+    bdryCellLocIds_ = bdryCellLocIds;
+    // Finite element definition.
+    fe_ = ROL::makePtr<fe_type>(volCellNodes_,basisPtr_,cellCub_);
+    feDens_ = ROL::makePtr<fe_type>(volCellNodes_,basisPtrDens_,cellCub_,false);
+    fidx_ = fe_->getBoundaryDofs();
+    if (!traction_->isNull()) {
+      traction_->setCellNodes(bdryCellNodes_,bdryCellLocIds_);
+    }
+    dirichlet_->setCellNodes(bdryCellNodes_,bdryCellLocIds_,fidx_);
+    matTensor_->setFE(fe_);
+    // Construct boundary FE
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      feBdry_.resize(numSideSets);
+      for (int i = 0; i < numSideSets; ++i) {
+        int numLocSides = bdryCellNodes[i].size();
+        feBdry_[i].resize(numLocSides);
+        for (int j = 0; j < numLocSides; ++j) {
+          if (bdryCellNodes[i][j] != scalar_view()) {
+            feBdry_[i][j] = ROL::makePtr<fe_type>(bdryCellNodes[i][j],basisPtr_,bdryCub_,j);
+          }
+        }
+      }
+    }
+  }
+
+  void setFieldPattern(const std::vector<std::vector<int>> &fieldPattern,
+                       const std::vector<std::vector<int>> &fieldPattern2) override {
+    fieldPattern_     = fieldPattern;
+    fieldPatternDens_ = fieldPattern2;
+    fieldInfo_     = ROL::makePtr<FieldUtils::FieldInfo>(numFields_, numDofs_, numFieldDofs_, fieldPattern_);
+    fieldInfoDens_ = ROL::makePtr<FieldUtils::FieldInfo>(numFieldsDens_, numDofsDens_, numFieldDofsDens_, fieldPatternDens_);
+  }
+
+  const ROL::Ptr<fe_type> getStateFE(void) const {
+    return fe_;
+  }
+
+  const ROL::Ptr<fe_type> getDensityFE(void) const {
+    return feDens_;
+  }
+
+  const std::vector<std::vector<ROL::Ptr<fe_type>>> getBdryFE(void) const {
+    return feBdry_;
+  }
+
+  const std::vector<std::vector<std::vector<int>>> getBdryCellLocIds(void) const {
+    return bdryCellLocIds_;
+  }
+
+  const ROL::Ptr<FieldUtils::FieldInfo> getStateFieldInfo(void) const {
+    return fieldInfo_;
+  }
+
+  const ROL::Ptr<FieldUtils::FieldInfo> getDensityFieldInfo(void) const {
+    return fieldInfoDens_;
+  }
+
+  const ROL::Ptr<Load<Real,DeviceType>> getLoad(void) const {
+    return load_;
+  }
+
+  const ROL::Ptr<Traction<Real,DeviceType>> getTraction(void) const {
+    return traction_;
+  }
+
+  const ROL::Ptr<MaterialTensor<Real,DeviceType>> getMaterialTensor(void) const {
+    return matTensor_;
+  }
+
+}; // PDE_TopoOpt
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/topo-opt/elasticity/src/sampler.hpp b/packages/rol/example/PDE-OPT/topo-opt/elasticity/src/sampler.hpp
index 346eeba8576a..02f9a10424fc 100644
--- a/packages/rol/example/PDE-OPT/topo-opt/elasticity/src/sampler.hpp
+++ b/packages/rol/example/PDE-OPT/topo-opt/elasticity/src/sampler.hpp
@@ -27,17 +27,17 @@ class BuildSampler {
   std::vector<ROL::Ptr<ROL::Distribution<Real>>> distVec_;
 
 public:
-  BuildSampler(Teuchos::ParameterList &parlist, int probDim, std::string ex = "Default") : stochDim_(0) {
+  BuildSampler(ROL::ParameterList &parlist, int probDim, std::string ex = "Default") : stochDim_(0) {
     if (ex == "2D Wheel") {
       stochDim_ = probDim;
       // Magnitude distribution.
-      Teuchos::ParameterList magList;
+      ROL::ParameterList magList;
       magList.sublist("Distribution").set("Name","Uniform");
       magList.sublist("Distribution").sublist("Uniform").set("Lower Bound",-0.5);
       magList.sublist("Distribution").sublist("Uniform").set("Upper Bound", 0.5);
       distVec_.push_back(ROL::DistributionFactory<Real>(magList));
       // Polar angle distribution.
-      Teuchos::ParameterList polList;
+      ROL::ParameterList polList;
       polList.sublist("Distribution").set("Name","Truncated Gaussian");
       polList.sublist("Distribution").sublist("Truncated Gaussian").set("Mean",                0.0);
       polList.sublist("Distribution").sublist("Truncated Gaussian").set("Standard Deviation", 17.0);
@@ -48,19 +48,19 @@ class BuildSampler {
     else if (ex == "2D Truss" || ex == "3D Cantilever") {
       stochDim_ = probDim;
       // Magnitude distribution.
-      Teuchos::ParameterList magList;
+      ROL::ParameterList magList;
       magList.sublist("Distribution").set("Name","Uniform");
       magList.sublist("Distribution").sublist("Uniform").set("Lower Bound",-0.5);
       magList.sublist("Distribution").sublist("Uniform").set("Upper Bound", 0.5);
       distVec_.push_back(ROL::DistributionFactory<Real>(magList));
       // Polar angle distribution.
-      Teuchos::ParameterList polList;
+      ROL::ParameterList polList;
       polList.sublist("Distribution").set("Name","Uniform");
       polList.sublist("Distribution").sublist("Uniform").set("Lower Bound",-45.0);
       polList.sublist("Distribution").sublist("Uniform").set("Upper Bound", 45.0);
       distVec_.push_back(ROL::DistributionFactory<Real>(polList));
       if (probDim == 3) {
-        Teuchos::ParameterList aziList;
+        ROL::ParameterList aziList;
         aziList.sublist("Distribution").set("Name","Uniform");
         aziList.sublist("Distribution").sublist("Uniform").set("Lower Bound",-45.0);
         aziList.sublist("Distribution").sublist("Uniform").set("Upper Bound", 45.0);
@@ -70,25 +70,25 @@ class BuildSampler {
     else if (ex == "2D Beams") {
       stochDim_ = 2*probDim;
       // Load 1: Magnitude distribution.
-      Teuchos::ParameterList magList1;
+      ROL::ParameterList magList1;
       magList1.sublist("Distribution").set("Name","Uniform");
       magList1.sublist("Distribution").sublist("Uniform").set("Lower Bound",-0.25);
       magList1.sublist("Distribution").sublist("Uniform").set("Upper Bound", 0.25);
       distVec_.push_back(ROL::DistributionFactory<Real>(magList1));
       // Load 1: Polar angle distribution.
-      Teuchos::ParameterList polList1;
+      ROL::ParameterList polList1;
       polList1.sublist("Distribution").set("Name","Uniform");
       polList1.sublist("Distribution").sublist("Uniform").set("Lower Bound",-25.0);
       polList1.sublist("Distribution").sublist("Uniform").set("Upper Bound", 25.0);
       distVec_.push_back(ROL::DistributionFactory<Real>(polList1));
       // Load 2: Magnitude distribution.
-      Teuchos::ParameterList magList2;
+      ROL::ParameterList magList2;
       magList2.sublist("Distribution").set("Name","Uniform");
       magList2.sublist("Distribution").sublist("Uniform").set("Lower Bound",-0.75);
       magList2.sublist("Distribution").sublist("Uniform").set("Upper Bound", 0.75);
       distVec_.push_back(ROL::DistributionFactory<Real>(magList2));
       // Load 2: Polar angle distribution.
-      Teuchos::ParameterList polList2;
+      ROL::ParameterList polList2;
       polList2.sublist("Distribution").set("Name","Uniform");
       polList2.sublist("Distribution").sublist("Uniform").set("Lower Bound",-45.0);
       polList2.sublist("Distribution").sublist("Uniform").set("Upper Bound", 45.0);
@@ -104,16 +104,16 @@ class BuildSampler {
           std::stringstream sli;
           sli << "Stochastic Load " << i;
           // Magnitude distribution.
-          Teuchos::ParameterList magList;
+          ROL::ParameterList magList;
           magList.sublist("Distribution") = parlist.sublist(sli.str()).sublist("Magnitude");
           distVec_.push_back(ROL::DistributionFactory<Real>(magList));
           // Polar angle distribution.
-          Teuchos::ParameterList polList;
+          ROL::ParameterList polList;
           polList.sublist("Distribution") = parlist.sublist(sli.str()).sublist("Polar Angle");
           distVec_.push_back(ROL::DistributionFactory<Real>(polList));
           // Azimuth angle distribution.
           if (probDim == 3) {
-            Teuchos::ParameterList aziList;
+            ROL::ParameterList aziList;
             aziList.sublist("Distribution") = parlist.sublist(sli.str()).sublist("Azimuth Angle");
             distVec_.push_back(ROL::DistributionFactory<Real>(aziList));
           }
@@ -128,16 +128,16 @@ class BuildSampler {
           std::stringstream sli;
           sli << "Stochastic Traction " << i;
           // Magnitude distribution.
-          Teuchos::ParameterList magList;
+          ROL::ParameterList magList;
           magList.sublist("Distribution") = parlist.sublist(sli.str()).sublist("Magnitude");
           distVec_.push_back(ROL::DistributionFactory<Real>(magList));
           // Polar angle distribution.
-          Teuchos::ParameterList polList;
+          ROL::ParameterList polList;
           polList.sublist("Distribution") = parlist.sublist(sli.str()).sublist("Polar Angle");
           distVec_.push_back(ROL::DistributionFactory<Real>(polList));
           // Azimuth angle distribution.
           if (probDim == 3) {
-            Teuchos::ParameterList aziList;
+            ROL::ParameterList aziList;
             aziList.sublist("Distribution") = parlist.sublist(sli.str()).sublist("Azimuth Angle");
             distVec_.push_back(ROL::DistributionFactory<Real>(aziList));
           }
diff --git a/packages/rol/example/PDE-OPT/topo-opt/elasticity/src/tractionK.hpp b/packages/rol/example/PDE-OPT/topo-opt/elasticity/src/tractionK.hpp
new file mode 100644
index 000000000000..292a827e66c7
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/topo-opt/elasticity/src/tractionK.hpp
@@ -0,0 +1,293 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  traction.hpp
+    \brief Implements traction loads for the structural topology
+           optimization problem.
+*/
+
+#ifndef ROL_PDEOPT_ELASTICITY_TRACTIONK_HPP
+#define ROL_PDEOPT_ELASTICITY_TRACTIONK_HPP
+
+#include "ROL_Ptr.hpp"
+#include "ROL_ParameterList.hpp"
+
+#include "../../../TOOLS/feK.hpp"
+
+#include <vector>
+
+template<class Real, class DeviceType>
+class Traction {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+
+private:
+  std::vector<int> sidesets_, sideids_;
+  std::vector<Real> loadMagnitude_;
+  std::vector<Real> loadPolar_, loadAzimuth_;
+  std::vector<std::vector<scalar_view>> bdryCellNodes_;
+  std::vector<std::vector<std::vector<int>>> bdryCellLocIds_;
+  int dim_, offset_;
+
+protected:
+  Real DegreesToRadians(const Real deg) const {
+    return deg * static_cast<Real>(M_PI) / static_cast<Real>(180);
+  }
+
+public:
+  virtual ~Traction() {}
+
+  Traction(void) {};
+
+  Traction(ROL::ParameterList & parlist, std::string ex = "Default") {
+    if (ex == "2D Wheel") {
+      dim_ = 2;
+      offset_ = 0;
+      sidesets_.push_back(2);
+      sideids_.push_back(0);
+      loadMagnitude_.push_back(static_cast<Real>(1));
+      loadPolar_.push_back(static_cast<Real>(270));
+      loadAzimuth_.push_back(static_cast<Real>(0));
+    }
+    else if (ex == "2D Cantilever with 1 Load" ||
+             ex == "2D Truss") {
+      dim_ = 2;
+      offset_ = 2;
+      sidesets_.clear();
+      sideids_.clear();
+      loadMagnitude_.clear();
+      loadPolar_.clear();
+      loadAzimuth_.clear();
+    }
+    else if (ex == "2D Cantilever with 3 Loads") {
+      dim_ = 2;
+      offset_ = 0;
+      sidesets_.push_back(1);
+      sidesets_.push_back(3);
+      sidesets_.push_back(5);
+      sideids_.push_back(0);
+      sideids_.push_back(0);
+      sideids_.push_back(0);
+      loadMagnitude_.push_back(static_cast<Real>(1));
+      loadMagnitude_.push_back(static_cast<Real>(1));
+      loadMagnitude_.push_back(static_cast<Real>(1));
+      loadPolar_.push_back(static_cast<Real>(270));
+      loadPolar_.push_back(static_cast<Real>(270));
+      loadPolar_.push_back(static_cast<Real>(270));
+      loadAzimuth_.push_back(static_cast<Real>(0));
+      loadAzimuth_.push_back(static_cast<Real>(0));
+      loadAzimuth_.push_back(static_cast<Real>(0));
+    }
+    else if (ex == "2D Beams") {
+      dim_ = 2;
+      offset_ = 4;
+      sidesets_.clear();
+      sideids_.clear();
+      loadMagnitude_.clear();
+      loadPolar_.clear();
+      loadAzimuth_.clear();
+    }
+    else if (ex == "2D Carrier Plate") {
+      dim_ = 2;
+      offset_ = 0;
+      sidesets_.push_back(2);
+      sidesets_.push_back(4);
+      sideids_.push_back(2);
+      sideids_.push_back(2);
+      loadMagnitude_.push_back(static_cast<Real>(1.0));
+      loadMagnitude_.push_back(static_cast<Real>(0.5));
+      loadPolar_.push_back(static_cast<Real>(270));
+      loadPolar_.push_back(static_cast<Real>(270));
+      loadAzimuth_.push_back(static_cast<Real>(0));
+      loadAzimuth_.push_back(static_cast<Real>(0));
+    }
+    else if (ex == "3D Cantilever") {
+      dim_ = 3;
+      offset_ = 3;
+      sidesets_.clear();
+      sideids_.clear();
+      loadMagnitude_.clear();
+      loadPolar_.clear();
+      loadAzimuth_.clear();
+    }
+    else if (ex == "3D L Beam") {
+      dim_ = 3;
+      offset_ = 0;
+      sidesets_.push_back(1);
+      sideids_.push_back(1);
+      loadMagnitude_.push_back(static_cast<Real>(100));
+      loadPolar_.push_back(static_cast<Real>(270));
+      loadAzimuth_.push_back(static_cast<Real>(90));
+    }
+    else {
+      if (parlist.isSublist("Traction")) {
+        // Grab problem dimension
+        dim_ = parlist.get("Problem Dimension",2);
+
+        offset_ = 0;
+        if (parlist.isSublist("Load")) {
+          std::vector<Real> loadMag
+            = ROL::getArrayFromStringParameter<double>(parlist.sublist("Load"), "Magnitude");
+          offset_ = dim_*static_cast<int>(loadMag.size());
+        }
+
+        // Grab Sidesets
+        sidesets_ = ROL::getArrayFromStringParameter<int>(parlist.sublist("Traction"), "Sidesets");
+
+        // Grab Side IDs
+        sideids_ = ROL::getArrayFromStringParameter<int>(parlist.sublist("Traction"), "Side IDs");
+
+        // Grab Magnitudes
+        loadMagnitude_ = ROL::getArrayFromStringParameter<double>(parlist.sublist("Traction"), "Magnitude");
+
+        // Grab Polar Angle
+        loadPolar_ = ROL::getArrayFromStringParameter<double>(parlist.sublist("Traction"), "Polar Angle");
+
+        if (dim_ == 3) {
+          // Grab Azimuth Angle
+          loadAzimuth_ = ROL::getArrayFromStringParameter<double>(parlist.sublist("Traction"), "Azimuth Angle");
+        }
+      }
+    }
+
+    int polarSize = loadPolar_.size();
+    for (int i=0; i<polarSize; ++i) {
+      loadPolar_[i] = DegreesToRadians(loadPolar_[i]);
+    }
+
+    int azimuthSize = loadAzimuth_.size();
+    for (int i=0; i<azimuthSize; ++i) {
+      loadAzimuth_[i] = DegreesToRadians(loadAzimuth_[i]);
+    }
+  }
+
+  bool isNull(void) const {
+    return (sidesets_.size()==0);
+  }
+
+  void setCellNodes(const std::vector<std::vector<scalar_view>> &bdryCellNodes,
+                    const std::vector<std::vector<std::vector<int>>> &bdryCellLocIds) {
+    bdryCellNodes_  = bdryCellNodes;
+    bdryCellLocIds_ = bdryCellLocIds;
+  }
+
+  virtual Real evaluate(const int                dir,
+                        const int                sideset,
+                        const std::vector<Real> &param) const {
+//    const int numSideSets = sidesets_.size();
+    const int paramSize   = param.size();
+    Real val(0);
+    Real loadMagNoise(0), loadAngNoise0(0);
+    if (paramSize > 0 + dim_*sideset) {
+      loadMagNoise  = param[offset_ + 0 + dim_*sideset];
+    }
+    if (paramSize > 1 + dim_*sideset) {
+      loadAngNoise0 = DegreesToRadians(param[offset_ + 1 + dim_*sideset]);
+    }
+    const Real loadMagnitude = loadMagnitude_[sideset] + loadMagNoise;
+    const Real loadAngle0    = loadPolar_[sideset]     + loadAngNoise0;
+
+    if (dim_==2) {
+      if (dir==0) {
+        val = loadMagnitude*std::cos(loadAngle0);
+      }
+      if (dir==1) {
+        val = loadMagnitude*std::sin(loadAngle0);
+      }
+    }
+    if (dim_==3) {
+      Real loadAngNoise1(0);
+      if (paramSize > 2 + dim_*sideset) {
+        loadAngNoise1 = DegreesToRadians(param[offset_ + 2 + dim_*sideset]);
+      }
+      const Real loadAngle1 = loadAzimuth_[sideset] + loadAngNoise1;
+
+      if (dir==0) {
+        val = loadMagnitude*std::sin(loadAngle0)*std::cos(loadAngle1);
+      }
+      if (dir==1) {
+        val = loadMagnitude*std::sin(loadAngle0)*std::sin(loadAngle1);
+      }
+      if (dir==2) {
+        val = loadMagnitude*std::cos(loadAngle0);
+      }
+    }
+    return val;
+  }
+
+  void compute(std::vector<std::vector<std::vector<scalar_view>>> &traction,
+               const std::vector<std::vector<ROL::Ptr<FE<Real,DeviceType>>>> &fe,
+               const std::vector<Real> &param,
+               const Real scale = Real(1)) const {
+    traction.clear();
+    traction.resize(bdryCellLocIds_.size());
+    const int numSideSets = sidesets_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        const int numLocalSideIds = bdryCellLocIds_[sidesets_[i]].size();
+        traction[sidesets_[i]].resize(numLocalSideIds);
+        for (int j = 0; j < numLocalSideIds; ++j) {
+          const int numCellsSide = bdryCellLocIds_[sidesets_[i]][j].size();
+          if (numCellsSide > 0) {
+            const int numCubPerSide = fe[sidesets_[i]][j]->cubPts().extent_int(1);
+            traction[sidesets_[i]][j].resize(dim_);
+            for (int k = 0; k < dim_; ++k) {
+              traction[sidesets_[i]][j][k]
+                = scalar_view("traction", numCellsSide, numCubPerSide);
+              for (int c = 0; c < numCellsSide; ++c) {
+                for (int p = 0; p < numCubPerSide; ++p) {
+                  (traction[sidesets_[i]][j][k])(c,p) = scale*evaluate(k,i,param);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  void apply(std::vector<scalar_view> &R,
+             const std::vector<std::vector<ROL::Ptr<FE<Real,DeviceType>>>> &fe,
+             const std::vector<Real> &param,
+             const Real scale = Real(1)) const {
+    const int numSideSets = sidesets_.size();
+    const int nf          = R[0].extent_int(1);
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        const int numLocalSideIds = bdryCellLocIds_[sidesets_[i]].size();
+        for (int j = 0; j < numLocalSideIds; ++j) {
+          const int numCellsSide = bdryCellLocIds_[sidesets_[i]][j].size();
+          if (numCellsSide > 0) {
+            const int numCubPerSide = fe[sidesets_[i]][j]->cubPts().extent_int(1);
+            for (int k = 0; k < dim_; ++k) {
+              scalar_view traction("traction", numCellsSide, numCubPerSide);
+              for (int c = 0; c < numCellsSide; ++c) {
+                for (int p = 0; p < numCubPerSide; ++p) {
+                  traction(c,p) = scale*evaluate(k,i,param);
+                }
+              }
+              scalar_view tractionResidual("tractionResidual", numCellsSide, nf);
+              fst::integrate(tractionResidual,traction,fe[sidesets_[i]][j]->NdetJ(),false);
+              // Add traction residual to volume residual
+              for (int l = 0; l < numCellsSide; ++l) {
+                int cidx = bdryCellLocIds_[sidesets_[i]][j][l];
+                for (int m = 0; m < nf; ++m) {
+                  (R[k])(cidx,m) += tractionResidual(l,m);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/topo-opt/multimat/CMakeLists.txt b/packages/rol/example/PDE-OPT/topo-opt/multimat/CMakeLists.txt
index c2fe96d549c1..fa1b402f9575 100644
--- a/packages/rol/example/PDE-OPT/topo-opt/multimat/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/topo-opt/multimat/CMakeLists.txt
@@ -11,13 +11,13 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_01
     SOURCES example_01.cpp
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     MultMatTopoOptDataCopy
     SOURCE_FILES
       input_ex01.xml
diff --git a/packages/rol/example/PDE-OPT/topo-opt/multimat/example_01.cpp b/packages/rol/example/PDE-OPT/topo-opt/multimat/example_01.cpp
index 998ab4c58ac5..ab0e566fa8ab 100644
--- a/packages/rol/example/PDE-OPT/topo-opt/multimat/example_01.cpp
+++ b/packages/rol/example/PDE-OPT/topo-opt/multimat/example_01.cpp
@@ -14,8 +14,7 @@
 
 #include "Teuchos_Comm.hpp"
 #include "Teuchos_Time.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -37,7 +36,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int>> comm
     = Tpetra::getDefaultComm();
   const int myRank = comm->getRank();
diff --git a/packages/rol/example/PDE-OPT/topo-opt/navier-stokes/CMakeLists.txt b/packages/rol/example/PDE-OPT/topo-opt/navier-stokes/CMakeLists.txt
index 5465ab589ba8..546ff5e70219 100644
--- a/packages/rol/example/PDE-OPT/topo-opt/navier-stokes/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/topo-opt/navier-stokes/CMakeLists.txt
@@ -11,13 +11,13 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
 
-  TRIBITS_ADD_EXECUTABLE(
+  ROL_ADD_EXECUTABLE(
     example_01
     SOURCES example_01.cpp
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     TopOptNavierStokesDataCopy
     SOURCE_FILES
       input.xml plotresults.m mesh.jou
@@ -25,5 +25,33 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
   )
 
+ENDIF()
+
+IF(${PROJECT_NAME}_ENABLE_Intrepid2 AND
+   ${PROJECT_NAME}_ENABLE_Ifpack2   AND
+   ${PROJECT_NAME}_ENABLE_MueLu     AND
+   ${PROJECT_NAME}_ENABLE_Amesos2   AND
+   ${PROJECT_NAME}_ENABLE_Tpetra )
+
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/teuchos/src/sol)
+
+  # Need ROL_TpetraMultiVector.hpp
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/sol)
+
+  ROL_ADD_EXECUTABLE(
+    example_01_Kokkos
+    SOURCES example_01K.cpp
+    ADD_DIR_TO_NAME
+  )
+
+  ROL_COPY_FILES_TO_BINARY_DIR(
+    TopOptNavierStokesDataCopyK
+    SOURCE_FILES
+      input.xml plotresults.m mesh.jou
+    SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}"
+    DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
+  )
+
 
 ENDIF()
diff --git a/packages/rol/example/PDE-OPT/topo-opt/navier-stokes/example_01.cpp b/packages/rol/example/PDE-OPT/topo-opt/navier-stokes/example_01.cpp
index dec92dea2c67..b786784f07c3 100644
--- a/packages/rol/example/PDE-OPT/topo-opt/navier-stokes/example_01.cpp
+++ b/packages/rol/example/PDE-OPT/topo-opt/navier-stokes/example_01.cpp
@@ -14,8 +14,7 @@
 #include "Teuchos_Comm.hpp"
 #include "Teuchos_Time.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -48,7 +47,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int>> comm
     = Tpetra::getDefaultComm();
   const int myRank = comm->getRank();
diff --git a/packages/rol/example/PDE-OPT/topo-opt/navier-stokes/example_01K.cpp b/packages/rol/example/PDE-OPT/topo-opt/navier-stokes/example_01K.cpp
new file mode 100644
index 000000000000..b7da8a0193ad
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/topo-opt/navier-stokes/example_01K.cpp
@@ -0,0 +1,196 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  example_01.cpp
+    \brief Shows how to solve the Navier-Stokes control problem.
+*/
+
+#include "Teuchos_Comm.hpp"
+#include "Teuchos_Time.hpp"
+#include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
+
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Version.hpp"
+
+#include <iostream>
+#include <algorithm>
+
+#include "ROL_TpetraMultiVector.hpp"
+#include "ROL_Reduced_Objective_SimOpt.hpp"
+#include "ROL_Bounds.hpp"
+#include "ROL_Solver.hpp"
+#include "ROL_SingletonVector.hpp"
+#include "ROL_ConstraintFromObjective.hpp"
+
+#include "../../TOOLS/meshreaderK.hpp"
+#include "../../TOOLS/pdeconstraintK.hpp"
+#include "../../TOOLS/pdeobjectiveK.hpp"
+#include "../../TOOLS/pdevectorK.hpp"
+#include "../../TOOLS/integralconstraintK.hpp"
+
+#include "pde_navier-stokesK.hpp"
+#include "obj_navier-stokesK.hpp"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+
+int main(int argc, char *argv[]) {
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+
+  /*** Initialize communicator. ***/
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  Kokkos::ScopeGuard kokkosScope (argc, argv);
+  ROL::Ptr<const Teuchos::Comm<int>> comm
+    = Tpetra::getDefaultComm();
+  const int myRank = comm->getRank();
+  if ((iprint > 0) && (myRank == 0))
+    outStream = ROL::makePtrFromRef(std::cout);
+  else
+    outStream = ROL::makePtrFromRef(bhs);
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+
+    /*** Read in XML input ***/
+    std::string filename = "input.xml";
+    auto parlist = ROL::getParametersFromXmlFile(filename);
+
+    bool output = parlist->sublist("SimOpt").sublist("Solve").get("Output Iteration History",false);
+    output = (iprint > 0) && (myRank==0) && output;
+    parlist->sublist("SimOpt").sublist("Solve").set("Output Iteration History",output);
+
+    /*** Initialize main data structure. ***/
+    auto meshMgr  = ROL::makePtr<MeshReader<RealT,DeviceT>>(*parlist);
+    // Initialize PDE describing Navier-Stokes equations.
+    auto pde      = ROL::makePtr<PDE_NavierStokes<RealT,DeviceT>>(*parlist);
+    auto con      = ROL::makePtr<PDE_Constraint<RealT,DeviceT>>(pde,meshMgr,comm,*parlist,*outStream);
+    // Cast the constraint and get the assembler.
+    auto assembler = con->getAssembler();
+    con->setSolveParameters(*parlist);
+
+    // Create state vector and set to zeroes
+    auto u_ptr = assembler->createStateVector();    u_ptr->randomize();
+    auto p_ptr = assembler->createStateVector();    p_ptr->randomize();
+    auto z_ptr = assembler->createControlVector();  z_ptr->randomize();
+    auto r_ptr = assembler->createResidualVector(); r_ptr->putScalar(0.0);
+    auto up = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(u_ptr,pde,assembler,*parlist);
+    auto pp = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(p_ptr,pde,assembler,*parlist);
+    auto zp = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(z_ptr,pde,assembler,*parlist);
+    auto rp = ROL::makePtr<PDE_DualSimVector<RealT,DeviceT>>(r_ptr,pde,assembler,*parlist);
+    auto xp = ROL::makePtr<ROL::Vector_SimOpt<RealT>>(up,zp);
+
+    // Initialize quadratic objective function.
+    auto qoi = ROL::makePtr<QoI_Power_NavierStokes<RealT,DeviceT>>(*parlist,
+                                                    pde->getVelocityFE(),
+                                                    pde->getPressureFE(),
+                                                    pde->getControlFE(),
+                                                    pde->getStateFieldInfo(),
+                                                    pde->getControlFieldInfo());
+    auto obj  = ROL::makePtr<PDE_Objective<RealT,DeviceT>>(qoi,assembler);
+    auto robj = ROL::makePtr<ROL::Reduced_Objective_SimOpt<RealT>>(obj,con,up,zp,pp,true,false);
+
+    // Initialize volume constraint.
+    RealT volFraction = parlist->sublist("Problem").get("Volume Fraction",0.5);
+    auto qoi_vol = ROL::makePtr<QoI_Volume_NavierStokes<RealT,DeviceT>>(*parlist,
+                                                             pde->getControlFE(),
+                                                             pde->getControlFieldInfo());
+    auto obj_vol = ROL::makePtr<IntegralOptObjective<RealT,DeviceT>>(qoi_vol,assembler);
+    auto icon    = ROL::makePtr<ROL::ConstraintFromObjective<RealT>>(obj_vol);
+    auto iup     = ROL::makePtr<ROL::SingletonVector<RealT>>(0.0);
+    auto imul    = ROL::makePtr<ROL::SingletonVector<RealT>>(0.0);
+    auto ibnd    = ROL::makePtr<ROL::Bounds<RealT>>(*iup,false);
+
+    // Build bound constraint
+    auto lp = zp->clone(); lp->setScalar(0.0);
+    auto hp = zp->clone(); hp->setScalar(1.0);
+    auto bnd = ROL::makePtr<ROL::Bounds<RealT>>(lp, hp);
+    // Build optimization problem
+    auto optProb = ROL::makePtr<ROL::Problem<RealT>>(robj, zp);
+    optProb->addBoundConstraint(bnd);
+    optProb->addLinearConstraint("Volume",icon,imul); //, ibnd);
+    optProb->setProjectionAlgorithm(*parlist);
+    bool useProj = parlist->sublist("Problem").get("Project Linear Constraints",true);
+    optProb->finalize(!useProj,true,*outStream);
+
+    // Run derivative checks
+    bool checkDeriv = parlist->sublist("Problem").get("Check derivatives",false);
+    if ( checkDeriv ) {
+      auto dup = up->clone(); dup->randomize();
+      auto dzp = zp->clone(); dzp->randomize();
+      con->checkApplyJacobian_1(*up,*zp,*dup,*up,true,*outStream);
+      con->checkApplyJacobian_2(*up,*zp,*dzp,*up,true,*outStream);
+      con->checkInverseJacobian_1(*up,*up,*up,*zp,true,*outStream);
+      con->checkInverseAdjointJacobian_1(*up,*up,*up,*zp,true,*outStream);
+      con->checkApplyAdjointHessian_11(*up,*zp,*pp,*dup,*up,true,*outStream);
+      con->checkApplyAdjointHessian_21(*up,*zp,*pp,*dzp,*up,true,*outStream);
+      con->checkApplyAdjointHessian_12(*up,*zp,*pp,*dup,*zp,true,*outStream);
+      con->checkApplyAdjointHessian_22(*up,*zp,*pp,*dzp,*zp,true,*outStream);
+      obj->checkGradient_1(*up,*zp,*dup,true,*outStream);
+      obj->checkGradient_2(*up,*zp,*dzp,true,*outStream);
+      obj->checkHessVec_11(*up,*zp,*dup,true,*outStream);
+      obj->checkHessVec_12(*up,*zp,*dzp,true,*outStream);
+      obj->checkHessVec_21(*up,*zp,*dup,true,*outStream);
+      obj->checkHessVec_22(*up,*zp,*dzp,true,*outStream);
+      icon->checkApplyJacobian(*zp,*dzp,*imul,true,*outStream);
+      icon->checkApplyAdjointHessian(*zp,*imul,*dzp,*zp,true,*outStream);
+      robj->checkGradient(*zp,*dzp,true,*outStream);
+      robj->checkHessVec(*zp,*dzp,true,*outStream);
+      //optProb->check(*outStream);
+    }
+
+    // Solve optimization problem
+    up->zero(); pp->zero();
+    bool opt = parlist->sublist("Problem").get("Solve Optimization Problem",true);
+    if (opt) {
+      std::ifstream infile("control.txt");
+      if (infile.good()) assembler->inputTpetraVector(z_ptr,"control.txt");
+      else               zp->setScalar(volFraction);
+      Teuchos::Time algoTimer("Algorithm Time", true);
+      ROL::Solver<RealT> optSolver(optProb,*parlist);
+      optSolver.solve(*outStream);
+      algoTimer.stop();
+      *outStream << "Total optimization time = " << algoTimer.totalElapsedTime() << " seconds.\n";
+      con->outputTpetraVector(z_ptr,"control.txt");
+    }
+    else {
+      zp->setScalar(1.0);
+    }
+
+    // Output
+    assembler->printMeshData(*outStream);
+    RealT tol(1.e-8);
+    Teuchos::Array<RealT> res(1,0);
+    con->solve(*rp,*up,*zp,tol);
+    con->outputTpetraVector(u_ptr,"state.txt");
+    con->value(*rp,*up,*zp,tol);
+    r_ptr->norm2(res.view(0,1));
+    *outStream << "Residual Norm: " << res[0] << std::endl;
+    errorFlag += (res[0] > 1.e-6 ? 1 : 0);
+    //pdecon->outputTpetraData();
+
+    // Get a summary from the time monitor.
+    Teuchos::TimeMonitor::summarize();
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/topo-opt/navier-stokes/impermiabilityK.hpp b/packages/rol/example/PDE-OPT/topo-opt/navier-stokes/impermiabilityK.hpp
new file mode 100644
index 000000000000..bc378a8937a9
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/topo-opt/navier-stokes/impermiabilityK.hpp
@@ -0,0 +1,65 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+#ifndef IMPERMIABILITYK_HPP
+#define IMPERMIABILITYK_HPP
+
+#include <vector>
+#include "ROL_ParameterList.hpp"
+#include "ROL_Ptr.hpp"
+#include "../../TOOLS/feK.hpp"
+
+template<class Real, class DeviceType>
+class Impermiability {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+
+private:
+  Real alphaLo_, alphaHi_, q_;
+
+  Real value(const Real z) const {
+    const Real one(1);
+    return alphaHi_ + (alphaLo_-alphaHi_)*z*(one+q_)/(q_+z);
+  }
+
+  Real deriv1(const Real z) const {
+    const Real one(1), two(2);
+    return (alphaLo_-alphaHi_)*q_*(q_+one)/std::pow(q_+z,two);
+  }
+
+  Real deriv2(const Real z) const {
+    const Real one(1), two(2), three(3);
+    return -(alphaLo_-alphaHi_)*two*q_*(q_+one)/std::pow(q_+z,three);
+  }
+
+public:
+  Impermiability(ROL::ParameterList &list) {
+    alphaLo_   = list.sublist("Problem").get("Minimum Impermiability", 0.0);
+    alphaHi_   = list.sublist("Problem").get("Maximum Impermiability", 1e3);
+    q_         = list.sublist("Problem").get("RAMP Parameter", 1e5);
+  }
+
+  void compute(scalar_view &alpha,
+         const scalar_view z,
+         const scalar_view pts,
+         const int deriv=0) const {
+    const int c = pts.extent_int(0);
+    const int p = pts.extent_int(1);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        // Compute spatially dependent viscosity
+        if (deriv==1)      alpha(i,j) = deriv1(z(i,j));
+        else if (deriv==2) alpha(i,j) = deriv2(z(i,j));
+        else               alpha(i,j) = value(z(i,j));
+      }
+    }
+  }
+};
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/topo-opt/navier-stokes/obj_navier-stokesK.hpp b/packages/rol/example/PDE-OPT/topo-opt/navier-stokes/obj_navier-stokesK.hpp
new file mode 100644
index 000000000000..b45bcaabe26c
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/topo-opt/navier-stokes/obj_navier-stokesK.hpp
@@ -0,0 +1,457 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  obj.hpp
+    \brief Provides the interface for local (cell-based) objective function computations.
+*/
+
+#ifndef PDEOPT_QOI_NAVIERSTOKESK_HPP
+#define PDEOPT_QOI_NAVIERSTOKESK_HPP
+
+#include "../../TOOLS/qoiK.hpp"
+#include "pde_navier-stokesK.hpp"
+#include "impermiabilityK.hpp"
+
+template <class Real, class DeviceType>
+class QoI_Power_NavierStokes : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type> feVel_, fePrs_, feCtrl_;
+  const ROL::Ptr<const FieldUtils::FieldInfo> fieldInfo_, fieldInfoCtrl_;
+
+  ROL::Ptr<Impermiability<Real,DeviceType>> imp_;
+  Real viscosity_;
+
+public:
+  QoI_Power_NavierStokes(ROL::ParameterList                          &list,
+                         const ROL::Ptr<fe_type>                     &feVel,
+                         const ROL::Ptr<fe_type>                     &fePrs,
+                         const ROL::Ptr<fe_type>                     &feCtrl,
+                         const ROL::Ptr<const FieldUtils::FieldInfo> &fieldInfo,
+                         const ROL::Ptr<const FieldUtils::FieldInfo> &fieldInfoCtrl)
+    : feVel_(feVel), fePrs_(fePrs), feCtrl_(feCtrl), fieldInfo_(fieldInfo), fieldInfoCtrl_(fieldInfoCtrl) {
+    viscosity_ = list.sublist("Problem").get("Viscosity",5e-3);
+    imp_ = ROL::makePtr<Impermiability<Real,DeviceType>>(list);
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c = feVel_->gradN().extent_int(0);
+    const int p = feVel_->gradN().extent_int(2);
+    const int d = feVel_->gradN().extent_int(3);
+    // Initialize output val
+    val = scalar_view("val", c);
+    // Get components of the control
+    std::vector<scalar_view> U, Z;
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoCtrl_);
+    // Evaluate on FE basis
+    scalar_view Zval("Zval", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view Uval("Uval", c, p);
+    scalar_view aUval("aUval", c, p);
+    scalar_view Ugrad("Ugrad", c, p, d);
+    scalar_view nUgrad("nUgrad", c, p, d);
+    feCtrl_->evaluateValue(Zval, Z[0]);
+    imp_->compute(alpha, Zval, feVel_->cubPts(), 0); 
+    for (int i = 0; i < d; ++i) {
+      // Evaluate on FE basis
+      feVel_->evaluateValue(Uval, U[i]);
+      feVel_->evaluateGradient(Ugrad, U[i]);
+      // Scale
+      fst::scalarMultiplyDataData(aUval, alpha, Uval);
+      rst::scale(nUgrad, Ugrad, viscosity_);
+      // Integrate
+      feVel_->computeIntegral(val, nUgrad, Ugrad, true);
+      feVel_->computeIntegral(val, aUval, Uval, true);
+    }
+    rst::scale(val, static_cast<Real>(0.5));
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fv = feVel_->gradN().extent_int(1);
+    const int fp = fePrs_->gradN().extent_int(1);
+    const int p  = feVel_->gradN().extent_int(2);
+    const int d  = feVel_->gradN().extent_int(3);
+    // Initialize output grad
+    std::vector<scalar_view> G;
+    G.resize(fieldInfo_->numFields);
+    for (int i = 0; i < d; ++i) {
+      G[i] = scalar_view("grad1", c, fv);
+    }
+    G[d] = scalar_view("grad1", c, fp);
+    // Get components of the control
+    std::vector<scalar_view> U, Z;
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoCtrl_);
+    // Create storage
+    scalar_view Zval("Zval", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view Uval("Uval", c, p);
+    scalar_view aUval("aUval", c, p);
+    scalar_view Ugrad("Ugrad", c, p, d);
+    feCtrl_->evaluateValue(Zval, Z[0]);
+    imp_->compute(alpha, Zval, feVel_->cubPts(), 0); 
+    for (int i = 0; i < d; ++i) {
+      // Evaluate on FE basis
+      feVel_->evaluateValue(Uval, U[i]);
+      feVel_->evaluateGradient(Ugrad, U[i]);
+      // Scale
+      fst::scalarMultiplyDataData(aUval, alpha, Uval);
+      rst::scale(Ugrad, viscosity_);
+      // Integrate
+      fst::integrate(G[i],Ugrad,feVel_->gradNdetJ(),false);
+      fst::integrate(G[i],aUval,feVel_->NdetJ(),true);
+    }
+    FieldUtils::combineFieldCoeff<Real>(grad, G, fieldInfo_);
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int p  = feVel_->gradN().extent_int(2);
+    const int d  = feVel_->gradN().extent_int(3);
+    // Initialize output grad
+    std::vector<scalar_view> G;
+    G.resize(fieldInfoCtrl_->numFields);
+    G[0] = scalar_view("grad2", c, fc);
+    // Get components of the control
+    std::vector<scalar_view> U, Z;
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoCtrl_);
+    // Create storage
+    scalar_view Zval("Zval", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view Uval("Uval", c, p);
+    scalar_view aUval("aUval", c, p);
+    scalar_view aU2val("aU2val", c, p);
+    feCtrl_->evaluateValue(Zval, Z[0]);
+    imp_->compute(alpha, Zval, feVel_->cubPts(), 1); 
+    for (int i = 0; i < d; ++i) {
+      // Evaluate on FE basis
+      feVel_->evaluateValue(Uval, U[i]);
+      // Scale
+      fst::scalarMultiplyDataData(aUval, alpha, Uval);
+      fst::scalarMultiplyDataData(aU2val, aUval, Uval);
+      // Integrate
+      fst::integrate(G[0],aU2val,feCtrl_->NdetJ(),true);
+    }
+    rst::scale(G[0], static_cast<Real>(0.5));
+    FieldUtils::combineFieldCoeff<Real>(grad, G, fieldInfoCtrl_);
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fv = feVel_->gradN().extent_int(1);
+    const int fp = fePrs_->gradN().extent_int(1);
+    const int p  = feVel_->gradN().extent_int(2);
+    const int d  = feVel_->gradN().extent_int(3);
+    // Initialize output grad
+    std::vector<scalar_view> H;
+    H.resize(fieldInfo_->numFields);
+    for (int i = 0; i < d; ++i) {
+      H[i] = scalar_view("hess11", c, fv);
+    }
+    H[d] = scalar_view("hess11", c, fp);
+    // Get components of the control
+    std::vector<scalar_view> V, Z;
+    FieldUtils::splitFieldCoeff<Real>(V, v_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoCtrl_);
+    // Create storage
+    scalar_view Zval("Zval", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view Vval("Vval", c, p);
+    scalar_view aVval("aVval", c, p);
+    scalar_view Vgrad("Vgrad", c, p, d);
+    feCtrl_->evaluateValue(Zval, Z[0]);
+    imp_->compute(alpha, Zval, feVel_->cubPts(), 0); 
+    for (int i = 0; i < d; ++i) {
+      // Evaluate on FE basis
+      feVel_->evaluateValue(Vval, V[i]);
+      feVel_->evaluateGradient(Vgrad, V[i]);
+      // Scale
+      fst::scalarMultiplyDataData(aVval, alpha, Vval);
+      rst::scale(Vgrad, viscosity_);
+      // Integrate
+      fst::integrate(H[i],Vgrad,feVel_->gradNdetJ(),false);
+      fst::integrate(H[i],aVval,feVel_->NdetJ(),true);
+    }
+    FieldUtils::combineFieldCoeff<Real>(hess, H, fieldInfo_);
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fv = feVel_->gradN().extent_int(1);
+    const int fp = fePrs_->gradN().extent_int(1);
+    const int p  = feVel_->gradN().extent_int(2);
+    const int d  = feVel_->gradN().extent_int(3);
+    // Initialize output grad
+    std::vector<scalar_view> H;
+    H.resize(fieldInfo_->numFields);
+    for (int i = 0; i < d; ++i) {
+      H[i] = scalar_view("hess12", c, fv);
+    }
+    H[d] = scalar_view("hess12", c, fp);
+    // Get components of the control
+    std::vector<scalar_view> V, U, Z;
+    FieldUtils::splitFieldCoeff<Real>(V, v_coeff, fieldInfoCtrl_);
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoCtrl_);
+    // Create storage
+    scalar_view Zval("Zval", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view Uval("Uval", c, p);
+    scalar_view aUval("aUval", c, p);
+    scalar_view Vval("Vval", c, p);
+    scalar_view aVval("aVval", c, p);
+    feCtrl_->evaluateValue(Vval, V[0]);
+    feCtrl_->evaluateValue(Zval, Z[0]);
+    imp_->compute(alpha, Zval, feVel_->cubPts(), 1); 
+    fst::scalarMultiplyDataData(aVval, alpha, Vval);
+    for (int i = 0; i < d; ++i) {
+      // Evaluate on FE basis
+      feVel_->evaluateValue(Uval, U[i]);
+      // Scale
+      fst::scalarMultiplyDataData(aUval, aVval, Uval);
+      // Integrate
+      fst::integrate(H[i],aUval,feVel_->NdetJ(),false);
+    }
+    FieldUtils::combineFieldCoeff<Real>(hess, H, fieldInfo_);
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int p  = feVel_->gradN().extent_int(2);
+    const int d  = feVel_->gradN().extent_int(3);
+    // Initialize output grad
+    std::vector<scalar_view> H;
+    H.resize(fieldInfoCtrl_->numFields);
+    H[0] = scalar_view("hess21", c, fc);
+    // Get components of the control
+    std::vector<scalar_view> V, U, Z;
+    FieldUtils::splitFieldCoeff<Real>(V, v_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoCtrl_);
+    // Create storage
+    scalar_view Zval("Zval", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view Uval("Uval", c, p);
+    scalar_view aUval("aUval", c, p);
+    scalar_view aUVval("aUVval", c, p);
+    scalar_view Vval("Vval", c, p);
+    feCtrl_->evaluateValue(Zval, Z[0]);
+    imp_->compute(alpha, Zval, feVel_->cubPts(), 1); 
+    for (int i = 0; i < d; ++i) {
+      // Evaluate on FE basis
+      feVel_->evaluateValue(Vval, V[i]);
+      feVel_->evaluateValue(Uval, U[i]);
+      // Scale
+      fst::scalarMultiplyDataData(aUval, alpha, Uval);
+      fst::scalarMultiplyDataData(aUVval, aUval, Vval);
+      // Integrate
+      fst::integrate(H[0],aUVval,feCtrl_->NdetJ(), true);
+    }
+    FieldUtils::combineFieldCoeff<Real>(hess, H, fieldInfoCtrl_);
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int p  = feVel_->gradN().extent_int(2);
+    const int d  = feVel_->gradN().extent_int(3);
+    // Initialize output grad
+    std::vector<scalar_view> H;
+    H.resize(fieldInfoCtrl_->numFields);
+    H[0] = scalar_view("hess22", c, fc);
+    // Get components of the control
+    std::vector<scalar_view> V, U, Z;
+    FieldUtils::splitFieldCoeff<Real>(V, v_coeff, fieldInfoCtrl_);
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoCtrl_);
+    // Create storage
+    scalar_view Zval("Zval", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view Uval("Uval", c, p);
+    scalar_view aUval("aUval", c, p);
+    scalar_view aU2val("aU2val", c, p);
+    scalar_view Vval("Vval", c, p);
+    scalar_view aVval("aVval", c, p);
+    feCtrl_->evaluateValue(Vval, V[0]);
+    feCtrl_->evaluateValue(Zval, Z[0]);
+    imp_->compute(alpha, Zval, feVel_->cubPts(), 2); 
+    fst::scalarMultiplyDataData(aVval, alpha, Vval);
+    for (int i = 0; i < d; ++i) {
+      // Evaluate on FE basis
+      feVel_->evaluateValue(Uval, U[i]);
+      // Scale
+      fst::scalarMultiplyDataData(aUval, aVval, Uval);
+      fst::scalarMultiplyDataData(aU2val, aUval, Uval);
+      // Integrate
+      fst::integrate(H[0],aU2val,feCtrl_->NdetJ(),true);
+    }
+    rst::scale(H[0], static_cast<Real>(0.5));
+    FieldUtils::combineFieldCoeff<Real>(hess, H, fieldInfoCtrl_);
+  }
+
+}; // QoI_Power_NavierStokes
+
+template <class Real, class DeviceType>
+class QoI_Volume_NavierStokes : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type> fe_;
+  const ROL::Ptr<const FieldUtils::FieldInfo> fieldInfo_;
+
+  scalar_view weight_;
+  Real volFraction_;
+
+public:
+  QoI_Volume_NavierStokes(ROL::ParameterList &list,
+                          const ROL::Ptr<fe_type> &fe,
+                          const ROL::Ptr<const FieldUtils::FieldInfo> &fieldInfo)
+    : fe_(fe), fieldInfo_(fieldInfo) {
+    const Real one(1);
+    volFraction_   = list.sublist("Problem").get("Volume Fraction",0.5);
+    const int c = fe_->cubPts().extent_int(0);
+    const int p = fe_->cubPts().extent_int(1);
+    weight_ = scalar_view("weight_", c, p);
+    Kokkos::deep_copy(weight_,one);
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c = fe_->cubPts().extent_int(0);
+    const int p = fe_->cubPts().extent_int(1);
+    // Initialize output val
+    val = scalar_view("val", c);
+    // Get components of the control
+    std::vector<scalar_view> Z;
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfo_);
+    // Evaluate on FE basis
+    scalar_view Z0("Z0", c, p);
+    fe_->evaluateValue(Z0, Z[0]);
+    // Integrate the density minus volume fraction
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        Z0(i,j) -= volFraction_;
+      }
+    }
+    fe_->computeIntegral(val,weight_,Z0,true);
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Volume_NavierStokes::gradient_1 is zero.");
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    const int c = fe_->cubPts().extent_int(0);
+    const int f = fe_->N().extent_int(1);
+    // Initialize output grad
+    std::vector<scalar_view> G;
+    G.resize(fieldInfo_->numFields);
+    G[0] = scalar_view("grad2", c, f);
+    // Integrate density
+    fst::integrate(G[0],weight_,fe_->NdetJ(),false);
+    FieldUtils::combineFieldCoeff(grad, G, fieldInfo_);
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Volume_NavierStokes::HessVec_11 is zero.");
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Volume_NavierStokes::HessVec_12 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Volume_NavierStokes::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Volume_NavierStokes::HessVec_22 is zero.");
+  }
+
+}; // QoI_Volume_NavierStokes
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/topo-opt/navier-stokes/pde_navier-stokesK.hpp b/packages/rol/example/PDE-OPT/topo-opt/navier-stokes/pde_navier-stokesK.hpp
new file mode 100644
index 000000000000..44b83f7f38ee
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/topo-opt/navier-stokes/pde_navier-stokesK.hpp
@@ -0,0 +1,944 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  pde_navier-stokes.hpp
+    \brief Implements the local PDE interface for the Navier-Stokes control problem.
+*/
+
+#ifndef PDE_NAVIERSTOKESK_HPP
+#define PDE_NAVIERSTOKESK_HPP
+
+#include "../../TOOLS/pdeK.hpp"
+#include "../../TOOLS/feK.hpp"
+#include "../../TOOLS/fieldhelperK.hpp"
+#include "impermiabilityK.hpp"
+
+#include "Intrepid2_HVOL_C0_FEM.hpp"
+#include "Intrepid2_HGRAD_QUAD_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_QUAD_C2_FEM.hpp"
+#include "Intrepid2_HGRAD_HEX_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_HEX_C2_FEM.hpp"
+#include "Intrepid2_DefaultCubatureFactory.hpp"
+#include "Intrepid2_FunctionSpaceTools.hpp"
+#include "Intrepid2_CellTools.hpp"
+
+#include "ROL_Ptr.hpp"
+
+template <class Real, class DeviceType>
+class PDE_NavierStokes : public PDE<Real, DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  // Finite element basis information
+  basis_ptr basisPtrVel_, basisPtrPrs_, basisPtrCtrl_;
+  std::vector<basis_ptr> basisPtrs_, basisPtrsCtrl_;
+  // Cell cubature information
+  ROL::Ptr<Intrepid2::Cubature<DeviceType,Real,Real>> cellCub_, bdryCub_;
+  // Cell node information
+  scalar_view volCellNodes_;
+  std::vector<std::vector<scalar_view>> bdryCellNodes_;
+  std::vector<std::vector<std::vector<int>>> bdryCellLocIds_;
+  // Finite element definition
+  ROL::Ptr<fe_type> feVel_, fePrs_, feCtrl_;
+  std::vector<std::vector<ROL::Ptr<fe_type>>> feVelBdry_;
+  // Local degrees of freedom on boundary, for each side of the reference cell (first index).
+  std::vector<std::vector<int>> fvidx_, fpidx_;
+  // Coordinates of degrees freedom on boundary cells.
+  // Indexing:  [sideset number][local side id](cell number, value at dof)
+  std::vector<std::vector<scalar_view>> bdryCellDofValues_;
+  // Field pattern, offsets, etc.
+  std::vector<std::vector<int>> fieldPattern_;  // local Field/DOF pattern; set from DOF manager 
+  int numFields_;                               // number of fields (equations in the PDE)
+  int numDofs_;                                 // total number of degrees of freedom for all (local) fields
+  std::vector<int> offset_;                     // for each field, a counting offset
+  std::vector<int> numFieldDofs_;               // for each field, number of degrees of freedom
+  std::vector<std::vector<int>> fieldPatternCtrl_;  // local Field/DOF pattern; set from DOF manager 
+  int numFieldsCtrl_;                               // number of fields (equations in the PDE)
+  int numDofsCtrl_;                                 // total number of degrees of freedom for all (local) fields
+  std::vector<int> offsetCtrl_;                     // for each field, a counting offset
+  std::vector<int> numFieldDofsCtrl_;               // for each field, number of degrees of freedom
+  
+  // Problem parameters.
+  Real viscosity_;
+
+  ROL::Ptr<Impermiability<Real,DeviceType>> imp_;
+  ROL::Ptr<FieldHelper<Real,DeviceType>> fieldHelper_;
+  ROL::Ptr<FieldUtils::FieldInfo> fieldInfo_, fieldInfoCtrl_;
+
+  // Extract velocity coefficients on boundary.
+  scalar_view getBoundaryCoeff(
+      const scalar_view cell_coeff,
+      int sideSet, int cell) const {
+    std::vector<int> bdryCellLocId = bdryCellLocIds_[sideSet][cell];
+    const int numCellsSide = bdryCellLocId.size();
+    const int f = basisPtrVel_->getCardinality();
+    
+    scalar_view bdry_coeff("bdry_coeff",numCellsSide,f);
+    for (int i = 0; i < numCellsSide; ++i) {
+      for (int j = 0; j < f; ++j) {
+        bdry_coeff(i, j) = cell_coeff(bdryCellLocId[i], j);
+      }
+    }
+    return bdry_coeff;
+  }
+
+  Real DirichletFunc(const std::vector<Real> & coords, int sideset, int locSideId, int dir) const {
+    const std::vector<Real> param = PDE<Real,DeviceType>::getParameter();
+    Real val(0);
+    if ((sideset==1) && (dir==0)) { // inflow
+      const Real one(1), four(4);
+      val = four*coords[1]*(one-coords[1]);
+    }
+    else if ((sideset==0) && (dir==0)) { // outflow
+      const Real one(1), two(2), three(3), c(108);
+      val = c*(coords[1]-one/three)*(two/three-coords[1]);
+    }
+    return val;
+  }
+
+  void computeDirichlet(void) {
+    // Compute Dirichlet values at DOFs.
+    int d = basisPtrVel_->getBaseCellTopology().getDimension();
+    int numSidesets = bdryCellLocIds_.size();
+    bdryCellDofValues_.resize(numSidesets);
+    for (int i=0; i<numSidesets; ++i) {
+      int numLocSides = bdryCellLocIds_[i].size();
+      bdryCellDofValues_[i].resize(numLocSides);
+      for (int j=0; j<numLocSides; ++j) {
+        int c = bdryCellLocIds_[i][j].size();
+        int f = basisPtrVel_->getCardinality();
+        bdryCellDofValues_[i][j] = scalar_view("bdryCellDofValues", c, f, d);
+        scalar_view coords("coords", c, f, d);
+        if (c > 0) {
+          feVel_->computeDofCoords(coords, bdryCellNodes_[i][j]);
+        }
+        for (int k=0; k<c; ++k) {
+          for (int l=0; l<f; ++l) {
+            std::vector<Real> dofpoint(d);
+            //std::cout << "Sideset " << i << " LocalSide " << j << "  Cell " << k << "  Field " << l << "  Coord ";
+            for (int m=0; m<d; ++m) {
+              dofpoint[m] = coords(k, l, m);
+              //std::cout << dofpoint[m] << "  ";
+            }
+
+            for (int m=0; m<d; ++m) {
+              (bdryCellDofValues_[i][j])(k, l, m) = DirichletFunc(dofpoint, i, j, m);
+              //std::cout << "  " << m << "-Value " << DirichletFunc(dofpoint, i, j, m);
+            }
+            //std::cout << std::endl;
+          }
+        }
+      }
+    }
+  }
+
+  Real viscosityFunc(const std::vector<Real> & coords) const {
+    return viscosity_;
+  }
+
+  void computeViscosity(scalar_view &nu) const {
+    int c = feVel_->gradN().extent_int(0);
+    int p = feVel_->gradN().extent_int(2);
+    int d = feVel_->gradN().extent_int(3);
+    std::vector<Real> pt(d);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for ( int k = 0; k < d; ++k) {
+          pt[k] = (feVel_->cubPts())(i,j,k);
+        }
+        // Compute spatially dependent viscosity
+        nu(i,j) = viscosityFunc(pt);
+      }
+    }
+  }
+
+public:
+  PDE_NavierStokes(ROL::ParameterList &parlist) {
+    // Finite element fields.
+    int cubDegree     = parlist.sublist("Problem").get("Cubature Degree",4);
+    int bdryCubDegree = parlist.sublist("Problem").get("Boundary Cubature Degree",2);
+    int probDim       = parlist.sublist("Problem").get("Problem Dimension",2);
+    int basisDegCtrl  = parlist.sublist("Problem").get("Density Basis Degree",1);
+    if (probDim > 3 || probDim < 2) {
+      TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument,
+        ">>> PDE-OPT/projects/navier-stokes-topopt/pde_navier-stokes.hpp: Problem dimension is not 2 or 3!");
+    }
+    if (probDim == 2) {
+      basisPtrVel_  = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C2_FEM<DeviceType,Real,Real>>();
+      basisPtrPrs_  = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C1_FEM<DeviceType,Real,Real>>();
+    }
+    else if (probDim == 3) {
+      basisPtrVel_  = ROL::makePtr<Intrepid2::Basis_HGRAD_HEX_C2_FEM<DeviceType,Real,Real>>();
+      basisPtrPrs_  = ROL::makePtr<Intrepid2::Basis_HGRAD_HEX_C1_FEM<DeviceType,Real,Real>>();
+    }
+    basisPtrs_.clear(); basisPtrsCtrl_.clear();
+    basisPtrs_.resize(probDim,basisPtrVel_);  // Velocity component
+    basisPtrs_.push_back(basisPtrPrs_);       // Pressure component
+    shards::CellTopology cellType = basisPtrs_[0]->getBaseCellTopology();        // get the cell type from any basis
+    if (basisDegCtrl == 1) {
+      if (probDim == 2)
+        basisPtrCtrl_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C1_FEM<DeviceType,Real,Real>>();
+      else if (probDim == 3)
+        basisPtrCtrl_ = ROL::makePtr<Intrepid2::Basis_HGRAD_HEX_C1_FEM<DeviceType,Real,Real>>();
+    }
+    else
+      basisPtrCtrl_ = ROL::makePtr<Intrepid2::Basis_HVOL_C0_FEM<DeviceType,Real,Real>>(cellType);
+    basisPtrsCtrl_.push_back(basisPtrCtrl_);  // Control component
+    // Quadrature rules.
+    Intrepid2::DefaultCubatureFactory cubFactory;                            // create cubature factory
+    cellCub_ = cubFactory.create<DeviceType,Real,Real>(cellType, cubDegree); // create default cubature
+
+    shards::CellTopology bdryCellType = cellType.getCellTopologyData(probDim-1, 0);
+    bdryCub_ = cubFactory.create<DeviceType,Real,Real>(bdryCellType, bdryCubDegree);
+
+    // Other problem parameters.
+    viscosity_ = parlist.sublist("Problem").get("Viscosity", 5e-3);
+    imp_       = ROL::makePtr<Impermiability<Real,DeviceType>>(parlist);
+
+    numDofs_ = 0;
+    numFields_ = basisPtrs_.size();
+    offset_.resize(numFields_);
+    numFieldDofs_.resize(numFields_);
+    for (int i=0; i<numFields_; ++i) {
+      if (i==0)
+        offset_[i]  = 0;
+      else
+        offset_[i]  = offset_[i-1] + basisPtrs_[i-1]->getCardinality();
+      numFieldDofs_[i] = basisPtrs_[i]->getCardinality();
+      numDofs_ += numFieldDofs_[i];
+    }
+    numDofsCtrl_ = 0;
+    numFieldsCtrl_ = basisPtrsCtrl_.size();
+    offsetCtrl_.resize(numFieldsCtrl_);
+    numFieldDofsCtrl_.resize(numFieldsCtrl_);
+    for (int i=0; i<numFieldsCtrl_; ++i) {
+      if (i==0)
+        offsetCtrl_[i]  = 0;
+      else
+        offsetCtrl_[i]  = offsetCtrl_[i-1] + basisPtrsCtrl_[i-1]->getCardinality();
+      numFieldDofsCtrl_[i] = basisPtrsCtrl_[i]->getCardinality();
+      numDofsCtrl_ += numFieldDofsCtrl_[i];
+    }
+  }
+
+  void residual(scalar_view & res,
+                const scalar_view u_coeff,
+                const scalar_view z_coeff = scalar_view(),
+                const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fv = feVel_->gradN().extent_int(1);
+    const int fp = fePrs_->gradN().extent_int(1);
+    const int p  = feVel_->gradN().extent_int(2);
+    const int d  = feVel_->gradN().extent_int(3);
+    // Initialize residuals.
+    std::vector<scalar_view> R;
+    R.resize(numFields_);
+    for (int i = 0; i < d; ++i) {
+      R[i] = scalar_view("res", c, fv);
+    }
+    R[d] = scalar_view("res", c, fp);
+    // Split coefficients into components.
+    std::vector<scalar_view> U, Z;
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoCtrl_);
+    // Evaluate/interpolate finite element fields on cells.
+    std::vector<scalar_view> nuGradVel(d), valVel(d), gradVel(d), valVelDotgradVel(d), alphaVel(d);
+    scalar_view nu("nu", c, p);
+    scalar_view valVel0("valVel0", c, p, d);
+    scalar_view valPres("valPres", c, p);
+    scalar_view divVel("divVel", c, p);
+    scalar_view valCtrl("valCtrl", c, p);
+    scalar_view alpha ("alpha", c, p);
+    computeViscosity(nu);
+    feCtrl_->evaluateValue(valCtrl, Z[0]);
+    imp_->compute(alpha, valCtrl, feVel_->cubPts(), 0);
+    for (int i = 0; i < d; ++i) {
+      nuGradVel[i]        = scalar_view("nuGradVel", c, p, d);
+      valVel[i]           = scalar_view("valVel", c, p);
+      gradVel[i]          = scalar_view("gradVel", c, p, d);
+      valVelDotgradVel[i] = scalar_view("valVelDotgradVel", c, p);
+      alphaVel[i]         = scalar_view("alphaVel", c, p);
+      // Evaluate on FE basis
+      feVel_->evaluateValue(valVel[i], U[i]);
+      feVel_->evaluateGradient(gradVel[i], U[i]);
+      // Multiply velocity gradients with viscosity.
+      fst::tensorMultiplyDataData(nuGradVel[i], nu, gradVel[i]);
+      // Multiply velocity with alpha
+      fst::scalarMultiplyDataData(alphaVel[i], alpha, valVel[i]);
+      // Assemble the velocity vector and its divergence.
+      for (int j = 0; j < c; ++j) {
+        for (int k = 0; k < p; ++k) {
+          valVel0(j,k,i) = valVel[i](j,k);
+          divVel(j,k)   += gradVel[i](j,k,i);
+        }
+      }
+    }
+    // Compute nonlinear terms in the Navier-Stokes equations.
+    for (int i = 0; i < d; ++i)
+      fst::dotMultiplyDataData(valVelDotgradVel[i], valVel0, gradVel[i]);
+    // Negative pressure
+    fePrs_->evaluateValue(valPres, U[d]);
+    rst::scale(valPres, static_cast<Real>(-1));
+    /*** Evaluate weak form of the residual. ***/
+    // Velocity equation.
+    for (int i = 0; i < d; ++i) {
+      fst::integrate(R[i],nuGradVel[i],feVel_->gradNdetJ(),false);
+      fst::integrate(R[i],valVelDotgradVel[i],feVel_->NdetJ(),true);
+      fst::integrate(R[i],valPres,feVel_->DNDdetJ(i),true);
+      fst::integrate(R[i],alphaVel[i],feVel_->NdetJ(),true);
+    }
+    // Pressure equation.
+    fst::integrate(R[d],divVel,fePrs_->NdetJ(),false);
+    rst::scale(R[d], static_cast<Real>(-1));
+    // Boundary conditions.
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      // APPLY DIRICHLET CONDITIONS
+      computeDirichlet();
+      for (int i = 0; i < numSideSets; ++i) {
+        if (i==1 /* in flow */ || i==2 /* no slip */ ) { //|| i==0 /* out flow */) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numBdryDofs = fvidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                //std::cout << "\n  i=" << i << "   cidx=" << cidx << "   j=" << j << "  l=" << l << "  " << fvidx_[j][l] << " " << (*bdryCellDofValues_[i][j])(k,fvidx_[j][l],0);
+                //std::cout << "\n  i=" << i << "   cidx=" << cidx << "   j=" << j << "  l=" << l << "  " << fvidx_[j][l] << " " << (*bdryCellDofValues_[i][j])(k,fvidx_[j][l],1);
+                for (int m=0; m < d; ++m) {
+                  (R[m])(cidx,fvidx_[j][l]) = (U[m])(cidx,fvidx_[j][l]) - (bdryCellDofValues_[i][j])(k,fvidx_[j][l],m);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    // Combine the residuals.
+    FieldUtils::combineFieldCoeff<Real>(res, R, fieldInfo_);
+  }
+
+  void Jacobian_1(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fv = feVel_->gradN().extent_int(1);
+    const int fp = fePrs_->gradN().extent_int(1);
+    const int p  = feVel_->gradN().extent_int(2);
+    const int d  = feVel_->gradN().extent_int(3);
+    // Initialize Jacobians.
+    std::vector<std::vector<scalar_view>> J(numFields_);
+    J[d].resize(numFields_);
+    for (int i = 0; i < d; ++i) {
+      J[i].resize(numFields_);
+      for (int j = 0; j < d; ++j)
+        J[i][j] = scalar_view("jac", c, fv, fv);
+      J[i][d] = scalar_view("jac", c, fv, fp);
+      J[d][i] = scalar_view("jac", c, fp, fv);
+    }
+    J[d][d] = scalar_view("jac", c, fp, fp);
+    // Split coefficients into components.
+    std::vector<scalar_view> U, Z;
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoCtrl_);
+    // Evaluate/interpolate finite element fields on cells.
+    std::vector<scalar_view> valVel(d), gradVel(d);
+    std::vector<std::vector<scalar_view>> dVel(d), dVelPhi(d);
+    scalar_view nu("nu", c, p);
+    scalar_view nuGradPhi("nuGradPhi", c, fv, p, d);
+    scalar_view valCtrl("valCtrl", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view alphaPhi("alphaPhi", c, fv, p);
+    scalar_view valVel0("valVel0", c, p, d);
+    scalar_view valVelDotgradPhi("valVelDotgradPhi", c, fv, p);
+    computeViscosity(nu);
+    feCtrl_->evaluateValue(valCtrl, Z[0]);
+    imp_->compute(alpha, valCtrl, feVel_->cubPts(), 0);
+    for (int i = 0; i < d; ++i) {
+      valVel[i]           = scalar_view("valVel", c, p);
+      gradVel[i]          = scalar_view("gradVel", c, p, d);
+      feVel_->evaluateValue(valVel[i], U[i]);
+      feVel_->evaluateGradient(gradVel[i], U[i]);
+      for (int j = 0; j < c; ++j) {
+        for (int k = 0; k < p; ++k) {
+          valVel0(j,k,i) = (valVel[i])(j,k);
+        }
+      }
+      dVel[i].resize(d);
+      dVelPhi[i].resize(d);
+      for (int j = 0; j < d; ++j) {
+        dVel[i][j]    = scalar_view("dVel", c, p);
+        dVelPhi[i][j] = scalar_view("dVelPhi", c, fv, p);
+        for (int k = 0; k < c; ++k) {
+          for (int l = 0; l < p; ++l) {
+            (dVel[i][j])(k,l) = (gradVel[i])(k,l,j);
+          }
+        }
+      }
+    }
+    // Multiply velocity gradients with viscosity.
+    fst::tensorMultiplyDataField(nuGradPhi, nu, feVel_->gradN());
+    // Compute nonlinear terms in the Navier-Stokes equations.
+    fst::dotMultiplyDataField(valVelDotgradPhi, valVel0, feVel_->gradN());
+    for (int i = 0; i < d; ++i) {
+      for (int j = 0; j < d; ++j) {
+        fst::scalarMultiplyDataField(dVelPhi[i][j], dVel[i][j], feVel_->N());
+      }
+    }
+    // Multiply velocity with alpha
+    fst::scalarMultiplyDataField(alphaPhi, alpha, feVel_->N());
+    // Negative pressure basis.
+    scalar_view negPrsN("negPrsN", c, fp, p);
+    Kokkos::deep_copy(negPrsN,fePrs_->N());
+    rst::scale(negPrsN, static_cast<Real>(-1));
+    /*** Evaluate weak form of the Jacobian. ***/
+    for (int i = 0; i < d; ++i) {
+      // Velocity equation.
+      fst::integrate(J[i][i],nuGradPhi,feVel_->gradNdetJ(),false);
+      fst::integrate(J[i][i],feVel_->NdetJ(),valVelDotgradPhi,true);
+      fst::integrate(J[i][i],alphaPhi,feVel_->NdetJ(),true);
+      for (int j = 0; j < d; ++j)
+        fst::integrate(J[i][j],feVel_->NdetJ(),dVelPhi[i][j],true);
+      fst::integrate(J[i][d],feVel_->DNDdetJ(i),negPrsN,false);
+      // Pressure equation.
+      fst::integrate(J[d][i],fePrs_->NdetJ(),feVel_->DND(i),false);
+      rst::scale(J[d][i], static_cast<Real>(-1));
+    }
+    // APPLY DIRICHLET CONDITIONS
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        if (i==1 /* in flow */ || i==2 /* no slip */ ) { // || i==0 /* out flow */) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numBdryDofs = fvidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                for (int m=0; m < fv; ++m) {
+                  for (int n=0; n < d; ++n) {
+                    for (int q=0; q < d; ++q) {
+                      (J[n][q])(cidx,fvidx_[j][l],m) = static_cast<Real>(0);
+                    }
+                    (J[n][n])(cidx,fvidx_[j][l],fvidx_[j][l]) = static_cast<Real>(1);
+                  }
+                }
+                for (int m=0; m < fp; ++m) {
+                  for (int n=0; n < d; ++n) {
+                    (J[n][d])(cidx,fvidx_[j][l],m) = static_cast<Real>(0);
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    // Combine the jacobians.
+    FieldUtils::combineFieldCoeff<Real>(jac, J, fieldInfo_, fieldInfo_);
+  }
+
+
+  void Jacobian_2(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fv = feVel_->gradN().extent_int(1);
+    const int fp = fePrs_->gradN().extent_int(1);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int p  = feVel_->gradN().extent_int(2);
+    const int d  = feVel_->gradN().extent_int(3);
+    // Initialize Jacobians.
+    std::vector<std::vector<scalar_view>> J(numFields_);
+    for (int i = 0; i < d; ++i) {
+      J[i].resize(1);
+      J[i][0] = scalar_view("jac", c, fv, fc);
+    }
+    J[d].resize(1);
+    J[d][0] = scalar_view("jac", c, fp, fc);
+    // Split coefficients into components.
+    std::vector<scalar_view> U, Z;
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoCtrl_);
+    // Evaluate on FE basis.
+    scalar_view valCtrl("valCtrl", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view alphaPhi("alphaPhi", c, fc, p);
+    scalar_view valVel("valVel", c, p);
+    scalar_view alphaU("alphaU", c, fc, p);
+    feCtrl_->evaluateValue(valCtrl, Z[0]);
+    imp_->compute(alpha, valCtrl, feVel_->cubPts(), 1);
+    fst::scalarMultiplyDataField(alphaPhi, alpha, feCtrl_->N());
+    for (int i = 0; i < d; ++i) {
+      feVel_->evaluateValue(valVel, U[i]);
+      // Multiply velocity with alpha
+      fst::scalarMultiplyDataField(alphaU, valVel, alphaPhi);
+      // Integrate
+      fst::integrate(J[i][0],feVel_->NdetJ(),alphaU,false);
+    }
+    // APPLY DIRICHLET CONDITIONS
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        if (i==1 /* in flow */ || i==2 /* no slip */ ) { // || i==0 /* out flow */) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numBdryDofs = fvidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                //std::cout << "\n   j=" << j << "  l=" << l << "  " << fidx[j][l];
+                for (int n=0; n < d; ++n) {
+                  for (int m=0; m < fc; ++m) {
+                    (J[n][0])(cidx,fvidx_[j][l],m) = static_cast<Real>(0);
+                  }
+                }
+                //for (int m=0; m < fc; ++m) {
+                //  (J[d][0])(cidx,fpidx_[j][l],m) = static_cast<Real>(0);
+                //}
+              }
+            }
+          }
+        }
+      }
+    }
+    // Combine the jacobians.
+    FieldUtils::combineFieldCoeff<Real>(jac, J, fieldInfo_, fieldInfoCtrl_);
+  }
+
+  void Hessian_11(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fv = feVel_->gradN().extent_int(1);
+    const int fp = fePrs_->gradN().extent_int(1);
+    const int p  = feVel_->gradN().extent_int(2);
+    const int d  = feVel_->gradN().extent_int(3);
+    // Initialize hessians.
+    std::vector<std::vector<scalar_view>> H;
+    H.resize(numFields_);
+    H[d].resize(numFields_);
+    for (int i = 0; i < d; ++i) {
+      H[i].resize(numFields_);
+      for (int j = 0; j < d; ++j) {
+        H[i][j] = scalar_view("hess", c, fv, fv);
+      }
+      H[i][d] = scalar_view("hess", c, fv, fp);
+      H[d][i] = scalar_view("hess", c, fp, fv);
+    }
+    H[d][d] = scalar_view("hess", c, fp, fp);
+    // Split coefficients into components.
+    std::vector<scalar_view> L;
+    FieldUtils::splitFieldCoeff<Real>(L, l_coeff, fieldInfo_);
+    // Apply Dirichlet conditions
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        if (i==1 /* in flow */ || i==2 /* no slip */ ) { // || i==0 /* out flow */) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numBdryDofs = fvidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                //std::cout << "\n   j=" << j << "  l=" << l << "  " << fidx[j][l];
+                for (int m=0; m < d; ++m) {
+                  (L[m])(cidx,fvidx_[j][l]) = static_cast<Real>(0);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    // Evaluate/interpolate finite element fields on cells.
+    std::vector<scalar_view> Lval(d), LPhi(d);
+    for (int i = 0; i < d; ++i) {
+      Lval[i] = scalar_view("Lval", c, p);
+      LPhi[i] = scalar_view("Lphi", c, fv, p);
+      feVel_->evaluateValue(Lval[i], L[i]);
+      fst::scalarMultiplyDataField(LPhi[i], Lval[i], feVel_->N());
+    }
+    /*** Evaluate weak form of the Hessian. ***/
+    for (int i = 0; i < d; ++i) {
+      fst::integrate(H[i][i],LPhi[i],feVel_->DNDdetJ(i),false);
+      fst::integrate(H[i][i],feVel_->DNDdetJ(i),LPhi[i],true);
+      for (int j = 0; j < d; ++j) {
+        fst::integrate(H[i][j],feVel_->DNDdetJ(j),LPhi[i],false);
+        fst::integrate(H[i][j],LPhi[j],feVel_->DNDdetJ(i),true);
+      }
+    }
+    // Combine the Hessians.
+    FieldUtils::combineFieldCoeff<Real>(hess, H, fieldInfo_, fieldInfo_);
+  }
+
+  void Hessian_12(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fv = feVel_->gradN().extent_int(1);
+    const int fp = fePrs_->gradN().extent_int(1);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int p  = feVel_->gradN().extent_int(2);
+    const int d  = feVel_->gradN().extent_int(3);
+    // Initialize hessians.
+    std::vector<std::vector<scalar_view>> H(1);
+    H[0].resize(numFields_);
+    for (int i = 0; i < d; ++i) {
+      H[0][i] = scalar_view("hess", c, fc, fv);
+    }
+    H[0][d] = scalar_view("hess", c, fc, fp);
+    // Split coefficients into components.
+    std::vector<scalar_view> L, Z;
+    FieldUtils::splitFieldCoeff<Real>(L, l_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoCtrl_);
+    // Apply Dirichlet conditions
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        if (i==1 /* in flow */ || i==2 /* no slip */ ) { // || i==0 /* out flow */) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numBdryDofs = fvidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                //std::cout << "\n   j=" << j << "  l=" << l << "  " << fidx[j][l];
+                for (int m=0; m < d; ++m) {
+                  (L[m])(cidx,fvidx_[j][l]) = static_cast<Real>(0);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    // Evaluate/interpolate finite element fields on cells.
+    scalar_view Z0("Z0", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view Lval("Lval", c, p);
+    scalar_view alphaL("alphaL", c, fc, p);
+    feCtrl_->evaluateValue(Z0, Z[0]);
+    imp_->compute(alpha, Z0, feVel_->cubPts(), 1);
+    /*** Evaluate weak form of the Hessian. ***/
+    // Velocity equation.
+    for (int i = 0; i < d; ++i) {
+      // Multiply velocity with alpha
+      feVel_->evaluateValue(Lval, L[i]);
+      for (int j = 0; j < c; ++j) {
+        for (int k = 0; k < fc; ++k) {
+          for (int l = 0; l < p; ++l) {
+            alphaL(j,k,l) = alpha(j,l) * (feCtrl_->N())(j,k,l) * Lval(j,l);
+          }
+        }
+      }
+      fst::integrate(H[0][i],alphaL,feVel_->NdetJ(),false);
+    }
+    // Combine the Hessians.
+    FieldUtils::combineFieldCoeff<Real>(hess, H, fieldInfoCtrl_, fieldInfo_);
+  }
+
+  void Hessian_21(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fv = feVel_->gradN().extent_int(1);
+    const int fp = fePrs_->gradN().extent_int(1);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int p  = feVel_->gradN().extent_int(2);
+    const int d  = feVel_->gradN().extent_int(3);
+    // Initialize hessians.
+    std::vector<std::vector<scalar_view>> H(numFields_);
+    for (int i = 0; i < d; ++i) {
+      H[i].resize(1);
+      H[i][0] = scalar_view("hess", c, fv, fc);
+    }
+    H[d].resize(1);
+    H[d][0] = scalar_view("hess", c, fp, fc);
+    // Split coefficients into components.
+    std::vector<scalar_view> L, Z;
+    FieldUtils::splitFieldCoeff<Real>(L, l_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoCtrl_);
+    // Apply Dirichlet conditions
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        if (i==1 /* in flow */ || i==2 /* no slip */ ) { // || i==0 /* out flow */) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numBdryDofs = fvidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                //std::cout << "\n   j=" << j << "  l=" << l << "  " << fidx[j][l];
+                for (int m=0; m < d; ++m) {
+                  (L[m])(cidx,fvidx_[j][l]) = static_cast<Real>(0);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    // Evaluate/interpolate finite element fields on cells.
+    scalar_view Z0("Z0", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view Lval("Lval", c, p);
+    scalar_view alphaL("alphaL", c, fc, p);
+    feCtrl_->evaluateValue(Z0, Z[0]);
+    imp_->compute(alpha, Z0, feVel_->cubPts(), 1);
+    /*** Evaluate weak form of the Hessian. ***/
+    // Velocity equation.
+    for (int i = 0; i < d; ++i) {
+      // Multiply velocity with alpha
+      feVel_->evaluateValue(Lval, L[i]);
+      for (int j = 0; j < c; ++j) {
+        for (int k = 0; k < fc; ++k) {
+          for (int l = 0; l < p; ++l) {
+            alphaL(j,k,l) = alpha(j,l) * (feCtrl_->N())(j,k,l) * Lval(j,l);
+          }
+        }
+      }
+      fst::integrate(H[i][0],feVel_->NdetJ(),alphaL,false);
+    }
+    // Combine the Hessians.
+    FieldUtils::combineFieldCoeff<Real>(hess, H, fieldInfo_, fieldInfoCtrl_);
+  }
+
+  void Hessian_22(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    const int p  = feVel_->gradN().extent_int(2);
+    const int d  = feVel_->gradN().extent_int(3);
+    // Initialize hessians.
+    std::vector<std::vector<scalar_view>> H(1);
+    H[0].resize(1);
+    H[0][0] = scalar_view("hess", c, fc, fc);
+    // Split coefficients into components.
+    std::vector<scalar_view> L, U, Z;
+    FieldUtils::splitFieldCoeff<Real>(L, l_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(U, u_coeff, fieldInfo_);
+    FieldUtils::splitFieldCoeff<Real>(Z, z_coeff, fieldInfoCtrl_);
+    // Apply Dirichlet conditions
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        if (i==1 /* in flow */ || i==2 /* no slip */ ) { // || i==0 /* out flow */) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numBdryDofs = fvidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                //std::cout << "\n   j=" << j << "  l=" << l << "  " << fidx[j][l];
+                for (int m=0; m < d; ++m) {
+                  (L[m])(cidx,fvidx_[j][l]) = static_cast<Real>(0);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    // Evaluate/interpolate finite element fields on cells.
+    std::vector<scalar_view> Lval(d), Uval(d);
+    scalar_view Z0("Z0", c, p);
+    scalar_view alpha("alpha", c, p);
+    scalar_view alphaUL("alphaUL", c, fc, p);
+    for (int i = 0; i < d; ++i) {
+      Lval[i] = scalar_view("Lval", c, p);
+      Uval[i] = scalar_view("Uval", c, p);
+      feVel_->evaluateValue(Lval[i], L[i]);
+      feVel_->evaluateValue(Uval[i], U[i]);
+    }
+    feCtrl_->evaluateValue(Z0, Z[0]);
+    imp_->compute(alpha, Z0, feVel_->cubPts(), 2);
+    // Multiply velocity with alpha
+    Real dot(0);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < fc; ++j) {
+        for (int k = 0; k < p; ++k) {
+          dot = static_cast<Real>(0);
+          for (int l = 0; l < d; ++l) {
+            dot += (Uval[l])(i,k) * (Lval[l])(i,k);
+          }
+          alphaUL(i,j,k) = alpha(i,k) * (feCtrl_->N())(i,j,k) * dot;
+        }
+      }
+    }
+    /*** Evaluate weak form of the Hessian. ***/
+    // Velocity equation.
+    fst::integrate(H[0][0],alphaUL,feCtrl_->NdetJ(),false);
+    // Combine hessians.
+    FieldUtils::combineFieldCoeff<Real>(hess, H, fieldInfoCtrl_, fieldInfoCtrl_);
+  }
+
+  void RieszMap_1(scalar_view & riesz) override {
+    // Retrieve dimensions.
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fv = feVel_->gradN().extent_int(1);
+    const int fp = fePrs_->gradN().extent_int(1);
+    const int d  = feVel_->gradN().extent_int(3);
+    // Initialize Riesz maps.
+    std::vector<std::vector<scalar_view>> J(numFields_);
+    J[d].resize(numFields_);
+    for (int i = 0; i < d; ++i) {
+      J[i].resize(numFields_);
+      for (int j = 0; j < d; ++j) {
+        J[i][j] = scalar_view("riesz1", c, fv, fv);
+      }
+      J[i][d] = scalar_view("riesz1", c, fv, fp);
+      J[d][i] = scalar_view("riesz1", c, fp, fv);
+    }
+    J[d][d] = scalar_view("riesz1", c, fp, fp);
+    // Build Riesz maps.
+    for (int i = 0; i < d; ++i) {
+      Kokkos::deep_copy(J[i][i],feVel_->stiffMat());
+      rst::add(J[i][i],feVel_->massMat());
+    }
+    Kokkos::deep_copy(J[d][d],fePrs_->massMat());
+    // Combine Riesz maps.
+    FieldUtils::combineFieldCoeff<Real>(riesz, J, fieldInfo_, fieldInfo_);
+  }
+
+  void RieszMap_2(scalar_view & riesz) override {
+    // Retrieve dimensions.
+    const int c  = feVel_->gradN().extent_int(0);
+    const int fc = feCtrl_->gradN().extent_int(1);
+    // Initialize residuals.
+    std::vector<std::vector<scalar_view>> J(1);
+    J[0].resize(1);
+    J[0][0] = scalar_view("riesz2", c, fc, fc);
+    Kokkos::deep_copy(J[0][0],feCtrl_->massMat());
+    // Combine Riesz maps.
+    FieldUtils::combineFieldCoeff<Real>(riesz, J, fieldInfoCtrl_, fieldInfoCtrl_);
+  }
+
+  std::vector<basis_ptr> getFields() override {
+    return basisPtrs_;
+  }
+
+  std::vector<basis_ptr> getFields2() override {
+    return basisPtrsCtrl_;
+  }
+
+  void setCellNodes(const scalar_view &volCellNodes,
+                    const std::vector<std::vector<scalar_view>> &bdryCellNodes,
+                    const std::vector<std::vector<std::vector<int>>> &bdryCellLocIds) override {
+    volCellNodes_ = volCellNodes;
+    bdryCellNodes_ = bdryCellNodes;
+    bdryCellLocIds_ = bdryCellLocIds;
+    // Finite element definition.
+    feVel_  = ROL::makePtr<fe_type>(volCellNodes_,basisPtrVel_,cellCub_);
+    fePrs_  = ROL::makePtr<fe_type>(volCellNodes_,basisPtrPrs_,cellCub_);
+    feCtrl_ = ROL::makePtr<fe_type>(volCellNodes_,basisPtrCtrl_,cellCub_,false);
+    fvidx_  = feVel_->getBoundaryDofs();
+    fpidx_  = fePrs_->getBoundaryDofs();
+    // Construct control boundary FE
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      feVelBdry_.resize(numSideSets);
+      for (int i = 0; i < numSideSets; ++i) {
+        int numLocSides = bdryCellNodes[i].size();
+        feVelBdry_[i].resize(numLocSides);
+        for (int j = 0; j < numLocSides; ++j) {
+          if (bdryCellNodes[i][j] != scalar_view()) {
+            feVelBdry_[i][j] = ROL::makePtr<fe_type>(bdryCellNodes[i][j],basisPtrVel_,bdryCub_,j);
+          }
+        }
+      }
+    }
+  }
+
+  void setFieldPattern(const std::vector<std::vector<int>> &fieldPattern,
+                       const std::vector<std::vector<int>> &fieldPattern2) override {
+    fieldPattern_ = fieldPattern;
+    fieldInfo_    = ROL::makePtr<FieldUtils::FieldInfo>(numFields_,numDofs_,numFieldDofs_,fieldPattern_);
+    fieldPatternCtrl_ = fieldPattern2;
+    fieldInfoCtrl_    = ROL::makePtr<FieldUtils::FieldInfo>(numFieldsCtrl_,numDofsCtrl_,numFieldDofsCtrl_,fieldPatternCtrl_);
+    fieldHelper_ = ROL::makePtr<FieldHelper<Real,DeviceType>>(numFields_, numDofs_, numFieldDofs_, fieldPattern_);
+  }
+
+  const ROL::Ptr<fe_type> getVelocityFE(void) const {
+    return feVel_;
+  }
+
+  const ROL::Ptr<fe_type> getPressureFE(void) const {
+    return fePrs_;
+  }
+
+  const ROL::Ptr<fe_type> getControlFE(void) const {
+    return feCtrl_;
+  }
+
+  const std::vector<ROL::Ptr<fe_type>> getVelocityBdryFE(const int sideset = -1) const {
+    int side = sideset;
+    if ( sideset < 0 || sideset > 2 ) {
+      //side = (useDirichletControl_) ? 6 : 10;
+      side = 0;
+    }
+    return feVelBdry_[side];
+  }
+
+  const std::vector<std::vector<int>> getBdryCellLocIds(const int sideset = -1) const {
+    int side = sideset;
+    if ( sideset < 0 || sideset > 2 ) {
+      //side = (useDirichletControl_) ? 6 : 10;
+      side = 0;
+    }
+    return bdryCellLocIds_[side];
+  }
+
+  const ROL::Ptr<FieldHelper<Real,DeviceType>> getFieldHelper(void) const {
+    return fieldHelper_;
+  }
+
+  const ROL::Ptr<const FieldUtils::FieldInfo> getStateFieldInfo(void) const {
+    return fieldInfo_;
+  }
+
+  const ROL::Ptr<const FieldUtils::FieldInfo> getControlFieldInfo(void) const {
+    return fieldInfoCtrl_;
+  }
+
+}; // PDE_NavierStokes
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/topo-opt/poisson/CMakeLists.txt b/packages/rol/example/PDE-OPT/topo-opt/poisson/CMakeLists.txt
index cd10cbc5d7e6..b64683899386 100644
--- a/packages/rol/example/PDE-OPT/topo-opt/poisson/CMakeLists.txt
+++ b/packages/rol/example/PDE-OPT/topo-opt/poisson/CMakeLists.txt
@@ -8,7 +8,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
   # Need ROL_TpetraMultiVector.hpp
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
 
-  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  ROL_ADD_EXECUTABLE_AND_TEST(
     example_01
     SOURCES example_01.cpp
     ARGS PrintItAll
@@ -18,7 +18,7 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     PoissonTopOptDataCopy
     SOURCE_FILES
       input_ex01.xml plotresults.m
@@ -26,5 +26,33 @@ IF(${PROJECT_NAME}_ENABLE_Intrepid AND
     DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
   )
 
+ENDIF()
+
+IF(${PROJECT_NAME}_ENABLE_Intrepid2 AND
+   ${PROJECT_NAME}_ENABLE_Ifpack2   AND
+   ${PROJECT_NAME}_ENABLE_MueLu     AND
+   ${PROJECT_NAME}_ENABLE_Amesos2   AND
+   ${PROJECT_NAME}_ENABLE_Tpetra )
+
+  # Need ROL_TpetraMultiVector.hpp
+  TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
+
+  ROL_ADD_EXECUTABLE_AND_TEST(
+    example_01_Kokkos
+    SOURCES example_01K.cpp
+    ARGS PrintItAll
+    NUM_MPI_PROCS 4
+    NUM_TOTAL_CORES_USED 4
+    PASS_REGULAR_EXPRESSION "TEST PASSED"
+    ADD_DIR_TO_NAME
+  )
+
+  ROL_COPY_FILES_TO_BINARY_DIR(
+    PoissonTopOptDataCopyK
+    SOURCE_FILES
+      input_ex01.xml plotresults.m
+    SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}"
+    DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
+  )
 
 ENDIF()
diff --git a/packages/rol/example/PDE-OPT/topo-opt/poisson/example_01.cpp b/packages/rol/example/PDE-OPT/topo-opt/poisson/example_01.cpp
index 586d5c3afab2..b35270edd36c 100644
--- a/packages/rol/example/PDE-OPT/topo-opt/poisson/example_01.cpp
+++ b/packages/rol/example/PDE-OPT/topo-opt/poisson/example_01.cpp
@@ -13,8 +13,7 @@
 
 #include "Teuchos_Comm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 #include "Tpetra_Version.hpp"
@@ -43,7 +42,7 @@ int main(int argc, char *argv[]) {
   ROL::nullstream bhs; // outputs nothing
 
   /*** Initialize communicator. ***/
-  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
   ROL::Ptr<const Teuchos::Comm<int>> comm
     = Tpetra::getDefaultComm();
   const int myRank = comm->getRank();
diff --git a/packages/rol/example/PDE-OPT/topo-opt/poisson/example_01K.cpp b/packages/rol/example/PDE-OPT/topo-opt/poisson/example_01K.cpp
new file mode 100644
index 000000000000..e4bee83d372f
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/topo-opt/poisson/example_01K.cpp
@@ -0,0 +1,189 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  example_01.cpp
+    \brief Shows how to solve the Poisson topology optimization problem.
+*/
+
+#include "Teuchos_Comm.hpp"
+#include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
+
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Version.hpp"
+
+#include <iostream>
+#include <algorithm>
+
+#include "ROL_Solver.hpp"
+#include "ROL_Bounds.hpp"
+
+#include "../../TOOLS/pdevectorK.hpp"
+
+#include "pde_poisson_topOptK.hpp"
+#include "mesh_poisson_topOptK.hpp"
+#include "../src/filtered_compliance_robjK.hpp"
+#include "../src/volume_conK.hpp"
+#include "../src/obj_volumeK.hpp"
+#include "../src/pde_filterK.hpp"
+
+using RealT = double;
+using DeviceT = Kokkos::HostSpace;
+
+int main(int argc, char *argv[]) {
+  // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+  int iprint     = argc - 1;
+  ROL::Ptr<std::ostream> outStream;
+  ROL::nullstream bhs; // outputs nothing
+
+  /*** Initialize communicator. ***/
+  ROL::GlobalMPISession mpiSession (&argc, &argv, &bhs);
+  Kokkos::ScopeGuard kokkosScope (argc, argv);
+  ROL::Ptr<const Teuchos::Comm<int>> comm
+    = Tpetra::getDefaultComm();
+  const int myRank = comm->getRank();
+  if ((iprint > 0) && (myRank == 0)) {
+    outStream = ROL::makePtrFromRef(std::cout);
+  }
+  else {
+    outStream = ROL::makePtrFromRef(bhs);
+  }
+  int errorFlag  = 0;
+
+  // *** Example body.
+  try {
+    RealT tol(1.e-8);
+
+    /*** Read in XML input ***/
+    std::string filename = "input_ex01.xml";
+    auto parlist = ROL::getParametersFromXmlFile(filename);
+
+    // Retrieve parameters.
+    const RealT volFraction  = parlist->sublist("Problem").get("Volume Fraction", 0.4);
+    const RealT initDens     = parlist->sublist("Problem").get("Initial Density",volFraction);
+    const bool useFilter     = parlist->sublist("Problem").get("Use Filter", true);
+    const bool normalizeObj  = parlist->sublist("Problem").get("Normalize Compliance", true);
+    const bool volEq         = parlist->sublist("Problem").get("Use Volume Equality Constraint",true);
+
+    /*** Initialize main data structure. ***/
+    auto meshMgr = ROL::makePtr<MeshManager_Poisson_TopOpt<RealT,DeviceT>>(*parlist);
+
+    // Initialize PDE describe Poisson's equation
+    auto pde = ROL::makePtr<PDE_Poisson_TopOpt<RealT,DeviceT>>(*parlist);
+
+    // Initialize the filter PDE.
+    ROL::Ptr<PDE<RealT,DeviceT>> pdeFilter = pde;
+
+    // Initialize "filtered" of "unfiltered" constraint.
+    ROL::Ptr<ROL::Objective<RealT>> robj;
+    ROL::Ptr<Assembler<RealT,DeviceT>> assembler, assemblerFilter;
+    if (useFilter) {
+      pdeFilter = ROL::makePtr<PDE_Filter<RealT,DeviceT>>(*parlist);
+      pde->setDensityFields(pdeFilter->getFields());
+      robj = ROL::makePtr<TopOptFilteredComplianceObjective<RealT,DeviceT>>(
+             pde,pdeFilter,meshMgr,comm,*parlist,*outStream);
+      assembler = ROL::dynamicPtrCast<TopOptFilteredComplianceObjective<RealT,DeviceT>>(robj)->getAssembler();
+      assemblerFilter = ROL::dynamicPtrCast<TopOptFilteredComplianceObjective<RealT,DeviceT>>(robj)->getFilterAssembler();
+    }
+    else {
+      robj = ROL::makePtr<TopOptComplianceObjective<RealT,DeviceT>>(
+             pde,meshMgr,comm,*parlist,*outStream);
+      assembler = ROL::dynamicPtrCast<TopOptComplianceObjective<RealT,DeviceT>>(robj)->getAssembler();
+      assemblerFilter = assembler;
+    }
+
+    // Create vectors
+    auto u_ptr = assembler->createStateVector();         u_ptr->putScalar(0.0);
+    auto p_ptr = assembler->createStateVector();         p_ptr->putScalar(0.0);
+    auto r_ptr = assembler->createResidualVector();      r_ptr->putScalar(0.0);
+    auto z_ptr = assemblerFilter->createControlVector(); z_ptr->putScalar(1.0);
+    auto up = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(u_ptr,pde,assembler,*parlist);
+    auto pp = ROL::makePtr<PDE_PrimalSimVector<RealT,DeviceT>>(p_ptr,pde,assembler,*parlist);
+    auto rp = ROL::makePtr<PDE_DualSimVector<RealT,DeviceT>>(r_ptr,pde,assembler,*parlist);
+    auto zp = ROL::makePtr<PDE_PrimalOptVector<RealT,DeviceT>>(z_ptr,pdeFilter,assemblerFilter,*parlist);
+
+    // Build volume objective function.
+    ROL::Ptr<QoI<RealT,DeviceT>> qoi_vol;
+    if (useFilter)
+      qoi_vol = ROL::makePtr<QoI_Volume_TopoOpt<RealT,DeviceT>>(
+                  ROL::dynamicPtrCast<PDE_Filter<RealT,DeviceT>>(pdeFilter)->getDensityFE(),
+                  volFraction);
+    else
+      qoi_vol = ROL::makePtr<QoI_Volume_TopoOpt<RealT,DeviceT>>(pde->getDensityFE(), volFraction);
+    ROL::Ptr<ROL::Constraint<RealT>> con_vol = ROL::makePtr<TopOptVolumeConstraint<RealT,DeviceT>>(qoi_vol,assemblerFilter,zp);
+    ROL::Ptr<ROL::Vector<RealT>> imul = ROL::makePtr<ROL::SingletonVector<RealT>>(0);
+    ROL::Ptr<ROL::Vector<RealT>> ilp  = ROL::makePtr<ROL::SingletonVector<RealT>>(0);
+    ROL::Ptr<ROL::Vector<RealT>> iup  = ROL::makePtr<ROL::SingletonVector<RealT>>(0);
+    zp->zero(); con_vol->value(*ilp,*zp,tol);
+    ROL::Ptr<ROL::BoundConstraint<RealT>> ibnd = ROL::makePtr<ROL::Bounds<RealT>>(ilp,iup);
+
+    // Define bound constraint
+    auto zlo = zp->clone(); zlo->setScalar(0.0);
+    auto zhi = zp->clone(); zhi->setScalar(1.0);
+    ROL::Ptr<ROL::BoundConstraint<RealT>> bnd = ROL::makePtr<ROL::Bounds<RealT>>(zlo,zhi);
+
+    // Normalize compliance objective function
+    zp->setScalar(initDens);
+    RealT cs(1);
+    if (normalizeObj) {
+      if (useFilter)
+        cs = ROL::dynamicPtrCast<TopOptFilteredComplianceObjective<RealT,DeviceT>>(robj)->normalize(*zp,tol);
+      else
+        cs = ROL::dynamicPtrCast<TopOptComplianceObjective<RealT,DeviceT>>(robj)->normalize(*zp,tol);
+    }
+    *outStream << std::endl;
+    *outStream << "Problem Data"          << std::endl;
+    *outStream << "  SIMP Power:        "
+               << parlist->sublist("Problem").get("SIMP Power",3.0) << std::endl;
+    *outStream << "  Use Filter:        " << useFilter << std::endl;
+    *outStream << "  Volume Fraction:   " << volFraction << std::endl;
+    *outStream << "  Initial Density:   " << initDens    << std::endl;
+    *outStream << "  Use Equality:      " << volEq       << std::endl;
+    *outStream << "  Compliance Scale:  " << cs          << std::endl;
+    *outStream << std::endl;
+
+    // Initialize optimization problem.
+    auto opt = ROL::makePtr<ROL::Problem<RealT>>(robj,zp);
+    opt->addBoundConstraint(bnd);
+    if (volEq) opt->addLinearConstraint("Volume",con_vol,imul);
+    else       opt->addLinearConstraint("Volume",con_vol,imul,ibnd);
+    opt->setProjectionAlgorithm(*parlist);
+    opt->finalize(false,true,*outStream);
+
+    // Run derivative checks
+    bool checkDeriv = parlist->sublist("Problem").get("Check derivatives",false);
+    if ( checkDeriv ) opt->check(true,*outStream);
+
+    // Solve optimization problem
+    ROL::Solver<RealT> solver(opt,*parlist);
+    Teuchos::Time algoTimer("Algorithm Time", true);
+    solver.solve(*outStream);
+    algoTimer.stop();
+    *outStream << "Total optimization time = " << algoTimer.totalElapsedTime() << " seconds.\n";
+
+    // Output.
+    ROL::dynamicPtrCast<TopOptFilteredComplianceObjective<RealT,DeviceT>>(robj)->summarize(*outStream);
+    ROL::staticPtrCast<TopOptVolumeConstraint<RealT,DeviceT>>(con_vol)->summarize(*outStream);
+    ROL::dynamicPtrCast<TopOptFilteredComplianceObjective<RealT,DeviceT>>(robj)->printToFile(*zp,*outStream);
+
+    // Get a summary from the time monitor.
+    Teuchos::TimeMonitor::summarize();
+  }
+  catch (std::logic_error& err) {
+    *outStream << err.what() << "\n";
+    errorFlag = -1000;
+  }; // end try
+
+  if (errorFlag != 0)
+    std::cout << "End Result: TEST FAILED\n";
+  else
+    std::cout << "End Result: TEST PASSED\n";
+
+  return 0;
+}
diff --git a/packages/rol/example/PDE-OPT/topo-opt/poisson/mesh_poisson_topOptK.hpp b/packages/rol/example/PDE-OPT/topo-opt/poisson/mesh_poisson_topOptK.hpp
new file mode 100644
index 000000000000..c94a077b8322
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/topo-opt/poisson/mesh_poisson_topOptK.hpp
@@ -0,0 +1,95 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+#include "../../TOOLS/meshmanagerK.hpp"
+
+template <class Real, class DeviceType>
+class MeshManager_Poisson_TopOpt : public MeshManager_Rectangle<Real,DeviceType> {
+
+private:
+
+  int nx_;
+  int ny_;
+  ROL::Ptr<std::vector<std::vector<std::vector<int>>>>  meshSideSets_;
+
+public: 
+
+  MeshManager_Poisson_TopOpt(ROL::ParameterList &parlist) : MeshManager_Rectangle<Real,DeviceType>(parlist)
+  {
+    nx_ = parlist.sublist("Geometry").get("NX", 3);
+    ny_ = parlist.sublist("Geometry").get("NY", 3);
+    computeSideSets();
+  }
+
+
+  void computeSideSets() override {
+
+    int numSideSets = 4;
+    meshSideSets_ = ROL::makePtr<std::vector<std::vector<std::vector<int>>>>(numSideSets);
+
+    int ny3 = ny_ / 3;
+    int nyr = ny_ % 3;
+
+    int n1 = (nyr == 2) ? ny3+1 : ny3;
+    int n2 = (nyr == 1) ? ny3+1 : ny3;
+    int n3 = (nyr == 2) ? ny3+1 : ny3;
+
+    // Neumann sides
+    (*meshSideSets_)[0].resize(4);
+    (*meshSideSets_)[0][0].resize(nx_); // x = (0,1), y = {0}
+    (*meshSideSets_)[0][1].resize(ny_); // x = {1}, y = (0,1)
+    (*meshSideSets_)[0][2].resize(nx_); // x = (0,1), y = {1}
+    (*meshSideSets_)[0][3].resize(0);
+    // Neumann side
+    (*meshSideSets_)[1].resize(4);
+    (*meshSideSets_)[1][0].resize(0);
+    (*meshSideSets_)[1][1].resize(0);
+    (*meshSideSets_)[1][2].resize(0);
+    (*meshSideSets_)[1][3].resize(n1);
+    // Dirichlet side
+    (*meshSideSets_)[2].resize(4);
+    (*meshSideSets_)[2][0].resize(0);
+    (*meshSideSets_)[2][1].resize(0);
+    (*meshSideSets_)[2][2].resize(0);
+    (*meshSideSets_)[2][3].resize(n2);
+    // Neuman side
+    (*meshSideSets_)[3].resize(4);
+    (*meshSideSets_)[3][0].resize(0);
+    (*meshSideSets_)[3][1].resize(0);
+    (*meshSideSets_)[3][2].resize(0);
+    (*meshSideSets_)[3][3].resize(n3);
+    
+    for (int i=0; i<nx_; ++i)
+      (*meshSideSets_)[0][0][i] = i;
+    for (int i=0; i<ny_; ++i)
+      (*meshSideSets_)[0][1][i] = (i+1)*nx_-1;
+    for (int i=0; i<nx_; ++i)
+      (*meshSideSets_)[0][2][i] = i + nx_*(ny_-1);
+    for (int i=0; i<ny_; ++i) {
+      if ( i < n1 )
+        (*meshSideSets_)[1][3][i] = i*nx_;
+      if ( i >= n1 && i < n1+n2 )
+        (*meshSideSets_)[2][3][i-n1] = i*nx_;
+      if ( i >= n1+n2 )
+        (*meshSideSets_)[3][3][i-n1-n2] = i*nx_;
+    }
+
+  } // computeSideSets
+
+  ROL::Ptr<std::vector<std::vector<std::vector<int>>>> getSideSets(
+              const bool verbose = false,
+              std::ostream & outStream = std::cout) const override { 
+    if ( verbose ) {
+      outStream << "Mesh_Poisson_TopOpt: getSideSets called" << std::endl;
+      outStream << "Mesh_Poisson_TopOpt: numSideSets = " << meshSideSets_->size() << std::endl;
+    }
+    return meshSideSets_;
+  }
+  
+}; // MeshManager_TopoOpt
diff --git a/packages/rol/example/PDE-OPT/topo-opt/poisson/pde_poisson_topOptK.hpp b/packages/rol/example/PDE-OPT/topo-opt/poisson/pde_poisson_topOptK.hpp
new file mode 100644
index 000000000000..e849a195dc29
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/topo-opt/poisson/pde_poisson_topOptK.hpp
@@ -0,0 +1,553 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  pde_poisson_topOpt.hpp
+    \brief Implements the local PDE interface for the Poisson
+           topology optimization problem.
+*/
+
+#ifndef PDE_POISSON_TOPOPTK_HPP
+#define PDE_POISSON_TOPOPTK_HPP
+
+#include "../../TOOLS/pdeK.hpp"
+#include "../../TOOLS/feK.hpp"
+
+#include "Intrepid2_HVOL_C0_FEM.hpp"
+#include "Intrepid2_HGRAD_QUAD_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_QUAD_C2_FEM.hpp"
+#include "Intrepid2_DefaultCubatureFactory.hpp"
+#include "Intrepid2_FunctionSpaceTools.hpp"
+#include "Intrepid2_RealSpaceTools.hpp"
+#include "Intrepid2_CellTools.hpp"
+
+#include "ROL_Ptr.hpp"
+
+template <class Real, class DeviceType>
+class PDE_Poisson_TopOpt : public PDE<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+private:
+  // Finite element basis information
+  basis_ptr basisPtr_, basisPtrDens_;
+  std::vector<basis_ptr> basisPtrs_, basisPtrsDens_;
+  // Cell cubature information
+  ROL::Ptr<Intrepid2::Cubature<DeviceType,Real,Real>> cellCub_;
+  // Cell node information
+  scalar_view volCellNodes_;
+  std::vector<std::vector<scalar_view>> bdryCellNodes_;
+  std::vector<std::vector<std::vector<int>>> bdryCellLocIds_;
+  // Finite element definition
+  ROL::Ptr<fe_type> fe_vol_, fe_dens_;
+  // Local degrees of freedom on boundary, for each side of the reference cell (first index).
+  std::vector<std::vector<int>> fidx_;
+  // Coordinates of degrees freedom on boundary cells.
+  // Indexing:  [sideset number][local side id](cell number, value at dof)
+  std::vector<std::vector<scalar_view>> bdryCellDofValues_;
+  // Force function evaluated at cubature points
+  scalar_view force_eval_, nforce_eval_;
+  // Inputs
+  Real minConductivity_, SIMPpower_;
+  bool getFields2called_;
+
+  Real ForceFunc(const std::vector<Real> &x) const {
+    return static_cast<Real>(0.01);
+  }
+
+  Real dirichletFunc(const std::vector<Real> & coords, const int sideset, const int locSideId) const {
+    return static_cast<Real>(0);
+  }
+
+public:
+  PDE_Poisson_TopOpt(Teuchos::ParameterList &parlist) : getFields2called_(false) {
+    // Finite element fields.
+    int basisOrder     = parlist.sublist("Problem").get("Order of FE discretization",1);
+    int basisOrderDens = parlist.sublist("Problem").get("Density Basis Order",0);
+    TEUCHOS_TEST_FOR_EXCEPTION(basisOrder > 2 || basisOrder < 1, std::invalid_argument,
+      ">>> PDE-OPT/topopt/poisson/pde_poisson_topOpt.hpp: Basis order is not 1 or 2!");
+    TEUCHOS_TEST_FOR_EXCEPTION(basisOrderDens > 1 || basisOrderDens < 0, std::invalid_argument,
+      ">>> PDE-OPT/topopt/poisson/pde_poisson_topOpt.hpp: Basis order is not 0 or 1!");
+    if (basisOrder == 1)
+      basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C1_FEM<DeviceType,Real,Real>>();
+    else if (basisOrder == 2)
+      basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C2_FEM<DeviceType,Real,Real>>();
+    shards::CellTopology cellType = basisPtr_->getBaseCellTopology();        // get the cell type from any basis
+    if (basisOrderDens == 1)
+      basisPtrDens_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C1_FEM<DeviceType,Real,Real>>();
+    else
+      basisPtrDens_ = ROL::makePtr<Intrepid2::Basis_HVOL_C0_FEM<DeviceType,Real,Real>>(cellType);
+    basisPtrs_.clear();     basisPtrs_.push_back(basisPtr_);
+    basisPtrsDens_.clear(); basisPtrsDens_.push_back(basisPtrDens_); // Density component
+    // Quadrature rules.
+    Intrepid2::DefaultCubatureFactory cubFactory;                            // create cubature factory
+    int cubDegree = parlist.sublist("Problem").get("Cubature Degree",2);     // set cubature degree, e.g., 2
+    cellCub_ = cubFactory.create<DeviceType,Real,Real>(cellType, cubDegree); // create default cubature
+
+    minConductivity_ = parlist.sublist("Problem").get("Minimum Conductivity",1.e-3);
+    SIMPpower_       = parlist.sublist("Problem").get("SIMP Power",3.0);
+  }
+
+  void residual(scalar_view & res,
+                const scalar_view u_coeff,
+                const scalar_view z_coeff = scalar_view(),
+                const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    const Real one(1);
+    // Get dimensions
+    const int c = fe_vol_->gradN().extent_int(0);
+    const int f = fe_vol_->gradN().extent_int(1);
+    const int p = fe_vol_->gradN().extent_int(2);
+    const int d = fe_vol_->gradN().extent_int(3);
+    // Initialize residual storage
+    res = scalar_view("res",c,f);
+    // Temporary storage
+    scalar_view valZ_eval("valZ_eval",c,p);
+    scalar_view gradU_eval("gradU_eval",c,p,d);
+    scalar_view KgradU("KgradU",c,p,d);
+    // Build SIMP density at cubature points
+    fe_dens_->evaluateValue(valZ_eval, z_coeff);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        valZ_eval(i,j) = minConductivity_
+          + (one - minConductivity_) * std::pow(valZ_eval(i,j),SIMPpower_);
+      }
+    }
+    // Build flux function
+    fe_vol_->evaluateGradient(gradU_eval, u_coeff);
+    fst::scalarMultiplyDataData(KgradU,valZ_eval,gradU_eval);
+    // Integrate stiffness term
+    fst::integrate(res,KgradU,fe_vol_->gradNdetJ(),false);
+    // Add force term
+    fst::integrate(res,nforce_eval_,fe_vol_->NdetJ(),true);
+    // Apply Dirichlet boundary conditions
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        if (i==2) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numBdryDofs = fidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                res(cidx,fidx_[j][l]) = u_coeff(cidx,fidx_[j][l]) - (bdryCellDofValues_[i][j])(k,fidx_[j][l]);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  void Jacobian_1(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    const Real one(1);
+    // Get dimensions
+    const int c = fe_vol_->gradN().extent_int(0);
+    const int f = fe_vol_->gradN().extent_int(1);
+    const int p = fe_vol_->gradN().extent_int(2);
+    const int d = fe_vol_->gradN().extent_int(3);
+    // Initialize Jacobian storage
+    jac = scalar_view("jac", c, f, f);
+    scalar_view valZ_eval("valZ_eval", c, p);
+    scalar_view KgradN("KgradN", c, f, p, d);
+    // Build density-dependent conductivity function
+    fe_dens_->evaluateValue(valZ_eval, z_coeff);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        valZ_eval(i,j) = minConductivity_
+          + (one - minConductivity_) * std::pow(valZ_eval(i,j),SIMPpower_);
+      }
+    }
+    // Build flux function
+    fst::scalarMultiplyDataField(KgradN,valZ_eval,fe_vol_->gradN());
+    // Integrate stiffness term
+    fst::integrate(jac,KgradN,fe_vol_->gradNdetJ(),false);
+
+    // Apply Dirichlet boundary conditions
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        if (i==2) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numBdryDofs = fidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                for (int m = 0; m < f; ++m) {
+                  jac(cidx,fidx_[j][l],m) = static_cast<Real>(0);
+                }
+                jac(cidx,fidx_[j][l],fidx_[j][l]) = static_cast<Real>(1);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  void Jacobian_2(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    const Real one(1);
+    // Get dimensions
+    const int c  = fe_vol_->gradN().extent_int(0);
+    const int f  = fe_vol_->gradN().extent_int(1);
+    const int fd = fe_dens_->gradN().extent_int(1);
+    const int p  = fe_vol_->gradN().extent_int(2);
+    const int d  = fe_vol_->gradN().extent_int(3);
+    // Initialize Jacobian storage
+    jac = scalar_view("jac", c, f, fd);
+    // Temporary storage
+    scalar_view valZ_eval("valZ_eval", c, p);
+    scalar_view gradU_eval("gradU_eval", c, p, d);
+    scalar_view dKN("dKN", c, fd, p);
+    scalar_view gradUgradN("gradUgradN", c, f, p);
+    // Build density-dependent conductivity function
+    fe_dens_->evaluateValue(valZ_eval, z_coeff);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        valZ_eval(i,j) = SIMPpower_*(one - minConductivity_)
+          * std::pow(valZ_eval(i,j),SIMPpower_-one);
+      }
+    }
+    // Build derivative of conductivity function
+    fst::scalarMultiplyDataField(dKN,valZ_eval,fe_dens_->N());
+    // Integrate stiffness term
+    fe_vol_->evaluateGradient(gradU_eval, u_coeff);
+    fst::dotMultiplyDataField(gradUgradN,gradU_eval,fe_vol_->gradNdetJ());
+    fst::integrate(jac,gradUgradN,dKN,false);
+
+    // Apply Dirichlet boundary conditions
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        if (i==2) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numBdryDofs = fidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                for (int m=0; m < f; ++m) {
+                  jac(cidx,fidx_[j][l],m) = static_cast<Real>(0);
+                }
+              }
+            }
+          }
+	}
+      }
+    }
+  }
+
+  void Hessian_11(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Poisson_TopOpt::Hessian_11): Zero Hessian.");
+  }
+
+  void Hessian_12(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    const Real one(1);
+    // Get dimensions
+    const int c  = fe_vol_->gradN().extent_int(0);
+    const int f  = fe_vol_->gradN().extent_int(1);
+    const int fd = fe_dens_->gradN().extent_int(1);
+    const int p  = fe_vol_->gradN().extent_int(2);
+    const int d  = fe_vol_->gradN().extent_int(3);
+    // Initialize Hessian storage
+    hess = scalar_view("hess", c, fd, f);
+    // Temporary storage
+    scalar_view valZ_eval("valZ_eval", c, p);
+    scalar_view l0_coeff("l0_coeff", c, f);
+    scalar_view dKN("dKN", c, fd, p);
+    scalar_view gradL_eval("gradL_eval", c, p, d);
+    scalar_view gradLgradN("gradLgradN", c, f, p);
+    // Build density-dependent conductivity function
+    fe_dens_->evaluateValue(valZ_eval, z_coeff);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        valZ_eval(i,j) = SIMPpower_*(one - minConductivity_)
+          * std::pow(valZ_eval(i,j),SIMPpower_-one);
+      }
+    }
+    // Apply Dirichlet conditions to the multipliers.
+    Kokkos::deep_copy(l0_coeff,l_coeff);
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        if (i==2) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numBdryDofs = fidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                l0_coeff(cidx,fidx_[j][l]) = static_cast<Real>(0);
+              }
+            }
+          }
+        }
+      }
+    }
+    // Build derivative of conductivity function
+    fst::scalarMultiplyDataField(dKN,valZ_eval,fe_dens_->N());
+    // Integrate stiffness term
+    fe_vol_->evaluateGradient(gradL_eval, l0_coeff);
+    fst::dotMultiplyDataField(gradLgradN,gradL_eval,fe_vol_->gradNdetJ());
+    fst::integrate(hess,dKN,gradLgradN,false);
+  }
+
+  void Hessian_21(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    const Real one(1);
+    // Get dimensions
+    const int c  = fe_vol_->gradN().extent_int(0);
+    const int f  = fe_vol_->gradN().extent_int(1);
+    const int fd = fe_dens_->gradN().extent_int(1);
+    const int p  = fe_vol_->gradN().extent_int(2);
+    const int d  = fe_vol_->gradN().extent_int(3);
+    // Initialize Hessian storage
+    hess = scalar_view("hess", c, f, fd);
+    // Temporary storage
+    scalar_view valZ_eval("valZ_eval", c, p);
+    scalar_view l0_coeff("l0_coeff", c, f);
+    scalar_view dKN("dKN", c, fd, p);
+    scalar_view gradL_eval("gradL_eval", c, p, d);
+    scalar_view gradLgradN("gradLgradN", c, f, p);
+    // Build density-dependent conductivity function
+    fe_vol_->evaluateValue(valZ_eval, z_coeff);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        valZ_eval(i,j) = SIMPpower_*(one - minConductivity_)
+          * std::pow(valZ_eval(i,j),SIMPpower_-one);
+      }
+    }
+    // Apply Dirichlet conditions to the multipliers.
+    Kokkos::deep_copy(l0_coeff,l_coeff);
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        if (i==2) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numBdryDofs = fidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                l0_coeff(cidx,fidx_[j][l]) = static_cast<Real>(0);
+              }
+            }
+          }
+        }
+      }
+    }
+    // Build derivative of conductivity function
+    fst::scalarMultiplyDataField(dKN,valZ_eval,fe_dens_->N());
+    // Integrate stiffness term
+    fe_vol_->evaluateGradient(gradL_eval, l0_coeff);
+    fst::dotMultiplyDataField(gradLgradN,gradL_eval,fe_vol_->gradNdetJ());
+    fst::integrate(hess,gradLgradN,dKN,false);
+  }
+
+  void Hessian_22(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    const Real one(1), two(2);
+    // Get dimensions
+    const int c  = fe_vol_->gradN().extent_int(0);
+    const int f  = fe_vol_->gradN().extent_int(1);
+    const int fd = fe_dens_->gradN().extent_int(1);
+    const int p  = fe_vol_->gradN().extent_int(2);
+    const int d  = fe_vol_->gradN().extent_int(3);
+    // Initialize Hessian storage
+    hess = scalar_view("hess", c, fd, fd);
+    // Temporary storage
+    scalar_view valZ_eval("valZ_eval", c, p);
+    scalar_view l0_coeff("l0_coeff", c, f);
+    scalar_view dKN("dKN", c, fd, p);
+    scalar_view gradU_eval("gradU_eval", c, p, d);
+    scalar_view gradL_eval("gradL_eval", c, p, d);
+    scalar_view gradUgradL("gradUgradL", c, p);
+    scalar_view NgradUgradL("NgradUgradL", c, fd, p);
+    // Build density-dependent conductivity function
+    fe_dens_->evaluateValue(valZ_eval, z_coeff);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        valZ_eval(i,j) = SIMPpower_ * (SIMPpower_ - one)*(one - minConductivity_)
+          * std::pow(valZ_eval(i,j),SIMPpower_-two);
+      }
+    }
+    // Apply Dirichlet conditions to the multipliers.
+    Kokkos::deep_copy(l0_coeff,l_coeff);
+    int numSideSets = bdryCellLocIds_.size();
+    if (numSideSets > 0) {
+      for (int i = 0; i < numSideSets; ++i) {
+        if (i==2) {
+          int numLocalSideIds = bdryCellLocIds_[i].size();
+          for (int j = 0; j < numLocalSideIds; ++j) {
+            int numCellsSide = bdryCellLocIds_[i][j].size();
+            int numBdryDofs = fidx_[j].size();
+            for (int k = 0; k < numCellsSide; ++k) {
+              int cidx = bdryCellLocIds_[i][j][k];
+              for (int l = 0; l < numBdryDofs; ++l) {
+                l0_coeff(cidx,fidx_[j][l]) = static_cast<Real>(0);
+              }
+            }
+          }
+        }
+      }
+    }
+    // Build derivative of conductivity function
+    fst::scalarMultiplyDataField(dKN,valZ_eval,fe_dens_->N());
+    // Integrate stiffness term
+    fe_vol_->evaluateGradient(gradU_eval, u_coeff);
+    fe_vol_->evaluateGradient(gradL_eval, l0_coeff);
+    fst::dotMultiplyDataData(gradUgradL,gradU_eval,gradL_eval);
+    fst::scalarMultiplyDataField(NgradUgradL,gradUgradL,fe_dens_->NdetJ());
+    fst::integrate(hess,dKN,NgradUgradL,false);
+  }
+
+  void RieszMap_1(scalar_view & riesz) override {
+    // GET DIMENSIONS
+    int c = fe_vol_->N().extent_int(0);
+    int f = fe_vol_->N().extent_int(1);
+    // INITIALIZE RIESZ
+    riesz = scalar_view("riesz", c, f, f);
+    Kokkos::deep_copy(riesz,fe_vol_->stiffMat());
+    rst::add(riesz,fe_vol_->massMat());
+  }
+
+  void RieszMap_2(scalar_view & riesz) override {
+    // GET DIMENSIONS
+    int c = fe_dens_->N().extent_int(0);
+    int f = fe_dens_->N().extent_int(1);
+    // INITIALIZE RIESZ
+    riesz = scalar_view("riesz", c, f, f);
+    Kokkos::deep_copy(riesz,fe_dens_->massMat());
+  }
+
+  // This must be called before getFields2
+  void setDensityFields(const std::vector<basis_ptr> &basisPtrs) {
+    TEUCHOS_TEST_FOR_EXCEPTION(getFields2called_, std::invalid_argument,
+      ">>> PDE-OPT/topo-opt/elasticity/src/pde_elasticity.hpp: Must call before getFields2!");
+
+    basisPtrDens_  = basisPtrs[0];
+    basisPtrsDens_ = basisPtrs;
+  }
+
+  std::vector<basis_ptr> getFields() override {
+    return basisPtrs_;
+  }
+
+  std::vector<basis_ptr> getFields2() override {
+    getFields2called_ = true;
+    return basisPtrsDens_;
+  }
+
+  void setCellNodes(const scalar_view &volCellNodes,
+                    const std::vector<std::vector<scalar_view>> &bdryCellNodes,
+                    const std::vector<std::vector<std::vector<int>>> &bdryCellLocIds) override {
+    volCellNodes_ = volCellNodes;
+    bdryCellNodes_ = bdryCellNodes;
+    bdryCellLocIds_ = bdryCellLocIds;
+    // Finite element definition.
+    fe_vol_  = ROL::makePtr<fe_type>(volCellNodes_,basisPtr_,cellCub_);
+    fe_dens_ = ROL::makePtr<fe_type>(volCellNodes_,basisPtrDens_,cellCub_,false);
+    fidx_ = fe_vol_->getBoundaryDofs();
+    computeForce();
+    computeDirichlet();
+  }
+
+  void computeForce(void) {
+    int c = fe_vol_->cubPts().extent_int(0);
+    int p = fe_vol_->cubPts().extent_int(1);
+    int d = fe_vol_->cubPts().extent_int(2);
+    std::vector<Real> pt(d,0);
+    force_eval_ = scalar_view("force_eval_",c,p);
+    nforce_eval_ = scalar_view("nforce_eval_",c,p);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        for (int k = 0; k < d; ++k) {
+          pt[k] = (fe_vol_->cubPts())(i,j,k);
+        }
+        force_eval_(i,j)  = ForceFunc(pt);
+        nforce_eval_(i,j) = -force_eval_(i,j);
+      }
+    }
+  }
+
+  void computeDirichlet(void) {
+    // Compute Dirichlet values at DOFs.
+    int d = basisPtr_->getBaseCellTopology().getDimension();
+    int numSidesets = bdryCellLocIds_.size();
+    bdryCellDofValues_.resize(numSidesets);
+    for (int i=0; i<numSidesets; ++i) {
+      int numLocSides = bdryCellLocIds_[i].size();
+      bdryCellDofValues_[i].resize(numLocSides);
+      for (int j=0; j<numLocSides; ++j) {
+        int c = bdryCellLocIds_[i][j].size();
+        int f = basisPtr_->getCardinality();
+	std::stringstream name;
+	name << "bdryCellDofValues" << i << j;
+        bdryCellDofValues_[i][j] = scalar_view(name.str(), c, f);
+        scalar_view coords("coords", c, f, d);
+        if (c > 0) {
+          fe_vol_->computeDofCoords(coords, bdryCellNodes_[i][j]);
+        }
+        for (int k=0; k<c; ++k) {
+          for (int l=0; l<f; ++l) {
+            std::vector<Real> dofpoint(d);
+            for (int m=0; m<d; ++m) {
+              dofpoint[m] = coords(k, l, m);
+            }
+            (bdryCellDofValues_[i][j])(k, l) = dirichletFunc(dofpoint, i, j);
+          }
+        }
+      }
+    }
+  }
+
+  const ROL::Ptr<fe_type> getFE(void) const {
+    return fe_vol_;
+  }
+
+  const ROL::Ptr<fe_type> getDensityFE(void) const {
+    return fe_dens_;
+  }
+
+  const scalar_view getForce(void) const {
+    return force_eval_;
+  }
+
+}; // PDE_Poisson
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/topo-opt/src/compliance_robjK.hpp b/packages/rol/example/PDE-OPT/topo-opt/src/compliance_robjK.hpp
new file mode 100644
index 000000000000..a6e9bc1013fe
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/topo-opt/src/compliance_robjK.hpp
@@ -0,0 +1,613 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+#ifndef TOPOT_COMPLIANCE_ROBJK_H
+#define TOPOT_COMPLIANCE_ROBJK_H
+
+#include "ROL_Objective.hpp"
+#include "ROL_VectorController.hpp"
+#include "ROL_SecantFactory.hpp"
+#include "../../TOOLS/pdeK.hpp"
+#include "../../TOOLS/assemblerK.hpp"
+#include "../../TOOLS/solver.hpp"
+#include "../../TOOLS/pdevectorK.hpp"
+
+template <class Real, class DeviceType>
+class TopOptComplianceObjective : public ROL::Objective<Real> {
+private:
+  // Elasticity PDE
+  const ROL::Ptr<PDE<Real,DeviceType>> pde_;
+  ROL::Ptr<Assembler<Real,DeviceType>> assembler_;
+  ROL::Ptr<Solver<Real>> solver_, solver_cache_, solver_tmp_;
+  ROL::Ptr<Tpetra::CrsMatrix<>> matJ1_, matJ1_cache_, matJ1_tmp_;
+  ROL::Ptr<Tpetra::CrsMatrix<>> matJ2_, matJ2_cache_, matJ2_tmp_;
+  ROL::Ptr<Tpetra::CrsMatrix<>> matH22_;
+  bool assembleJ1_, assembleJ2_, assembleH22_, assembleF_;
+
+  // Output information
+  const std::string uname_, dname_;
+
+  // Vector Storage
+  ROL::Ptr<Tpetra::MultiVector<>> F_data_, state_sens_data_;
+  ROL::Ptr<Tpetra::MultiVector<>> dstat_data_, dctrl_data_;
+  ROL::Ptr<ROL::Vector<Real>> state_, dctrl_;
+  ROL::Ptr<ROL::VectorController<Real>> stateStore_;
+
+  bool storage_;
+  Real cmpScale_;
+  bool nuke_;
+  int printFreq_;
+
+  unsigned nstat_, nsens_;
+  unsigned nupda_, nfval_, ngrad_, nhess_, nprec_;
+  unsigned napJ1_, nasJ1_, napJ2_, nasJ2_, napH2_, nasH2_, nasLo_;
+
+  // Hessian Approximation Data
+  std::string hessAppr_;
+  Real H2scale_;
+  ROL::Ptr<ROL::Secant<Real>> secant_;
+  ROL::Ptr<Tpetra::MultiVector<>> s_data_;
+  ROL::Ptr<ROL::Vector<Real>> g_new_, g_old_, s_, g_;
+  bool computeGrad_;
+
+public:
+  TopOptComplianceObjective(
+      const ROL::Ptr<PDE<Real,DeviceType>>         &pde,
+      const ROL::Ptr<MeshManager<Real,DeviceType>> &mesh,
+      const ROL::Ptr<const Teuchos::Comm<int>>     &comm,
+      ROL::ParameterList                           &list,
+      std::ostream                                 &stream = std::cout,
+      std::string                                   uname  = "state",
+      std::string                                   dname  = "density")
+    : pde_(pde), assembleJ1_(true), assembleJ2_(true),
+      assembleH22_(true), assembleF_(true),
+      uname_(uname), dname_(dname),
+      nstat_(0), nsens_(0),
+      nupda_(0), nfval_(0), ngrad_(0), nhess_(0), nprec_(0),
+      napJ1_(0), nasJ1_(0), napJ2_(0), nasJ2_(0), napH2_(0), nasH2_(0), nasLo_(0), H2scale_(1) {
+    // Elasticity PDE
+    assembler_ = ROL::makePtr<Assembler<Real,DeviceType>>(pde_->getFields(),
+                                                          pde_->getFields2(),
+                                                          mesh,comm,list,stream);
+    assembler_->setCellNodes(*pde_);
+    solver_       = ROL::makePtr<Solver<Real>>(list.sublist("Solver"));
+    solver_cache_ = ROL::makePtr<Solver<Real>>(list.sublist("Solver"));
+
+    // Vector storage
+    F_data_           = assembler_->createResidualVector();
+    state_sens_data_  = assembler_->createStateVector();
+    dstat_data_       = assembler_->createResidualVector();
+    dctrl_data_       = assembler_->createControlVector();
+    state_            = ROL::makePtr<PDE_PrimalSimVector<Real,DeviceType>>(assembler_->createStateVector(),pde_,assembler_,list);
+    dctrl_            = ROL::makePtr<PDE_DualOptVector<Real,DeviceType>>(dctrl_data_,pde_,assembler_,list);
+    stateStore_       = ROL::makePtr<ROL::VectorController<Real>>();
+
+    storage_   = list.sublist("Problem").get("Use state storage", true);
+    cmpScale_  = list.sublist("Problem").get("Compliance Scaling", 1.0);
+    hessAppr_  = list.sublist("Problem").get("Hessian Approximation", "None");
+    nuke_      = list.sublist("Problem").get("Use Basic Update",false);
+    printFreq_ = list.sublist("Problem").get("Output Frequency",0);
+
+    computeGrad_ = true;
+    if (hessAppr_ == "Secant") {
+      s_data_ = assembler_->createControlVector();
+      s_      = ROL::makePtr<PDE_PrimalOptVector<Real,DeviceType>>(s_data_,pde_,assembler_,list);
+      g_      = ROL::makePtr<PDE_DualOptVector<Real,DeviceType>>(assembler_->createControlVector(),pde_,assembler_,list);
+      g_old_  = ROL::makePtr<PDE_DualOptVector<Real,DeviceType>>(assembler_->createControlVector(),pde_,assembler_,list);
+      g_new_  = ROL::makePtr<PDE_DualOptVector<Real,DeviceType>>(assembler_->createControlVector(),pde_,assembler_,list);
+      secant_ = ROL::SecantFactory<Real>(list);
+    }
+
+    initialize();
+  }
+
+  const ROL::Ptr<Assembler<Real,DeviceType>> getAssembler(void) const {
+    return assembler_;
+  }
+
+  void update(const ROL::Vector<Real> &z, ROL::UpdateType type, int iter = -1) {
+    nupda_++;
+    stateStore_->objectiveUpdate(type);
+    if (nuke_) {
+      update_temp(z,iter);
+    }
+    else {
+      switch (type) {
+        case ROL::UpdateType::Initial: update_initial(z,iter); break;
+        case ROL::UpdateType::Accept:  update_accept(z,iter);  break;
+        case ROL::UpdateType::Revert:  update_revert(z,iter);  break;
+        case ROL::UpdateType::Trial:   update_trial(z,iter);   break;
+        case ROL::UpdateType::Temp:    update_temp(z,iter);    break;
+      }
+    }
+    // Print
+    if (printFreq_ > 0 && iter%printFreq_ == 0) {
+      std::stringstream ufile, zfile;
+      ufile  << uname_ << "_" << iter << ".txt";
+      zfile  << dname_ << "_" << iter << ".txt";
+      ROL::Ptr<const Tpetra::MultiVector<>> z_data = getConstField(z);
+      ROL::Ptr<const Tpetra::MultiVector<>> u_data = getConstField(*state_);
+      assembler_->outputTpetraVector(u_data, ufile.str());
+      assembler_->outputTpetraVector(z_data, zfile.str());
+    }
+  }
+
+  void update( const ROL::Vector<Real> &z, bool flag = true, int iter = -1 ) {
+    nupda_++;
+    stateStore_->objectiveUpdate(true);
+    if (nuke_) {
+      update_temp(z,iter);
+    }
+    else {
+      // Trial:    flag = false, iter = -1
+      // Check:    flag = true,  iter = -1
+      // Reject:   flag = false, iter > -1
+      // Accept:   flag = true,  iter > -1
+      if (flag) {
+        if (iter > -1) {
+          update_accept(z,iter);
+        }
+        else {
+          update_temp(z,iter);
+        }
+      }
+      else {
+        if (iter > -1) {
+          update_revert(z,iter);
+        }
+        else {
+          update_trial(z,iter);
+        }
+      }
+    }
+  }
+
+  Real normalize(const ROL::Vector<Real> &z, Real &tol) {
+    update(z,ROL::UpdateType::Temp);
+    Real val = value(z,tol);
+    cmpScale_ /= val;
+    return cmpScale_;
+  }
+
+  void printToFile(const ROL::Vector<Real> &z, std::ostream &stream = std::cout,
+                   const std::string ufile = "state.txt", const std::string dfile = "density.txt") {
+    update(z,ROL::UpdateType::Temp);
+    ROL::Ptr<const Tpetra::MultiVector<>> z_data = getConstField(z);
+    ROL::Ptr<Tpetra::MultiVector<>>       u_data = getField(*state_);
+    solve_state_equation(u_data, z_data);
+    assembler_->outputTpetraVector(u_data, ufile);
+    assembler_->outputTpetraVector(z_data, dfile);
+    assembler_->printMeshData(stream);
+  }
+
+  void solveState(ROL::Vector<Real> &u, const ROL::Vector<Real> &z) {
+    update(z,ROL::UpdateType::Temp);
+    ROL::Ptr<const Tpetra::MultiVector<>> z_data = getConstField(z);
+    ROL::Ptr<Tpetra::MultiVector<>>       u_data = getField(u);
+    solve_state_equation(u_data, z_data);
+  }
+
+  Real value( const ROL::Vector<Real> &z, Real &tol ) {
+    nfval_++;
+    ROL::Ptr<const Tpetra::MultiVector<>> z_data = getConstField(z);
+    ROL::Ptr<Tpetra::MultiVector<>>       u_data = getField(*state_);
+    // Solve state equation
+    solve_state_equation(*state_,z);
+    // Compute compliance
+    Teuchos::Array<Real> cmp(1,0);
+    F_data_->dot(*u_data, cmp.view(0,1));
+    return cmpScale_*cmp[0];
+  }
+
+  void gradient( ROL::Vector<Real> &g, const ROL::Vector<Real> &z, Real &tol ) {
+    ngrad_++;
+    if (hessAppr_ == "Secant") {
+      // Will not work for stochastic problems
+      if (computeGrad_) {
+        ROL::Ptr<Tpetra::MultiVector<>>       g_data = getField(g);
+        ROL::Ptr<const Tpetra::MultiVector<>> z_data = getConstField(z);
+        ROL::Ptr<Tpetra::MultiVector<>>       u_data = getField(*state_);
+        // Solve state equation
+        solve_state_equation(*state_,z);
+        // Apply adjoint density jacobian
+        assembleJ2(u_data, z_data);
+        applyJacobian2(g_data, u_data, true);
+        g_data->scale(-cmpScale_);
+
+        g_->set(g);
+        computeGrad_ = false;
+      }
+      else {
+        g.set(*g_);
+      }
+    }
+    else {
+      ROL::Ptr<Tpetra::MultiVector<>>       g_data = getField(g);
+      ROL::Ptr<const Tpetra::MultiVector<>> z_data = getConstField(z);
+      ROL::Ptr<Tpetra::MultiVector<>>       u_data = getField(*state_);
+      // Solve state equation
+      solve_state_equation(*state_,z);
+      // Apply adjoint density jacobian
+      assembleJ2(u_data, z_data);
+      applyJacobian2(g_data, u_data, true);
+      g_data->scale(-cmpScale_);
+    }
+  }
+
+  void hessVec( ROL::Vector<Real> &hv, const ROL::Vector<Real> &v, const ROL::Vector<Real> &z, Real &tol ) {
+    nhess_++;
+    if (hessAppr_ == "Zero") {
+      hv.zero();
+    }
+    else if (hessAppr_ == "Positive Riesz") {
+      hv.set(v.dual());
+    }
+    else if (hessAppr_ == "Negative Riesz") {
+      hv.set(v.dual());
+      hv.scale(static_cast<Real>(-1));
+    }
+    else if (hessAppr_ == "Negative") {
+      ROL::Ptr<Tpetra::MultiVector<>>       h_data = getField(hv);
+      ROL::Ptr<const Tpetra::MultiVector<>> v_data = getConstField(v);
+      ROL::Ptr<const Tpetra::MultiVector<>> z_data = getConstField(z);
+      ROL::Ptr<Tpetra::MultiVector<>>       u_data = getField(*state_);
+      // Solve state equation
+      solve_state_equation(*state_,z);
+      // Apply density hessian to direction
+      assembleH22(u_data, z_data);
+      applyHessian22(h_data, v_data);
+      h_data->scale(-cmpScale_);
+    }
+    else if (hessAppr_ == "Positive") {
+      ROL::Ptr<Tpetra::MultiVector<>>       h_data = getField(hv);
+      ROL::Ptr<const Tpetra::MultiVector<>> v_data = getConstField(v);
+      ROL::Ptr<const Tpetra::MultiVector<>> z_data = getConstField(z);
+      ROL::Ptr<Tpetra::MultiVector<>>       u_data = getField(*state_);
+      // Solve state equation
+      solve_state_equation(*state_,z);
+      // Solve state sensitivity equation
+      assembleJ2(u_data, z_data);
+      solve_state_sensitivity(state_sens_data_, v_data);
+      // Apply adjoint density jacobian to state sensitivity
+      applyJacobian2(h_data, state_sens_data_, true);
+      h_data->scale(static_cast<Real>(2)*cmpScale_);
+    }
+    else if (hessAppr_ == "No Solve") {
+      ROL::Ptr<Tpetra::MultiVector<>>       h_data = getField(hv);
+      ROL::Ptr<const Tpetra::MultiVector<>> v_data = getConstField(v);
+      ROL::Ptr<const Tpetra::MultiVector<>> z_data = getConstField(z);
+      ROL::Ptr<Tpetra::MultiVector<>>       u_data = getField(*state_);
+      // Solve state equation
+      solve_state_equation(*state_,z);
+      // Apply density hessian to direction
+      assembleH22(u_data, z_data);
+      applyHessian22(h_data, v_data);
+      // Approximate positive term
+      assembleJ2(u_data, z_data);
+      applyJacobian2(dstat_data_, v_data, false);
+      applyJacobian2(dctrl_data_, dstat_data_, true);
+      // Approximation of inv(K) by aI where a solves min||aF-U||
+      // or b=1/a solves min||F-bU||
+      Teuchos::Array<Real> UdotF(1,0), UdotU(1,0);
+      u_data->dot(*F_data_,UdotF.view(0,1));
+      F_data_->dot(*F_data_,UdotU.view(0,1));
+      H2scale_ = UdotF[0]/UdotU[0];
+      //u_data->dot(*u_data,UdotU.view(0,1));
+      //H2scale_ = UdotU[0]/UdotF[0];
+      h_data->update(static_cast<Real>(2)*cmpScale_*H2scale_,*dctrl_data_,-cmpScale_);
+    }
+    else if (hessAppr_ == "Secant") {
+      // Will not work for stochastic problems
+      ROL::Ptr<Tpetra::MultiVector<>>       h_data = getField(hv);
+      ROL::Ptr<const Tpetra::MultiVector<>> v_data = getConstField(v);
+      ROL::Ptr<const Tpetra::MultiVector<>> z_data = getConstField(z);
+      ROL::Ptr<Tpetra::MultiVector<>>       u_data = getField(*state_);
+      // Solve state equation
+      solve_state_equation(*state_,z);
+      // Apply density hessian to direction
+      assembleH22(u_data, z_data);
+      applyHessian22(h_data, v_data);
+      // Apply secant approximation of positive term
+      secant_->applyB(*dctrl_, v);
+      h_data->update(static_cast<Real>(2)*cmpScale_,*dctrl_data_,-cmpScale_);
+    }
+    else {
+      ROL::Ptr<Tpetra::MultiVector<>>       h_data = getField(hv);
+      ROL::Ptr<const Tpetra::MultiVector<>> v_data = getConstField(v);
+      ROL::Ptr<const Tpetra::MultiVector<>> z_data = getConstField(z);
+      ROL::Ptr<Tpetra::MultiVector<>>       u_data = getField(*state_);
+      // Solve state equation
+      solve_state_equation(*state_,z);
+      // Solve state sensitivity equation
+      assembleJ2(u_data, z_data);
+      solve_state_sensitivity(state_sens_data_, v_data);
+      // Apply density hessian to direction
+      assembleH22(u_data, z_data);
+      applyHessian22(h_data, v_data);
+      // Apply adjoint density jacobian to state sensitivity
+      applyJacobian2(dctrl_data_, state_sens_data_, true);
+      h_data->update(static_cast<Real>(2)*cmpScale_,*dctrl_data_,-cmpScale_);
+    }
+  }
+
+  /** \brief Apply a reduced Hessian preconditioner.
+  */
+  virtual void precond( ROL::Vector<Real> &Pv, const ROL::Vector<Real> &v, const ROL::Vector<Real> &z, Real &tol ) {
+    nprec_++;
+    Pv.set(v.dual());
+  }
+
+  void summarize(std::ostream &stream) const {
+    stream << std::endl;
+    stream << std::string(114,'=') << std::endl;
+    stream << "  Objective_Compliance::summarize" << std::endl;
+    stream << "    Number of calls to update:          " << nupda_ << std::endl;
+    stream << "    Number of calls to value:           " << nfval_ << std::endl;
+    stream << "    Number of calls to gradient:        " << ngrad_ << std::endl;
+    stream << "    Number of calls to hessVec:         " << nhess_ << std::endl;
+    stream << "    Number of calls to precond:         " << nprec_ << std::endl;
+    stream << "    Number of state solves:             " << nstat_ << std::endl;
+    stream << "    Number of state sensitivity solves: " << nsens_ << std::endl;
+    stream << "    Number of load assemblies:          " << nasLo_ << std::endl;
+    stream << "    Number of Jacobian_1 assemblies:    " << nasJ1_ << std::endl;
+    stream << "    Number of Jacobian_1 sovles:        " << napJ1_ << std::endl;
+    stream << "    Number of Jacobian_2 assemblies:    " << nasJ2_ << std::endl;
+    stream << "    Number of Jacobian_2 applies:       " << napJ2_ << std::endl;
+    stream << "    Number of Hessian_22 assemblies:    " << nasH2_ << std::endl;
+    stream << "    Number of Hessian_22 applies:       " << napH2_ << std::endl;
+    stream << std::string(114,'=') << std::endl;
+    stream << std::endl;
+  }
+
+// For parametrized (stochastic) objective functions and constraints
+public:
+  void setParameter(const std::vector<Real> &param) {
+    ROL::Objective<Real>::setParameter(param);
+    pde_->setParameter(param);
+    assembleF_ = true;
+  }
+
+private:
+
+  ROL::Ptr<const Tpetra::MultiVector<>> getConstField(const ROL::Vector<Real> &x) const {
+    ROL::Ptr<const Tpetra::MultiVector<>> xp;
+    try {
+      xp = dynamic_cast<const ROL::TpetraMultiVector<Real>&>(x).getVector();
+    }
+    catch (std::exception &e) {
+      try {
+        ROL::Ptr<const ROL::TpetraMultiVector<Real>> xvec
+          = dynamic_cast<const PDE_OptVector<Real>&>(x).getField();
+        if (xvec == ROL::nullPtr) {
+          xp = ROL::nullPtr;
+        }
+        else {
+          xp = xvec->getVector();
+        }
+      }
+      catch (std::exception &ee) {
+        xp = ROL::nullPtr;
+      }
+    }
+    return xp;
+  }
+
+  ROL::Ptr<Tpetra::MultiVector<>> getField(ROL::Vector<Real> &x) const {
+    ROL::Ptr<Tpetra::MultiVector<>> xp;
+    try {
+      xp = dynamic_cast<ROL::TpetraMultiVector<Real>&>(x).getVector();
+    }
+    catch (std::exception &e) {
+      try {
+        ROL::Ptr<ROL::TpetraMultiVector<Real>> xvec
+          = dynamic_cast<PDE_OptVector<Real>&>(x).getField();
+        if ( xvec == ROL::nullPtr ) {
+          xp = ROL::nullPtr;
+        }
+        else {
+          xp = xvec->getVector();
+        }
+      }
+      catch (std::exception &ee) {
+        xp = ROL::nullPtr;
+      }
+    }
+    return xp;
+  }
+
+  void initialize(void) {
+    ROL::Ptr<Tpetra::MultiVector<>> u_data = getField(*state_);
+    u_data->putScalar(0.0);
+    dctrl_data_->putScalar(1.0);
+    assembleJ1_     = true;
+    assembleJ2_     = true;
+    assembleJ1(u_data, dctrl_data_,false);
+    assembleJ2(u_data, dctrl_data_);
+    matJ1_tmp_      = matJ1_;
+    matJ2_tmp_      = matJ2_;
+    matJ1_          = matJ1_cache_;
+    matJ2_          = matJ2_cache_;
+    assembleJ1_     = true;
+    assembleJ2_     = true;
+    assembleJ1(u_data, dctrl_data_,false);
+    assembleJ2(u_data, dctrl_data_);
+    matJ1_cache_    = matJ1_;
+    matJ2_cache_    = matJ2_;
+    solver_tmp_     = solver_;
+    solver_         = solver_cache_;
+    assembleJ1_     = true;
+    assembleJ2_     = true;
+    // Upon exit, matJ1, matJ2 and solver are set to their cached versions
+  }
+
+  // Assemble load vector
+  void assembleF(void) {
+    if (assembleF_) {
+      nasLo_++;
+      ROL::Ptr<Tpetra::MultiVector<>> u0 = assembler_->createStateVector();
+      ROL::Ptr<Tpetra::MultiVector<>> z1 = assembler_->createControlVector();
+      u0->putScalar(0.0);
+      z1->putScalar(1.0);
+      assembler_->assemblePDEResidual(F_data_,pde_,u0,z1);
+      F_data_->scale(static_cast<Real>(-1));
+      assembleF_ = false;
+    }
+  }
+
+  void assembleJ2(const ROL::Ptr<const Tpetra::MultiVector<>> &u, const ROL::Ptr<const Tpetra::MultiVector<>> &z) {
+    if (assembleJ2_) {
+      nasJ2_++;
+      assembler_->assemblePDEJacobian2(matJ2_,pde_,u,z);
+      assembleJ2_ = false;
+    }
+  }
+
+  void applyJacobian2(ROL::Ptr<Tpetra::MultiVector<>> &Jv, const ROL::Ptr<const Tpetra::MultiVector<>> &v,
+                      const bool transpose) {
+    napJ2_++;
+    if (transpose) {
+      matJ2_->apply(*v,*Jv,Teuchos::TRANS);
+    }
+    else {
+      matJ2_->apply(*v,*Jv);
+    }
+  }
+
+  void assembleH22(const ROL::Ptr<const Tpetra::MultiVector<>> &u, const ROL::Ptr<const Tpetra::MultiVector<>> &z) {
+    if (assembleH22_) {
+      nasH2_++;
+      assembler_->assemblePDEHessian22(matH22_,pde_,u,u,z);
+      assembleH22_ = false;
+    }
+  }
+
+  void applyHessian22(ROL::Ptr<Tpetra::MultiVector<>> &Hv, const ROL::Ptr<const Tpetra::MultiVector<>> &v) {
+    matH22_->apply(*v,*Hv);
+    napH2_++;
+  }
+
+  void assembleJ1(const ROL::Ptr<const Tpetra::MultiVector<>> &u, const ROL::Ptr<const Tpetra::MultiVector<>> &z, bool setSolve = true) {
+    if (assembleJ1_) {
+      nasJ1_++;
+      assembler_->assemblePDEJacobian1(matJ1_,pde_,u,z);
+      if (setSolve) {
+        solver_->setA(matJ1_);
+      }
+      assembleJ1_ = false;
+    }
+  }
+
+  void solve(ROL::Ptr<Tpetra::MultiVector<>> &x, const ROL::Ptr<const Tpetra::MultiVector<>> &b) {
+    napJ1_++;
+    solver_->solve(x,b,false);
+  }
+
+  void solve_state_equation(ROL::Vector<Real> &u, const ROL::Vector<Real> &z) {
+    bool isComputed = false;
+    if (storage_) {
+      isComputed = stateStore_->get(u, ROL::Objective<Real>::getParameter());
+    }
+    if (!isComputed || !storage_) {
+      ROL::Ptr<const Tpetra::MultiVector<>> z_data = getConstField(z);
+      ROL::Ptr<Tpetra::MultiVector<>>       u_data = getField(u);
+      solve_state_equation(u_data,z_data);
+      if (storage_) {
+        stateStore_->set(u, ROL::Objective<Real>::getParameter());
+      }
+    }
+  }
+
+  void solve_state_equation(ROL::Ptr<Tpetra::MultiVector<>> &u, const ROL::Ptr<const Tpetra::MultiVector<>> &z) {
+    nstat_++;
+    assembleF(); assembleJ1(u,z,true);
+    solve(u,F_data_);
+  }
+
+  void solve_state_sensitivity(ROL::Ptr<Tpetra::MultiVector<>> &s, const ROL::Ptr<const Tpetra::MultiVector<>> &v) {
+    // This assumes that assembleJ1 and assemble J2 were called
+    nsens_++;
+    applyJacobian2(dstat_data_, v, false);
+    solve(s, dstat_data_);
+  }
+
+  void update_initial( const ROL::Vector<Real> &z, int iter ) {
+    assembleJ1_   = true;
+    assembleJ2_   = true;
+    assembleH22_  = true;
+
+    // For secant approximation
+    if (hessAppr_ == "Secant") {
+      computeGrad_ = true;
+      const Real half(0.5), one(1);
+      Real tol(std::sqrt(ROL::ROL_EPSILON<Real>()));
+      gradient(*g_old_,z,tol);       // g_old = J'(z)
+      g_old_->scale(half/cmpScale_); // g_old = J'(z)/2c
+      s_->set(z); s_->scale(-one);   // s     = -z
+    }
+  }
+
+  void update_accept( const ROL::Vector<Real> &z, int iter ) {
+    // Push trial state storage into base state storage.
+    solver_tmp_       = solver_cache_;
+    solver_cache_     = solver_;
+    matJ1_tmp_        = matJ1_cache_;
+    matJ1_cache_      = matJ1_;
+    matJ2_tmp_        = matJ2_cache_;
+    matJ2_cache_      = matJ2_;
+    assembleJ1_       = false;
+    // If assembleJ2 was not called, then assembleJ2_ = true
+    assembleH22_      = true;
+
+    // For secant update
+    // Secant Equation: B s = 1/2c(J'(z) - (J'(z_old) + H s))
+    if (hessAppr_ == "Secant") {
+      computeGrad_ = true;
+      const Real half(0.5), one(1);
+      Real tol(std::sqrt(ROL::ROL_EPSILON<Real>()));
+      s_->plus(z);                         // s     = z - z_old
+      applyHessian22(dctrl_data_,s_data_); // dctrl = -H s
+      g_old_->axpy(-half,*dctrl_);         // g_old = (J'(z_old) + cH s)/2c
+      gradient(*g_new_,z,tol);             // gnew  = J'(z)
+      g_new_->scale(half/cmpScale_);       // gnew  = J'(z)/2c
+      secant_->updateStorage(z,*g_new_,*g_old_,*s_,s_->norm(),iter);
+      s_->set(z); s_->scale(-one);         // s     = -z
+      g_old_->set(*g_new_);                // g_old = J'(z)/2c
+    }
+  }
+
+  void update_temp( const ROL::Vector<Real> &z, int iter ) {
+    assembleJ1_     = true;
+    assembleJ2_     = true;
+    assembleH22_    = true;
+    if (hessAppr_ == "Secant") {
+      computeGrad_    = true;
+    }
+  }
+
+  void update_trial( const ROL::Vector<Real> &z, int iter ) {
+    solver_       = solver_tmp_;
+    matJ1_        = matJ1_tmp_;
+    matJ2_        = matJ2_tmp_;
+    assembleJ1_   = true;
+    assembleJ2_   = true;
+    assembleH22_  = false;
+  }
+
+  void update_revert( const ROL::Vector<Real> &z, int iter ) {
+    solver_       = solver_cache_;
+    matJ1_        = matJ1_cache_;
+    matJ2_        = matJ2_cache_;
+    assembleJ1_   = false;
+    // If assembleJ2 was not called, then assembleJ2_ = true
+    assembleH22_  = false;
+    // If gradient was not called, then computeGrad_ = true
+  }
+}; // class TopOptComplianceObjective
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/topo-opt/src/filtered_compliance_robjK.hpp b/packages/rol/example/PDE-OPT/topo-opt/src/filtered_compliance_robjK.hpp
new file mode 100644
index 000000000000..1a6cae2bbeff
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/topo-opt/src/filtered_compliance_robjK.hpp
@@ -0,0 +1,299 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+#ifndef TOPOPT_FILTERED_COMPLIANCE_ROBJK_H
+#define TOPOPT_FILTERED_COMPLIANCE_ROBJK_H
+
+#include "ROL_Objective.hpp"
+#include "../../TOOLS/pdeK.hpp"
+#include "../../TOOLS/assemblerK.hpp"
+#include "../../TOOLS/solver.hpp"
+#include "../../TOOLS/pdevectorK.hpp"
+#include "compliance_robjK.hpp"
+
+template <class Real, class DeviceType>
+class TopOptFilteredComplianceObjective : public ROL::Objective<Real> {
+private:
+  // Compliance objective function
+  ROL::Ptr<TopOptComplianceObjective<Real,DeviceType>> comp_;
+
+  // FIlter PDE
+  const ROL::Ptr<PDE<Real,DeviceType>> pde_filter_;
+  ROL::Ptr<Assembler<Real,DeviceType>> assembler_filter_;
+  ROL::Ptr<Solver<Real>> solver_filter_;
+  ROL::Ptr<Tpetra::CrsMatrix<>> matF1_, matF2_;
+
+  // Vector Storage
+  ROL::Ptr<Tpetra::MultiVector<>> f_state_, f_res_;
+  ROL::Ptr<ROL::Vector<Real>> Fz_, Fzcache_, Fv_, dctrl_;
+
+  bool nuke_;
+  int printFreq_;
+
+  unsigned nupda_, nfval_, ngrad_, nhess_, nprec_;
+  unsigned nasFi_, napFi_;
+
+public:
+  TopOptFilteredComplianceObjective(
+      const ROL::Ptr<PDE<Real,DeviceType>>         &pde,
+      const ROL::Ptr<PDE<Real,DeviceType>>         &filter,
+      const ROL::Ptr<MeshManager<Real,DeviceType>> &mesh,
+      const ROL::Ptr<const Teuchos::Comm<int>>     &comm,
+      ROL::ParameterList                           &list,
+      std::ostream                                 &stream = std::cout)
+    : pde_filter_(filter),
+      nupda_(0), nfval_(0), ngrad_(0), nhess_(0), nprec_(0),
+      nasFi_(0), napFi_(0) {
+    // Compliance objective function
+    comp_ = ROL::makePtr<TopOptComplianceObjective<Real,DeviceType>>(pde,mesh,comm,list,stream,"state","filtered_density");
+
+    // Filter PDE
+    assembler_filter_ = ROL::makePtr<Assembler<Real,DeviceType>>(pde_filter_->getFields(),
+                                                                 pde_filter_->getFields2(),
+                                                                 mesh,comm,list,stream);
+    assembler_filter_->setCellNodes(*pde_filter_);
+    solver_filter_    = ROL::makePtr<Solver<Real>>(list.sublist("Solver"));
+
+    // Vector storage
+    f_state_ = assembler_filter_->createStateVector();
+    f_res_   = assembler_filter_->createResidualVector();
+    Fz_      = ROL::makePtr<PDE_PrimalSimVector<Real,DeviceType>>(assembler_filter_->createStateVector(),pde_filter_,assembler_filter_,list);
+    Fzcache_ = ROL::makePtr<PDE_PrimalSimVector<Real,DeviceType>>(assembler_filter_->createStateVector(),pde_filter_,assembler_filter_,list);
+    Fv_      = ROL::makePtr<PDE_PrimalSimVector<Real,DeviceType>>(assembler_filter_->createStateVector(),pde_filter_,assembler_filter_,list);
+    dctrl_   = ROL::makePtr<PDE_DualSimVector<Real,DeviceType>>(assembler_filter_->createStateVector(),pde_filter_,assembler_filter_,list);
+
+    nuke_      = list.sublist("Problem").get("Use Basic Update",false);
+    printFreq_ = list.sublist("Problem").get("Output Frequency",0);
+
+    assembleFilter();
+  }
+
+  const ROL::Ptr<Assembler<Real,DeviceType>> getAssembler(void) const {
+    return comp_->getAssembler();
+  }
+
+  const ROL::Ptr<Assembler<Real,DeviceType>> getFilterAssembler(void) const {
+    return assembler_filter_;
+  }
+
+  void update(const ROL::Vector<Real> &z, ROL::UpdateType type, int iter = -1) {
+    nupda_++;
+    if (nuke_) {
+      update_temp(z,iter);
+    }
+    else {
+      switch (type) {
+        case ROL::UpdateType::Initial: update_initial(z,iter); break;
+        case ROL::UpdateType::Accept:  update_accept(z,iter);  break;
+        case ROL::UpdateType::Revert:  update_revert(z,iter);  break;
+        case ROL::UpdateType::Trial:   update_trial(z,iter);   break;
+        case ROL::UpdateType::Temp:    update_temp(z,iter);   break;
+      }
+    }
+    comp_->update(*Fz_,type,iter);
+    // Print
+    if (printFreq_ > 0 && iter%printFreq_ == 0) {
+      std::stringstream dfile;
+      dfile  << "density_"          << iter << ".txt";
+      ROL::Ptr<const Tpetra::MultiVector<>> d_data = getConstField(z);
+      assembler_filter_->outputTpetraVector(d_data, dfile.str());
+    }
+  }
+
+  void update( const ROL::Vector<Real> &z, bool flag = true, int iter = -1 ) {
+    nupda_++;
+    if (nuke_) {
+      update_temp(z,iter);
+    }
+    else {
+      // Trial:    flag = false, iter = -1
+      // Check:    flag = true,  iter = -1
+      // Reject:   flag = false, iter > -1
+      // Accept:   flag = true,  iter > -1
+      if (flag) {
+        if (iter > -1) {
+          update_accept(z,iter);
+        }
+        else {
+          update_temp(z,iter);
+        }
+      }
+      else {
+        if (iter > -1) {
+          update_revert(z,iter);
+        }
+        else {
+          update_trial(z,iter);
+        }
+      }
+    }
+    comp_->update(*Fz_,flag,iter);
+  }
+
+  Real normalize(const ROL::Vector<Real> &z, Real &tol) {
+    applyFilter(*Fz_,z,false);
+    return comp_->normalize(*Fz_, tol);
+  }
+
+  void printToFile(const ROL::Vector<Real> &z, std::ostream &stream = std::cout,
+                   std::string ufile = "state.txt", std::string dfile = "density.txt",
+                   std::string ffile = "filtered_density.txt") {
+    update(z,ROL::UpdateType::Temp);
+    ROL::Ptr<const Tpetra::MultiVector<>> dens_data = getConstField(z);
+    assembler_filter_->outputTpetraVector(dens_data,  dfile);
+    comp_->printToFile(*Fz_, stream, ufile, ffile);
+  }
+
+  void solveState(ROL::Vector<Real> &u, const ROL::Vector<Real> &z) {
+    update(z,ROL::UpdateType::Temp);
+    comp_->solveState(u,*Fz_);
+  }
+
+  Real value( const ROL::Vector<Real> &z, Real &tol ) {
+    nfval_++;
+    return comp_->value(*Fz_,tol);
+  }
+
+  void gradient( ROL::Vector<Real> &g, const ROL::Vector<Real> &z, Real &tol ) {
+    ngrad_++;
+    comp_->gradient(*dctrl_, *Fz_, tol);
+    applyFilter(g, *dctrl_, true);
+  }
+
+  void hessVec( ROL::Vector<Real> &hv, const ROL::Vector<Real> &v, const ROL::Vector<Real> &z, Real &tol ) {
+    nhess_++;
+    applyFilter(*Fv_, v, false);
+    comp_->hessVec(*dctrl_, *Fv_, *Fz_, tol);
+    applyFilter(hv, *dctrl_, true);
+  }
+
+  /** \brief Apply a reduced Hessian preconditioner.
+  */
+  virtual void precond( ROL::Vector<Real> &Pv, const ROL::Vector<Real> &v, const ROL::Vector<Real> &z, Real &tol ) {
+    nprec_++;
+    Pv.set(v.dual());
+  }
+
+  void summarize(std::ostream &stream) const {
+    stream << std::endl;
+    stream << std::string(114,'=') << std::endl;
+    stream << "  Filtered_Objective_Compliance::summarize" << std::endl;
+    stream << "    Number of calls to update:          " << nupda_ << std::endl;
+    stream << "    Number of calls to value:           " << nfval_ << std::endl;
+    stream << "    Number of calls to gradient:        " << ngrad_ << std::endl;
+    stream << "    Number of calls to hessVec:         " << nhess_ << std::endl;
+    stream << "    Number of calls to precond:         " << nprec_ << std::endl;
+    stream << "    Number of filter assemblies:        " << nasFi_ << std::endl;
+    stream << "    Number of filter applies:           " << napFi_ << std::endl;
+    stream << std::string(114,'=') << std::endl;
+    stream << std::endl;
+    comp_->summarize(stream);
+  }
+
+// For parametrized (stochastic) objective functions and constraints
+public:
+  void setParameter(const std::vector<Real> &param) {
+    ROL::Objective<Real>::setParameter(param);
+    comp_->setParameter(param);
+  }
+
+private:
+
+  ROL::Ptr<const Tpetra::MultiVector<>> getConstField(const ROL::Vector<Real> &x) const {
+    ROL::Ptr<const Tpetra::MultiVector<>> xp;
+    try {
+      xp = dynamic_cast<const ROL::TpetraMultiVector<Real>&>(x).getVector();
+    }
+    catch (std::exception &e) {
+      try {
+        ROL::Ptr<const ROL::TpetraMultiVector<Real>> xvec
+          = dynamic_cast<const PDE_OptVector<Real>&>(x).getField();
+        if (xvec == ROL::nullPtr) {
+          xp = ROL::nullPtr;
+        }
+        else {
+          xp = xvec->getVector();
+        }
+      }
+      catch (std::exception &ee) {
+        xp = ROL::nullPtr;
+      }
+    }
+    return xp;
+  }
+
+  ROL::Ptr<Tpetra::MultiVector<>> getField(ROL::Vector<Real> &x) const {
+    ROL::Ptr<Tpetra::MultiVector<>> xp;
+    try {
+      xp = dynamic_cast<ROL::TpetraMultiVector<Real>&>(x).getVector();
+    }
+    catch (std::exception &e) {
+      try {
+        ROL::Ptr<ROL::TpetraMultiVector<Real>> xvec
+          = dynamic_cast<PDE_OptVector<Real>&>(x).getField();
+        if ( xvec == ROL::nullPtr ) {
+          xp = ROL::nullPtr;
+        }
+        else {
+          xp = xvec->getVector();
+        }
+      }
+      catch (std::exception &ee) {
+        xp = ROL::nullPtr;
+      }
+    }
+    return xp;
+  }
+
+  void assembleFilter(void) {
+    nasFi_++;
+    ROL::Ptr<Tpetra::MultiVector<>> f_ctrl = assembler_filter_->createControlVector();
+    assembler_filter_->assemblePDEJacobian1(matF1_,pde_filter_,f_state_,f_ctrl);
+    assembler_filter_->assemblePDEJacobian2(matF2_,pde_filter_,f_state_,f_ctrl);
+    solver_filter_->setA(matF1_);
+  }
+
+  void applyFilter(ROL::Vector<Real> &Fx, const ROL::Vector<Real> &x, const bool transpose) {
+    napFi_++;
+    ROL::Ptr<Tpetra::MultiVector<>>      Fx_data = getField(Fx);
+    ROL::Ptr<const Tpetra::MultiVector<>> x_data = getConstField(x);
+    if (transpose) {
+      solver_filter_->solve(f_state_,x_data,false);
+      matF2_->apply(*f_state_,*Fx_data,Teuchos::TRANS);
+    }
+    else {
+      matF2_->apply(*x_data,*f_res_);
+      solver_filter_->solve(Fx_data,f_res_,false);
+    }
+    Fx.scale(static_cast<Real>(-1));
+  }
+
+  void update_initial( const ROL::Vector<Real> &z, int iter ) {
+    applyFilter(*Fz_,z,false);
+    Fzcache_->set(*Fz_);
+  }
+
+  void update_accept( const ROL::Vector<Real> &z, int iter ) {
+    Fzcache_->set(*Fz_);
+  }
+
+  void update_temp( const ROL::Vector<Real> &z, int iter ) {
+    applyFilter(*Fz_,z,false);
+  }
+
+  void update_trial( const ROL::Vector<Real> &z, int iter ) {
+    applyFilter(*Fz_,z,false);
+  }
+
+  void update_revert( const ROL::Vector<Real> &z, int iter ) {
+    Fz_->set(*Fzcache_);
+  }
+}; // class TopOptComplianceObjective
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/topo-opt/src/obj_phasefieldK.hpp b/packages/rol/example/PDE-OPT/topo-opt/src/obj_phasefieldK.hpp
new file mode 100644
index 000000000000..8e72625ead6f
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/topo-opt/src/obj_phasefieldK.hpp
@@ -0,0 +1,283 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  obj.hpp
+    \brief Provides the interface for local (cell-based) objective function computations.
+*/
+
+#ifndef PDEOPT_QOI_PHASEFIELDK_HPP
+#define PDEOPT_QOI_PHASEFIELDK_HPP
+
+#include "../../TOOLS/qoiK.hpp"
+#include "../../TOOLS/feK.hpp"
+
+template <class Real, class DeviceType>
+class QoI_ModicaMortolaEnergy_PhaseField : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type> fe_;
+  const Real scale_;
+
+public:
+  QoI_ModicaMortolaEnergy_PhaseField(const ROL::Ptr<fe_type> &fe,
+                                     const Real scale = Real(1))
+    : fe_(fe), scale_(scale) {}
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    const Real one(1), cPhi(0.75);
+    // Get relevant dimensions
+    int c = fe_->gradN().extent_int(0);
+    int p = fe_->gradN().extent_int(2);
+    int d = fe_->gradN().extent_int(3);
+    // Initialize output val
+    val = scalar_view("val",c);
+    // Evaluate on FE basis
+    scalar_view valZ_eval("valZ_eval", c, p);
+    scalar_view gradZ_eval("gradZ_eval", c, p, d);
+    scalar_view valPhi_eval("valPhi_eval", c, p);
+    fe_->evaluateValue(valZ_eval,z_coeff);
+    fe_->evaluateGradient(gradZ_eval,z_coeff);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        Real zij = valZ_eval(i,j);
+        valPhi_eval(i,j) = cPhi*(zij*zij - one)/scale_;
+      }
+    }
+    // Integrate
+    fe_->computeIntegral(val,gradZ_eval,gradZ_eval,false);
+    fe_->computeIntegral(val,valPhi_eval,valPhi_eval,true);
+    rst::scale(val, static_cast<Real>(0.5)*scale_);
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_ModicaMortolaEnergy_PhaseField::gradient_1 is zero.");
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    const Real one(1), cPhi(9.0/8.0);
+    // Retrieve dimensions.
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    int p = fe_->gradN().extent_int(2);
+    int d = fe_->gradN().extent_int(3);
+    // Initialize Gradients.
+    grad = scalar_view("grad",c,f);
+    // Evaluate/interpolate finite element fields on cells.
+    scalar_view valZ_eval("valZ_eval", c, p);
+    scalar_view gradZ_eval("gradZ_eval", c, p, d);
+    scalar_view valPhi_eval("valPhi_eval", c, p);
+    fe_->evaluateValue(valZ_eval,z_coeff);
+    fe_->evaluateGradient(gradZ_eval,z_coeff);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        Real zij = valZ_eval(i,j);
+        valPhi_eval(i,j) = cPhi*zij*(zij*zij-one)/scale_;
+      }
+    }
+    // Evaluate gradient of energy.
+    fst::integrate(grad,gradZ_eval,fe_->gradNdetJ(),false);
+    rst::scale(grad,scale_);
+    fst::integrate(grad,valPhi_eval,fe_->NdetJ(),true);
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_ModicaMortolaEnergy_PhaseField::HessVec_11 is zero.");
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_ModicaMortolaEnergy_PhaseField::HessVec_12 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_ModicaMortolaEnergy_PhaseField::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    const Real one(1), three(3), cPhi(9.0/8.0);
+    // Retrieve dimensions.
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    int p = fe_->gradN().extent_int(2);
+    int d = fe_->gradN().extent_int(3);
+    // Initialize Hessians.
+    hess = scalar_view("hess",c,f);
+    // Evaluate/interpolate finite element fields on cells.
+    scalar_view valZ_eval("valZ_eval", c, p);
+    scalar_view valV_eval("valV_eval", c, p);
+    scalar_view gradV_eval("gradV_eval", c, p, d);
+    scalar_view valPhi_eval("valPhi_eval", c, p);
+    fe_->evaluateValue(valZ_eval,z_coeff);
+    fe_->evaluateValue(valV_eval,v_coeff);
+    fe_->evaluateGradient(gradV_eval,v_coeff);
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        Real zij = valZ_eval(i,j);
+        valPhi_eval(i,j) = valV_eval(i,j)*cPhi*(three*zij*zij-one)/scale_;
+      }
+    }
+    // Evaluate hessian-times-vector of energy.
+    fst::integrate(hess,gradV_eval,fe_->gradNdetJ(),false);
+    rst::scale(hess,scale_);
+    fst::integrate(hess,valPhi_eval,fe_->NdetJ(),true);
+  }
+
+}; // QoI_ModicaMortolaEnergy_PhaseField
+
+template <class Real, class DeviceType>
+class QoI_Volume_PhaseField : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type> fe_;
+
+public:
+  QoI_Volume_PhaseField(const ROL::Ptr<fe_type> &fe) : fe_(fe) {}
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = fe_->gradN().extent_int(0);
+    int p = fe_->gradN().extent_int(2);
+    // Initialize output val
+    val = scalar_view("val",c);
+    // Evaluate on FE basis
+    scalar_view valZ_eval("valZ_eval",c,p);
+    fe_->evaluateValue(valZ_eval,z_coeff);
+    // Approximate characteristic function
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        valZ_eval(i,j) += static_cast<Real>(1);
+      }
+    }
+    // Compute volume
+    fe_->computeIntegral(val,valZ_eval,valZ_eval);
+    rst::scale(val,static_cast<Real>(0.25));
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Volume_PhaseField::gradient_1 is zero.");
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    int p = fe_->gradN().extent_int(2);
+    int d = fe_->gradN().extent_int(3);
+    // Initialize output grad
+    grad = scalar_view("grad", c, f);
+    // Evaluate on FE basis
+    scalar_view valZ_eval("valZ_eval",c,p);
+    fe_->evaluateValue(valZ_eval,z_coeff);
+    // Approximate derivative of characteristic function
+    for (int i = 0; i < c; ++i) {
+      for (int j = 0; j < p; ++j) {
+        valZ_eval(i,j) += static_cast<Real>(1);
+      }
+    }
+    // Compute gradient of volume
+    fst::integrate(grad,valZ_eval,(fe_->NdetJ()),false);
+    rst::scale(grad,static_cast<Real>(0.5));
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Volume_PhaseField::HessVec_11 is zero.");
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Volume_PhaseField::HessVec_12 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Volume_PhaseField::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+    int p = fe_->gradN().extent_int(2);
+    int d = fe_->gradN().extent_int(3);
+    // Initialize output grad
+    hess = scalar_view("hess",c,f);
+    // Evaluate on FE basis
+    scalar_view valV_eval("valV_eval",c,p);
+    fe_->evaluateValue(valV_eval,v_coeff);
+    // Compute gradient of volume
+    fst::integrate(hess,valV_eval,fe_->NdetJ(),false);
+    rst::scale(hess,static_cast<Real>(0.5));
+  }
+
+}; // QoI_Volume_PhaseField
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/topo-opt/src/obj_volumeK.hpp b/packages/rol/example/PDE-OPT/topo-opt/src/obj_volumeK.hpp
new file mode 100644
index 000000000000..2f9528244ff1
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/topo-opt/src/obj_volumeK.hpp
@@ -0,0 +1,129 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  obj.hpp
+    \brief Provides the interface for local (cell-based) objective function computations.
+*/
+
+#ifndef PDEOPT_QOI_VOLUMEK_HPP
+#define PDEOPT_QOI_VOLUMEK_HPP
+
+#include "../../TOOLS/qoiK.hpp"
+#include "../../TOOLS/feK.hpp"
+
+template <class Real, class DeviceType>
+class QoI_Volume_TopoOpt : public QoI<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+
+private:
+  const ROL::Ptr<fe_type> fe_;
+  scalar_view ones_, volFrac_;
+
+public:
+  QoI_Volume_TopoOpt(const ROL::Ptr<fe_type> &fe,
+                     const Real v0 = Real(0))
+  : fe_(fe) {
+    // Get relevant dimensions
+    int c = fe_->cubPts().extent_int(0);
+    int p = fe_->cubPts().extent_int(1);
+    ones_ = scalar_view("ones_",c,p);
+    volFrac_ = scalar_view("volFrac_",c,p);
+    Kokkos::deep_copy(ones_,static_cast<Real>(1));
+    Kokkos::deep_copy(volFrac_,v0);
+    //for (int i = 0; i < c; ++i) {
+    //  for (int j = 0; j < p; ++j) {
+    //    ones_(i,j) = static_cast<Real>(1);
+    //    volFrac_(i,j) = v0;
+    //  }
+    //}
+  }
+
+  Real value(scalar_view & val,
+             const scalar_view u_coeff,
+             const scalar_view z_coeff = scalar_view(),
+             const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = fe_->gradN().extent_int(0);
+    int p = fe_->gradN().extent_int(2);
+
+    // Initialize output val
+    val = scalar_view("val",c);
+    // Evaluate on FE basis
+    scalar_view valZ_eval("valZ_eval", c, p);
+    fe_->evaluateValue(valZ_eval, z_coeff);
+    rst::subtract(valZ_eval,volFrac_);
+
+    // Compute energy
+    fe_->computeIntegral(val,valZ_eval,ones_);
+    return static_cast<Real>(0);
+  }
+
+  void gradient_1(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Volume_TopoOpt::gradient_1 is zero.");
+  }
+
+  void gradient_2(scalar_view & grad,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Get relevant dimensions
+    int c = fe_->gradN().extent_int(0);
+    int f = fe_->gradN().extent_int(1);
+
+    // Initialize output grad
+    grad = scalar_view("grad", c, f);
+
+    // Compute gradient of energy
+    fst::integrate(grad,ones_,fe_->NdetJ(),false);
+  }
+
+  void HessVec_11(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Volume_TopoOpt::HessVec_11 is zero.");
+  }
+
+  void HessVec_12(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Volume_TopoOpt::HessVec_12 is zero.");
+  }
+
+  void HessVec_21(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Volume_TopoOpt::HessVec_21 is zero.");
+  }
+
+  void HessVec_22(scalar_view & hess,
+                  const scalar_view v_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> QoI_Volume_TopoOpt::HessVec_22 is zero.");
+  }
+
+}; // QoI_VolumeObj
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/topo-opt/src/pde_filterK.hpp b/packages/rol/example/PDE-OPT/topo-opt/src/pde_filterK.hpp
new file mode 100644
index 000000000000..855599b84092
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/topo-opt/src/pde_filterK.hpp
@@ -0,0 +1,253 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  pde_filter.hpp
+    \brief Implements the local PDE interface for the structural topology
+           optimization problem.
+*/
+
+#ifndef PDE_TOPO_OPT_FILTERK_HPP
+#define PDE_TOPO_OPT_FILTERK_HPP
+
+#include "../../TOOLS/pdeK.hpp"
+#include "../../TOOLS/feK.hpp"
+
+#include "Intrepid2_HVOL_C0_FEM.hpp"
+#include "Intrepid2_HGRAD_QUAD_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_QUAD_C2_FEM.hpp"
+#include "Intrepid2_HGRAD_HEX_C1_FEM.hpp"
+#include "Intrepid2_HGRAD_HEX_C2_FEM.hpp"
+#include "Intrepid2_DefaultCubatureFactory.hpp"
+#include "Intrepid2_FunctionSpaceTools.hpp"
+#include "Intrepid2_CellTools.hpp"
+
+#include "ROL_Ptr.hpp"
+
+
+template <class Real, class DeviceType>
+class PDE_Filter : public PDE<Real,DeviceType> {
+public:
+  using scalar_view = Kokkos::DynRankView<Real, DeviceType>;
+  using basis_ptr   = Intrepid2::BasisPtr<DeviceType, Real, Real>;
+  using fe_type     = FE<Real,DeviceType>;
+  using fst         = Intrepid2::FunctionSpaceTools<DeviceType>;
+  using ct          = Intrepid2::CellTools<DeviceType>;
+  using rst         = Intrepid2::RealSpaceTools<DeviceType>;
+private:
+  // Finite element basis information
+  basis_ptr basisPtr_, basisPtrDens_;
+  std::vector<basis_ptr> basisPtrs_, basisPtrsDens_;
+  // Cell cubature information
+  ROL::Ptr<Intrepid2::Cubature<DeviceType,Real,Real>> cellCub_;
+  // Cell node information
+  scalar_view volCellNodes_;
+  // Finite element definition
+  ROL::Ptr<fe_type> fe_, feDens_;
+  // Problem parameters.
+  Real lengthScale_;
+
+public:
+  PDE_Filter(Teuchos::ParameterList &parlist) {
+    // Finite element fields.
+    int basisOrder     = parlist.sublist("Problem").get("Filter Basis Order",1);
+    int basisOrderDens = parlist.sublist("Problem").get("Density Basis Order",0);
+    int cubDegree      = parlist.sublist("Problem").get("Cubature Degree",4);
+//    int bdryCubDegree = parlist.sublist("Problem").get("Boundary Cubature Degree",2);
+    int probDim        = parlist.sublist("Problem").get("Problem Dimension",2);
+    TEUCHOS_TEST_FOR_EXCEPTION(probDim>3||probDim<2, std::invalid_argument,
+      ">>> PDE-OPT/poisson/pde_poisson.hpp: Problem dimension is not 2 or 3!");
+    TEUCHOS_TEST_FOR_EXCEPTION(basisOrder>2||basisOrder<1, std::invalid_argument,
+      ">>> PDE-OPT/poisson/pde_poisson.hpp: Basis order is not 1 or 2!");
+    if (probDim == 2) {
+      if (basisOrder == 1)
+        basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C1_FEM<DeviceType, Real, Real>>();
+      else if (basisOrder == 2)
+        basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C2_FEM<DeviceType, Real, Real>>();
+    }
+    else if (probDim == 3) {
+      if (basisOrder == 1)
+        basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_HEX_C1_FEM<DeviceType, Real, Real>>();
+      else if (basisOrder == 2)
+        basisPtr_ = ROL::makePtr<Intrepid2::Basis_HGRAD_HEX_C2_FEM<DeviceType, Real, Real>>();
+    }
+    shards::CellTopology cellType = basisPtr_->getBaseCellTopology();        // get cell type from basis
+    if (probDim == 2) {
+      if (basisOrderDens == 1)
+        basisPtrDens_ = ROL::makePtr<Intrepid2::Basis_HGRAD_QUAD_C1_FEM<DeviceType, Real, Real>>();
+      else
+        basisPtrDens_ = ROL::makePtr<Intrepid2::Basis_HVOL_C0_FEM<DeviceType, Real, Real>>(cellType);
+    }
+    else if (probDim == 3) {
+      if (basisOrderDens == 1)
+        basisPtrDens_ = ROL::makePtr<Intrepid2::Basis_HGRAD_HEX_C1_FEM<DeviceType, Real, Real>>();
+      else
+        basisPtrDens_ = ROL::makePtr<Intrepid2::Basis_HVOL_C0_FEM<DeviceType, Real, Real>>(cellType);
+    }
+    basisPtrs_.clear(); basisPtrsDens_.clear();
+    basisPtrs_.push_back(basisPtr_);  // Filtered Density component
+    basisPtrsDens_.push_back(basisPtrDens_); // Density component
+
+    // Quadrature rules.
+    Intrepid2::DefaultCubatureFactory cubFactory;                            // create cubature factory
+    cellCub_ = cubFactory.create<DeviceType,Real,Real>(cellType, cubDegree); // create default cubature
+
+    // Other problem parameters.
+    Real filterRadius = parlist.sublist("Problem").get("Filter Radius",  0.1);
+    lengthScale_ = std::pow(filterRadius, 2)/static_cast<Real>(12);
+  }
+
+  void residual(scalar_view & res,
+                const scalar_view u_coeff,
+                const scalar_view z_coeff = scalar_view(),
+                const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c = fe_->gradN().extent_int(0);
+    const int f = fe_->gradN().extent_int(1);
+    const int p = fe_->gradN().extent_int(2);
+    const int d = fe_->gradN().extent_int(3);
+ 
+    // Initialize residuals.
+    res = scalar_view("res",c,f);
+
+    // Evaluate/interpolate finite element fields on cells.
+    scalar_view valU_eval  = scalar_view("valU_eval", c, p);
+    scalar_view valZ_eval  = scalar_view("valZ_eval", c, p);
+    scalar_view gradU_eval = scalar_view("gradU_eval", c, p, d);
+    fe_->evaluateValue(valU_eval, u_coeff);
+    feDens_->evaluateValue(valZ_eval, z_coeff);
+    fe_->evaluateGradient(gradU_eval, u_coeff);
+
+    rst::scale(gradU_eval, lengthScale_);
+    rst::scale(valZ_eval,  static_cast<Real>(-1));
+
+    /*** Evaluate weak form of the residual. ***/
+    fst::integrate(res,gradU_eval,fe_->gradNdetJ(),false);
+    fst::integrate(res,valU_eval,fe_->NdetJ(),true);
+    fst::integrate(res,valZ_eval,fe_->NdetJ(),true);
+  }
+
+  void Jacobian_1(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c = fe_->gradN().extent_int(0);
+    const int f = fe_->gradN().extent_int(1);
+ 
+    // Initialize Jacobians.
+    jac = scalar_view("jac",c,f,f);
+
+    /*** Evaluate weak form of the Jacobian. ***/
+    Kokkos::deep_copy(jac,fe_->stiffMat());
+    rst::scale(jac, lengthScale_); // ls*gradN1 . gradN2
+    rst::add(jac,fe_->massMat());  // + N1 * N2
+  }
+
+
+  void Jacobian_2(scalar_view & jac,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    // Retrieve dimensions.
+    const int c  = fe_->gradN().extent_int(0);
+    const int f  = fe_->gradN().extent_int(1);
+    const int fd = feDens_->gradN().extent_int(1);
+ 
+    // Initialize Jacobians.
+    jac = scalar_view("jac",c,f,fd);
+
+    /*** Evaluate weak form of the Jacobian. ***/
+    fst::integrate(jac,fe_->NdetJ(),feDens_->N(),false);
+    rst::scale(jac, static_cast<Real>(-1));
+  }
+
+  void Hessian_11(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Filter::Hessian_11): Hessian is zero.");
+  }
+
+  void Hessian_12(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Filter::Hessian_12): Hessian is zero.");
+  }
+
+  void Hessian_21(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Filter::Hessian_21): Hessian is zero.");
+  }
+
+  void Hessian_22(scalar_view & hess,
+                  const scalar_view l_coeff,
+                  const scalar_view u_coeff,
+                  const scalar_view z_coeff = scalar_view(),
+                  const ROL::Ptr<const std::vector<Real>> & z_param = ROL::nullPtr) override {
+    throw Exception::Zero(">>> (PDE_Filter::Hessian_22): Hessian is zero.");
+  }
+
+  void RieszMap_1(scalar_view & riesz) override {
+    // Retrieve dimensions.
+    const int c = fe_->gradN().extent_int(0);
+    const int f = fe_->gradN().extent_int(1);
+ 
+    // Initialize Jacobians.
+    riesz = scalar_view("riesz1",c,f,f);
+    Kokkos::deep_copy(riesz,fe_->stiffMat());
+    rst::add(riesz,fe_->massMat());
+  }
+
+  void RieszMap_2(scalar_view & riesz) override {
+    // Retrieve dimensions.
+    const int c = feDens_->gradN().extent_int(0);
+    const int f = feDens_->gradN().extent_int(1);
+ 
+    // Initialize Jacobians.
+    riesz = scalar_view("riesz2",c,f,f);
+    Kokkos::deep_copy(riesz,feDens_->massMat());
+  }
+
+  std::vector<basis_ptr> getFields() override {
+    return basisPtrs_;
+  }
+
+  std::vector<basis_ptr> getFields2() override {
+    return basisPtrsDens_;
+  }
+
+  void setCellNodes(const scalar_view &volCellNodes,
+                    const std::vector<std::vector<scalar_view>> &bdryCellNodes,
+                    const std::vector<std::vector<std::vector<int>>> &bdryCellLocIds) override {
+    volCellNodes_ = volCellNodes;
+    // Finite element definition.
+    fe_     = ROL::makePtr<fe_type>(volCellNodes_,basisPtr_,cellCub_);
+    feDens_ = ROL::makePtr<fe_type>(volCellNodes_,basisPtrDens_,cellCub_,false);
+  }
+
+  void setFieldPattern(const std::vector<std::vector<int>> &fieldPattern,
+                       const std::vector<std::vector<int>> &fieldPattern2) override {}
+
+  const ROL::Ptr<fe_type> getStateFE(void) const {
+    return fe_;
+  }
+
+  const ROL::Ptr<fe_type> getDensityFE(void) const {
+    return feDens_;
+  }
+
+}; // PDE_Filter
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/topo-opt/src/volume_conK.hpp b/packages/rol/example/PDE-OPT/topo-opt/src/volume_conK.hpp
new file mode 100644
index 000000000000..5a7a950f90f0
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/topo-opt/src/volume_conK.hpp
@@ -0,0 +1,135 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+#ifndef TOPOPT_VOLUME_CONK_H
+#define TOPOPT_VOLUME_CONK_H
+
+#include "ROL_Constraint.hpp"
+#include "ROL_SingletonVector.hpp"
+#include "../../TOOLS/qoiK.hpp"
+#include "../../TOOLS/assemblerK.hpp"
+#include "../../TOOLS/pdevectorK.hpp"
+
+template <class Real, class DeviceType>
+class TopOptVolumeConstraint : public ROL::Constraint<Real> {
+private:
+  // Elasticity PDE
+  const ROL::Ptr<QoI<Real,DeviceType>> qoi_;
+  const ROL::Ptr<Assembler<Real,DeviceType>> assembler_;
+  bool assemble_;
+  ROL::Ptr<Tpetra::MultiVector<>> controlZero_;
+  ROL::Ptr<ROL::Vector<Real>> V_, Vdual_;
+  Real V0_;
+
+  unsigned nupda_;
+  unsigned nfval_;
+  unsigned njaco_;
+  unsigned najac_;
+  unsigned nhess_;
+  unsigned nasse_;
+
+public:
+  TopOptVolumeConstraint(
+      const ROL::Ptr<QoI<Real,DeviceType>>         &qoi,
+      const ROL::Ptr<Assembler<Real,DeviceType>>   &assembler,
+      const ROL::Ptr<const ROL::Vector<Real>>      &z)
+    : qoi_(qoi), assembler_(assembler), assemble_(true),
+      nupda_(0), nfval_(0), njaco_(0), najac_(0), nhess_(0), nasse_(0) {
+    controlZero_ = assembler_->createControlVector(); controlZero_->putScalar(0.0);
+    V_     = z->dual().clone();
+    Vdual_ = z->clone();
+    assemble();
+  }
+
+  void update( const ROL::Vector<Real> &z, bool flag = true, int iter = -1 ) {
+    nupda_++;
+  }
+
+  void value( ROL::Vector<Real> &c, const ROL::Vector<Real> &z, Real &tol ) {
+    nfval_++;
+    dynamic_cast<ROL::SingletonVector<Real>&>(c).setValue(V0_ + Vdual_->dot(z));
+  }
+
+  void applyJacobian(ROL::Vector<Real> &jv, const ROL::Vector<Real> &v,
+                     const ROL::Vector<Real> &z, Real &tol) {
+    njaco_++;
+    dynamic_cast<ROL::SingletonVector<Real>&>(jv).setValue(Vdual_->dot(v));
+  }
+
+  void applyAdjointJacobian( ROL::Vector<Real> &jv, const ROL::Vector<Real> &v, const ROL::Vector<Real> &z, Real &tol ) {
+    najac_++;
+    Real vval = dynamic_cast<const ROL::SingletonVector<Real>&>(v).getValue();
+    jv.set(*V_);
+    jv.scale(vval);
+  }
+
+  void applyAdjointHessian( ROL::Vector<Real> &ahwv, const ROL::Vector<Real> &w, const ROL::Vector<Real> &v, const ROL::Vector<Real> &z, Real &tol ) {
+    nhess_++;
+    ahwv.zero();
+  }
+
+  void summarize(std::ostream &stream) const {
+    stream << std::endl;
+    stream << std::string(114,'=') << std::endl;
+    stream << "  Volume_Constraint::summarize" << std::endl;
+    stream << "    Number of calls to update:               " << nupda_ << std::endl;
+    stream << "    Number of calls to value:                " << nfval_ << std::endl;
+    stream << "    Number of calls to applyJacobian:        " << njaco_ << std::endl;
+    stream << "    Number of calls to applyAdjointJacobian: " << najac_ << std::endl;
+    stream << "    Number of calls to applyAdjointHessian:  " << nhess_ << std::endl;
+    stream << "    Number of assemblies:                    " << nasse_ << std::endl;
+    stream << std::string(114,'=') << std::endl;
+    stream << std::endl;
+  }
+
+// For parametrized (stochastic) objective functions and constraints
+public:
+  void setParameter(const std::vector<Real> &param) {
+    ROL::Constraint<Real>::setParameter(param);
+    qoi_->setParameter(param);
+  }
+
+private:
+
+  ROL::Ptr<Tpetra::MultiVector<>> getField(ROL::Vector<Real> &x) const {
+    ROL::Ptr<Tpetra::MultiVector<>> xp;
+    try {
+      xp = dynamic_cast<ROL::TpetraMultiVector<Real>&>(x).getVector();
+    }
+    catch (std::exception &e) {
+      try {
+        ROL::Ptr<ROL::TpetraMultiVector<Real>> xvec
+          = dynamic_cast<PDE_OptVector<Real>&>(x).getField();
+        if ( xvec == ROL::nullPtr ) {
+          xp = ROL::nullPtr;
+        }
+        else {
+          xp = xvec->getVector();
+        }
+      }
+      catch (std::exception &ee) {
+        xp = ROL::nullPtr;
+      }
+    }
+    return xp;
+  }
+
+  void assemble(void) {
+    nasse_++;
+    if (assemble_) {
+      ROL::Ptr<Tpetra::MultiVector<>> V = getField(*V_);
+      V0_ = assembler_->assembleQoIValue(qoi_,ROL::nullPtr,controlZero_);
+      assembler_->assembleQoIGradient2(V,qoi_,ROL::nullPtr,controlZero_);
+      Vdual_->set(V_->dual());
+      assemble_ = false;
+    }
+  }
+}; // class Volume_Constraint
+
+#endif
diff --git a/packages/rol/example/PDE-OPT/topo-opt/src/volume_objK.hpp b/packages/rol/example/PDE-OPT/topo-opt/src/volume_objK.hpp
new file mode 100644
index 000000000000..ec8a0261924b
--- /dev/null
+++ b/packages/rol/example/PDE-OPT/topo-opt/src/volume_objK.hpp
@@ -0,0 +1,121 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+#ifndef TOPOPT_VOLUME_OBJK_H
+#define TOPOPT_VOLUME_OBJK_H
+
+#include "ROL_Objective.hpp"
+#include "../../TOOLS/qoiK.hpp"
+#include "../../TOOLS/assemblerK.hpp"
+#include "../../TOOLS/pdevectorK.hpp"
+
+template <class Real, class DeviceType>
+class TopOptVolumeObjective : public ROL::Objective<Real> {
+private:
+  const ROL::Ptr<QoI<Real,DeviceType>> qoi_;
+  const ROL::Ptr<Assembler<Real,DeviceType>> assembler_;
+  bool assemble_;
+  ROL::Ptr<Tpetra::MultiVector<>> controlZero_;
+  ROL::Ptr<ROL::Vector<Real>> V_, Vdual_;
+
+  unsigned nupda_;
+  unsigned nfval_;
+  unsigned ngrad_;
+  unsigned nhess_;
+  unsigned nasse_;
+
+public:
+  TopOptVolumeObjective(
+      const ROL::Ptr<QoI<Real,DeviceType>>       &qoi,
+      const ROL::Ptr<Assembler<Real,DeviceType>> &assembler,
+      const ROL::Ptr<const ROL::Vector<Real>>    &z)
+    : qoi_(qoi), assembler_(assembler), assemble_(true),
+      nupda_(0), nfval_(0), ngrad_(0), nhess_(0), nasse_(0) {
+    controlZero_ = assembler_->createControlVector(); controlZero_->putScalar(0.0);
+    V_     = z->dual().clone();
+    Vdual_ = z->clone();
+    assemble();
+  }
+
+  void update( const ROL::Vector<Real> &z, bool flag = true, int iter = -1 ) {
+    nupda_++;
+  }
+
+  Real value( const ROL::Vector<Real> &z, Real &tol ) {
+    nfval_++;
+    return Vdual_->dot(z);
+  }
+
+  void gradient( ROL::Vector<Real> &g, const ROL::Vector<Real> &z, Real &tol ) {
+    ngrad_++;
+    g.set(*V_);
+  }
+
+  void hessVec( ROL::Vector<Real> &hv, const ROL::Vector<Real> &v, const ROL::Vector<Real> &z, Real &tol ) {
+    nhess_++;
+    hv.zero();
+  }
+
+  void summarize(std::ostream &stream) const {
+    stream << std::endl;
+    stream << std::string(114,'=') << std::endl;
+    stream << "  Volume_Objective::summarize" << std::endl;
+    stream << "    Number of calls to update:   " << nupda_ << std::endl;
+    stream << "    Number of calls to value:    " << nfval_ << std::endl;
+    stream << "    Number of calls to gradient: " << ngrad_ << std::endl;
+    stream << "    Number of calls to hessVec:  " << nhess_ << std::endl;
+    stream << "    Number of assemblies:        " << nasse_ << std::endl;
+    stream << std::string(114,'=') << std::endl;
+    stream << std::endl;
+  }
+
+// For parametrized (stochastic) objective functions and constraints
+public:
+  void setParameter(const std::vector<Real> &param) {
+    ROL::Objective<Real>::setParameter(param);
+    qoi_->setParameter(param);
+  }
+
+private:
+
+  ROL::Ptr<Tpetra::MultiVector<>> getField(ROL::Vector<Real> &x) const {
+    ROL::Ptr<Tpetra::MultiVector<>> xp;
+    try {
+      xp = dynamic_cast<ROL::TpetraMultiVector<Real>&>(x).getVector();
+    }
+    catch (std::exception &e) {
+      try {
+        ROL::Ptr<ROL::TpetraMultiVector<Real>> xvec
+          = dynamic_cast<PDE_OptVector<Real>&>(x).getField();
+        if ( xvec == ROL::nullPtr ) {
+          xp = ROL::nullPtr;
+        }
+        else {
+          xp = xvec->getVector();
+        }
+      }
+      catch (std::exception &ee) {
+        xp = ROL::nullPtr;
+      }
+    }
+    return xp;
+  }
+
+  void assemble(void) {
+    nasse_++;
+    if (assemble_) {
+      ROL::Ptr<Tpetra::MultiVector<>> V = getField(*V_);
+      assembler_->assembleQoIGradient2(V,qoi_,ROL::nullPtr,controlZero_);
+      Vdual_->set(V_->dual());
+      assemble_ = false;
+    }
+  }
+}; // class TopOptVolumeObjective
+
+#endif
diff --git a/packages/rol/example/PinT/Van_der_Pol/CMakeLists.txt b/packages/rol/example/PinT/Van_der_Pol/CMakeLists.txt
index e97cbafde5ab..9997ae4df56e 100644
--- a/packages/rol/example/PinT/Van_der_Pol/CMakeLists.txt
+++ b/packages/rol/example/PinT/Van_der_Pol/CMakeLists.txt
@@ -1,4 +1,4 @@
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   example_01
   SOURCES example_01.cpp
   ARGS PrintItAll
@@ -8,7 +8,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
 )
 
 
-TRIBITS_COPY_FILES_TO_BINARY_DIR(
+ROL_COPY_FILES_TO_BINARY_DIR(
   Van_der_PolDataCopy
   SOURCE_FILES
     VdP_Parameters.xml
diff --git a/packages/rol/example/PinT/Van_der_Pol/example_01.cpp b/packages/rol/example/PinT/Van_der_Pol/example_01.cpp
index bdf94f7a80e2..47563a699826 100644
--- a/packages/rol/example/PinT/Van_der_Pol/example_01.cpp
+++ b/packages/rol/example/PinT/Van_der_Pol/example_01.cpp
@@ -9,7 +9,7 @@
 
 #include <iomanip>
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "ROL_Stream.hpp"
 #include "ROL_DynamicConstraintCheck.hpp"
@@ -33,7 +33,7 @@ int main( int argc, char* argv[] ) {
   using PV        = PartitionedVector<RealT>;
   using size_type = typename PV::size_type;
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);  
+  ROL::GlobalMPISession mpiSession(&argc, &argv);  
 
   auto outStream = makeStreamPtr( std::cout, argc > 1 );    
   int  errorFlag = 0;
diff --git a/packages/rol/example/PinT/nonlinear/CMakeLists.txt b/packages/rol/example/PinT/nonlinear/CMakeLists.txt
index ffcf7abfc162..93b41a4a82e8 100644
--- a/packages/rol/example/PinT/nonlinear/CMakeLists.txt
+++ b/packages/rol/example/PinT/nonlinear/CMakeLists.txt
@@ -1,7 +1,7 @@
-TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/mpi/src/function)
-TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/mpi/src/vector)
+ROL_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/mpi/src/function)
+ROL_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/mpi/src/vector)
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   example_01
   SOURCES example_01.cpp
   ARGS PrintItAll
@@ -10,7 +10,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
 )
 
-TRIBITS_COPY_FILES_TO_BINARY_DIR(
+ROL_COPY_FILES_TO_BINARY_DIR(
   PinTNonlinearDataCopy
   SOURCE_FILES
     input_ex01.xml
diff --git a/packages/rol/example/PinT/nonlinear/example_01.cpp b/packages/rol/example/PinT/nonlinear/example_01.cpp
index 90fc54357425..d2f7a8310441 100644
--- a/packages/rol/example/PinT/nonlinear/example_01.cpp
+++ b/packages/rol/example/PinT/nonlinear/example_01.cpp
@@ -22,7 +22,7 @@
 #include "dynamicConstraint.hpp"
 #include "dynamicObjective.hpp"
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "ROL_Stream.hpp"
 #include "ROL_ParameterList.hpp"
 #include "ROL_ReducedDynamicObjective.hpp"
@@ -39,7 +39,7 @@ int main(int argc, char *argv[]) {
   using RealT = double;
   using luint  = std::vector<RealT>::size_type;
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   ROL::Ptr<std::ostream> outStream = ROL::makeStreamPtr( std::cout, argc > 1 );
diff --git a/packages/rol/example/PinT/parabolic-control/AugmentedSystem.cpp b/packages/rol/example/PinT/parabolic-control/AugmentedSystem.cpp
index c19abbb60aee..0ee0b63616dc 100644
--- a/packages/rol/example/PinT/parabolic-control/AugmentedSystem.cpp
+++ b/packages/rol/example/PinT/parabolic-control/AugmentedSystem.cpp
@@ -15,8 +15,7 @@
 #include "ROL_Ptr.hpp"
 
 #include "Teuchos_oblackholestream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Teuchos_StackedTimer.hpp"
 
 #include "ROL_Stream.hpp"
@@ -36,14 +35,14 @@ using size_type = std::vector<RealT>::size_type;
 
 void run_test_kkt(MPI_Comm comm, const ROL::Ptr<std::ostream> & outStream);
 
-int main( int argc, char* argv[] ) 
+int main( int argc, char* argv[] )
 {
   using ROL::Ptr;
   using ROL::makePtr;
   using ROL::makePtrFromRef;
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);  
-  int myRank = Teuchos::GlobalMPISession::getRank();
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
+  int myRank = ROL::GlobalMPISession::getRank();
 
   auto outStream = ROL::makeStreamPtr( std::cout, argc > 1 );
 
@@ -157,11 +156,11 @@ void run_test_kkt(MPI_Comm comm, const ROL::Ptr<std::ostream> & outStream)
   }
 
   // Add MGRIT parameter list stuff
-  int sweeps    = pl->get("MGRIT Sweeps", 1); 
+  int sweeps    = pl->get("MGRIT Sweeps", 1);
   RealT omega   = pl->get("MGRIT Relaxation",2.0/3.0);
   int numLevels = pl->get("MGRIT Levels",3);
   double relTol = pl->get("MGRIT Krylov Relative Tolerance",1e-4);
-  
+
   if(myRank==0) {
     (*outStream) << "Sweeps = " << sweeps    << std::endl;
     (*outStream) << "Omega = "  << omega     << std::endl;
@@ -211,7 +210,7 @@ void run_test_kkt(MPI_Comm comm, const ROL::Ptr<std::ostream> & outStream)
   if(myRank==0)
     (*outStream) << std::endl;
 
-  // check jacobi is reducing the norm 
+  // check jacobi is reducing the norm
   if(false) {
     auto kkt_x_out = kkt_vector->clone();
     auto kkt_diff = kkt_vector->clone();
@@ -230,7 +229,7 @@ void run_test_kkt(MPI_Comm comm, const ROL::Ptr<std::ostream> & outStream)
         res_0 = norm_res;
       if(myRank==0)
         (*outStream) << "NORM RES = " << norm_res << "  " << norm_res/res_0<< std::endl;
-    
+
       pint_con->applyWathenInverse(*kkt_diff,*kkt_res,*state,*control,tol,true); // dx_i = M^{-1} r
 
       kkt_x_out->axpy(omega,*kkt_diff);                                                // x_{i+1} = x_i + omega*dx_i
@@ -269,7 +268,7 @@ void run_test_kkt(MPI_Comm comm, const ROL::Ptr<std::ostream> & outStream)
     Teuchos::ParameterList parlist;
     parlist.set("Absolute Tolerance",1.e-1);
     parlist.set("Relative Tolerance",relTol);
- 
+
     ROL::GMRES<RealT> krylov(parlist); // TODO: Do Belos
 
     int flag = 0, iter = 0;
diff --git a/packages/rol/example/PinT/parabolic-control/CMakeLists.txt b/packages/rol/example/PinT/parabolic-control/CMakeLists.txt
index cf82a78f506f..fce07d8d671e 100644
--- a/packages/rol/example/PinT/parabolic-control/CMakeLists.txt
+++ b/packages/rol/example/PinT/parabolic-control/CMakeLists.txt
@@ -1,7 +1,7 @@
-TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/mpi/src/function)
-TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/mpi/src/vector)
+ROL_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/mpi/src/function)
+ROL_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/mpi/src/vector)
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   example_01
   SOURCES example_01.cpp
   ARGS PrintItAll
@@ -10,7 +10,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
 )
 
-TRIBITS_COPY_FILES_TO_BINARY_DIR(
+ROL_COPY_FILES_TO_BINARY_DIR(
   PinTParabolicControlDataCopy
   SOURCE_FILES
     input_ex01.xml
@@ -21,7 +21,7 @@ TRIBITS_COPY_FILES_TO_BINARY_DIR(
 
 IF( TPL_ENABLE_MPI )
 
-  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  ROL_ADD_EXECUTABLE_AND_TEST(
       PinTConstraint_FD_Test
       SOURCES PinTConstraint_FD_Test.cpp
       ARGS PrintItAll
@@ -30,7 +30,7 @@ IF( TPL_ENABLE_MPI )
       ADD_DIR_TO_NAME
       )
 
-  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  ROL_ADD_EXECUTABLE_AND_TEST(
       AugmentedSystem_test
       SOURCES AugmentedSystem.cpp
       ARGS PrintItAll
diff --git a/packages/rol/example/PinT/parabolic-control/PinTConstraint_FD_Test.cpp b/packages/rol/example/PinT/parabolic-control/PinTConstraint_FD_Test.cpp
index 37fd34fda5f5..d46a581c49b7 100644
--- a/packages/rol/example/PinT/parabolic-control/PinTConstraint_FD_Test.cpp
+++ b/packages/rol/example/PinT/parabolic-control/PinTConstraint_FD_Test.cpp
@@ -12,7 +12,7 @@
 #include <random>
 #include <utility>
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "ROL_Stream.hpp"
 #include "ROL_ParameterList.hpp"
@@ -57,9 +57,9 @@ int main( int argc, char* argv[] )
   using ROL::makePtr;
   using ROL::makePtrFromRef;
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);  
-  int myRank = Teuchos::GlobalMPISession::getRank();
-//  int numProcs = Teuchos::GlobalMPISession::getNProc();
+  ROL::GlobalMPISession mpiSession(&argc, &argv);  
+  int myRank = ROL::GlobalMPISession::getRank();
+//  int numProcs = ROL::GlobalMPISession::getNProc();
 
 //  int iprint     = argc - 1;
   auto outStream = ROL::makeStreamPtr( std::cout, argc > 1 && myRank==0 );
diff --git a/packages/rol/example/PinT/parabolic-control/dynamicConstraint.hpp b/packages/rol/example/PinT/parabolic-control/dynamicConstraint.hpp
index 7e93925cc9a7..382f51a4824f 100644
--- a/packages/rol/example/PinT/parabolic-control/dynamicConstraint.hpp
+++ b/packages/rol/example/PinT/parabolic-control/dynamicConstraint.hpp
@@ -25,7 +25,7 @@
 #include "ROL_ParameterList.hpp"
 #include "ROL_StdVector.hpp"
 #include "ROL_DynamicConstraint.hpp"
-#include "Teuchos_LAPACK.hpp"
+#include "ROL_LAPACK.hpp"
 
 template<class Real>
 class Constraint_ParabolicControl : public ROL::DynamicConstraint<Real> {
@@ -299,7 +299,7 @@ class Constraint_ParabolicControl : public ROL::DynamicConstraint<Real> {
               const std::vector<Real> &r) const {
     u.assign(r.begin(),r.end());
     // Perform LDL factorization
-    Teuchos::LAPACK<int,Real> lp;
+    ROL::LAPACK<int,Real> lp;
     int nx = static_cast<int>(nx_);
     int info;
     int ldb  = nx;
diff --git a/packages/rol/example/PinT/parabolic-control/example_01.cpp b/packages/rol/example/PinT/parabolic-control/example_01.cpp
index f62dc4777a9b..430d0a9a1a86 100644
--- a/packages/rol/example/PinT/parabolic-control/example_01.cpp
+++ b/packages/rol/example/PinT/parabolic-control/example_01.cpp
@@ -31,7 +31,7 @@
 #include "dynamicConstraint.hpp"
 #include "dynamicObjective.hpp"
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "ROL_Stream.hpp"
 #include "ROL_ParameterList.hpp"
 #include "ROL_ReducedDynamicObjective.hpp"
@@ -48,7 +48,7 @@ int main(int argc, char *argv[]) {
   using RealT = double;
   using luint  = std::vector<RealT>::size_type;
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   ROL::Ptr<std::ostream> outStream = ROL::makeStreamPtr( std::cout, argc > 1 );
diff --git a/packages/rol/example/PinT/tanks/CMakeLists.txt b/packages/rol/example/PinT/tanks/CMakeLists.txt
index 176d753464c9..9ecd55161509 100644
--- a/packages/rol/example/PinT/tanks/CMakeLists.txt
+++ b/packages/rol/example/PinT/tanks/CMakeLists.txt
@@ -1,7 +1,7 @@
-TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/mpi/src/function)
-TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/mpi/src/vector)
+ROL_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/mpi/src/function)
+ROL_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/mpi/src/vector)
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   example_01
   SOURCES example_01.cpp
   ARGS PrintItAll
@@ -10,7 +10,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
 )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   example_04
   SOURCES example_04.cpp
   ARGS PrintItAll
@@ -19,7 +19,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
 )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   LowerBandedMatrix_test
   SOURCES LowerBandedMatrix_test.cpp
   ARGS PrintItAll
@@ -28,7 +28,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
 )
 
-TRIBITS_COPY_FILES_TO_BINARY_DIR(
+ROL_COPY_FILES_TO_BINARY_DIR(
   TanksDataCopy
   SOURCE_FILES
     tank-parameters.xml
diff --git a/packages/rol/example/PinT/tanks/LowerBandedMatrix_test.cpp b/packages/rol/example/PinT/tanks/LowerBandedMatrix_test.cpp
index c9beab2781f0..cf51e580639a 100644
--- a/packages/rol/example/PinT/tanks/LowerBandedMatrix_test.cpp
+++ b/packages/rol/example/PinT/tanks/LowerBandedMatrix_test.cpp
@@ -13,7 +13,7 @@
 #include <utility>
 #include "ROL_Ptr.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "LowerBandedMatrix.hpp"
 
 using RealT = double;
@@ -30,7 +30,7 @@ int main( int argc, char* argv[] ) {
   default_random_engine gen(r());
   uniform_real_distribution<RealT> dist(1.0, 2.0);
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);  
+  ROL::GlobalMPISession mpiSession(&argc, &argv);  
 
   int iprint     = argc - 1;
   ROL::Ptr<std::ostream> outStream;
diff --git a/packages/rol/example/PinT/tanks/example_01.cpp b/packages/rol/example/PinT/tanks/example_01.cpp
index 6ae42a3e544c..63b16e63c160 100644
--- a/packages/rol/example/PinT/tanks/example_01.cpp
+++ b/packages/rol/example/PinT/tanks/example_01.cpp
@@ -7,7 +7,7 @@
 // *****************************************************************************
 // @HEADER
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "ROL_Stream.hpp"
 #include "ROL_ParameterList.hpp"
 #include "ROL_SerialConstraint.hpp"
@@ -34,7 +34,7 @@ int main( int argc, char* argv[] ) {
   using State             = Tanks::StateVector<RealT>;
   using Control           = Tanks::ControlVector<RealT>;
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);  
+  ROL::GlobalMPISession mpiSession(&argc, &argv);  
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   auto outStream = ROL::makeStreamPtr( std::cout, argc > 1 );
diff --git a/packages/rol/example/PinT/tanks/example_04.cpp b/packages/rol/example/PinT/tanks/example_04.cpp
index 74ce2aeb079b..6cb30705d5db 100644
--- a/packages/rol/example/PinT/tanks/example_04.cpp
+++ b/packages/rol/example/PinT/tanks/example_04.cpp
@@ -7,7 +7,7 @@
 // *****************************************************************************
 // @HEADER
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "ROL_Stream.hpp"
 #include "ROL_ParameterList.hpp"
 #include "ROL_ReducedDynamicObjective.hpp"
@@ -24,7 +24,7 @@ int main( int argc, char* argv[] ) {
   using RealT     = double;
   using size_type = std::vector<RealT>::size_type;
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   auto outStream = ROL::makeStreamPtr( std::cout, argc > 1 );
diff --git a/packages/rol/example/arrayfire/CMakeLists.txt b/packages/rol/example/arrayfire/CMakeLists.txt
index 2fa26f0231cc..3dedf174d65d 100644
--- a/packages/rol/example/arrayfire/CMakeLists.txt
+++ b/packages/rol/example/arrayfire/CMakeLists.txt
@@ -4,7 +4,7 @@ IF(${PACKAGE_NAME}_ENABLE_ArrayFireCPU)
   # Need ROL_ArrayFireVector.hpp
   TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/arrayfire/src/vector)
 
-  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  ROL_ADD_EXECUTABLE_AND_TEST(
     example_01
     SOURCES example_01.cpp
     ARGS PrintItAll
@@ -13,7 +13,7 @@ IF(${PACKAGE_NAME}_ENABLE_ArrayFireCPU)
     ADD_DIR_TO_NAME
   )
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     ArrayFireDataCopy
     SOURCE_FILES
       input-double.xml
diff --git a/packages/rol/example/arrayfire/example_01.cpp b/packages/rol/example/arrayfire/example_01.cpp
index 7a8b5f69ed2d..1109c637278d 100644
--- a/packages/rol/example/arrayfire/example_01.cpp
+++ b/packages/rol/example/arrayfire/example_01.cpp
@@ -16,8 +16,8 @@
 #include "ROL_LineSearchStep.hpp"
 #include "ROL_StatusTest.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
+#include "ROL_XMLReader.hpp"
 
 #include "example_01.hpp"
 
@@ -28,7 +28,7 @@ typedef float  ElementT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/example/binary-design/CMakeLists.txt b/packages/rol/example/binary-design/CMakeLists.txt
index 4eeb07afc9df..f86677a2b8ab 100644
--- a/packages/rol/example/binary-design/CMakeLists.txt
+++ b/packages/rol/example/binary-design/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   example_01
   SOURCES example_01.cpp
   ARGS PrintItAll
@@ -8,7 +8,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_COPY_FILES_TO_BINARY_DIR(
+ROL_COPY_FILES_TO_BINARY_DIR(
   BinaryDesignDataCopy
   SOURCE_FILES
     input.xml
diff --git a/packages/rol/example/binary-design/example_01.cpp b/packages/rol/example/binary-design/example_01.cpp
index 8099207a9fc4..29a059760215 100644
--- a/packages/rol/example/binary-design/example_01.cpp
+++ b/packages/rol/example/binary-design/example_01.cpp
@@ -16,8 +16,7 @@
 #include "ROL_StdVector.hpp"
 #include "ROL_Stream.hpp"
 
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 
@@ -149,7 +148,7 @@ class BinaryDesignConstraint : public ROL::Constraint<Real> {
 };
 
 int main(int argc, char *argv[]) {
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
@@ -163,13 +162,13 @@ int main(int argc, char *argv[]) {
   int errorFlag  = 0;
 
   // *** Example body.
- 
+
   try {
 
     // Set up problem data
-    int   dim   = 10; // Set problem dimension. 
-    RealT vol   = 2;  // Set desired volume. 
-    RealT alpha = 1;  // Set quadratic penalty. 
+    int   dim   = 10; // Set problem dimension.
+    RealT vol   = 2;  // Set desired volume.
+    RealT alpha = 1;  // Set quadratic penalty.
     ROL::Ptr<std::vector<RealT> > x_ptr = ROL::makePtr<std::vector<RealT>>(dim, 0.0);
     ROL::Ptr<std::vector<RealT> > g_ptr = ROL::makePtr<std::vector<RealT>>(dim, 0.0);
     ROL::Ptr<std::vector<RealT> > d_ptr = ROL::makePtr<std::vector<RealT>>(dim, 0.0);
diff --git a/packages/rol/example/burgers-control/CMakeLists.txt b/packages/rol/example/burgers-control/CMakeLists.txt
index d1288baf4e21..bd38cc610600 100644
--- a/packages/rol/example/burgers-control/CMakeLists.txt
+++ b/packages/rol/example/burgers-control/CMakeLists.txt
@@ -1,60 +1,60 @@
 TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/teuchos/src/sol)
 
-TRIBITS_ADD_EXECUTABLE(
+ROL_ADD_EXECUTABLE(
   example_01
   SOURCES example_01.cpp
   ADD_DIR_TO_NAME
 )
 
-TRIBITS_ADD_EXECUTABLE(
+ROL_ADD_EXECUTABLE(
   example_02
   SOURCES example_02.cpp
   ADD_DIR_TO_NAME
 )
 
-TRIBITS_ADD_EXECUTABLE(
+ROL_ADD_EXECUTABLE(
   example_03
   SOURCES example_03.cpp
   ADD_DIR_TO_NAME
 )
 
-TRIBITS_ADD_EXECUTABLE(
+ROL_ADD_EXECUTABLE(
   example_04
   SOURCES example_04.cpp
   ADD_DIR_TO_NAME
 )
 
-TRIBITS_ADD_EXECUTABLE(
+ROL_ADD_EXECUTABLE(
   example_05
   SOURCES example_05.cpp
   ADD_DIR_TO_NAME
 )
 
-TRIBITS_ADD_EXECUTABLE(
+ROL_ADD_EXECUTABLE(
   example_06
   SOURCES example_06.cpp
   ADD_DIR_TO_NAME
 )
 
-TRIBITS_ADD_EXECUTABLE(
+ROL_ADD_EXECUTABLE(
   example_07
   SOURCES example_07.cpp
   ADD_DIR_TO_NAME
 )
 
-TRIBITS_ADD_EXECUTABLE(
+ROL_ADD_EXECUTABLE(
   example_08
   SOURCES example_08.cpp
   ADD_DIR_TO_NAME
 )
 
-TRIBITS_ADD_EXECUTABLE(
+ROL_ADD_EXECUTABLE(
   example_10
   SOURCES example_10.cpp
   ADD_DIR_TO_NAME
 )
 
-TRIBITS_COPY_FILES_TO_BINARY_DIR(
+ROL_COPY_FILES_TO_BINARY_DIR(
   BurgersControlDataCopy
   SOURCE_FILES
     input.xml input_ex10.xml
diff --git a/packages/rol/example/burgers-control/example_01.cpp b/packages/rol/example/burgers-control/example_01.cpp
index 0f52c8a2da8f..f52f01eecfd3 100644
--- a/packages/rol/example/burgers-control/example_01.cpp
+++ b/packages/rol/example/burgers-control/example_01.cpp
@@ -16,9 +16,7 @@
 #include "ROL_TypeB_LinMoreAlgorithm.hpp"
 #include "ROL_Bounds.hpp"
 
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
-#include "Teuchos_LAPACK.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 #include <fstream>
@@ -38,7 +36,7 @@ int main(int argc, char *argv[]) {
 
   typedef typename vector::size_type luint;
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/example/burgers-control/example_02.cpp b/packages/rol/example/burgers-control/example_02.cpp
index bf2ee6c7e913..bcec5f5d29f5 100644
--- a/packages/rol/example/burgers-control/example_02.cpp
+++ b/packages/rol/example/burgers-control/example_02.cpp
@@ -18,7 +18,7 @@
 #include "ROL_Reduced_Objective_SimOpt.hpp"
 #include "ROL_Stream.hpp"
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Teuchos_LAPACK.hpp"
 
 #include <iostream>
@@ -30,7 +30,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint = argc - 1;
diff --git a/packages/rol/example/burgers-control/example_03.cpp b/packages/rol/example/burgers-control/example_03.cpp
index 5d6e66265d59..79ef69eaf743 100644
--- a/packages/rol/example/burgers-control/example_03.cpp
+++ b/packages/rol/example/burgers-control/example_03.cpp
@@ -18,7 +18,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint = argc - 1;
diff --git a/packages/rol/example/burgers-control/example_03.hpp b/packages/rol/example/burgers-control/example_03.hpp
index 9e2040925aae..e5ea64c69d55 100644
--- a/packages/rol/example/burgers-control/example_03.hpp
+++ b/packages/rol/example/burgers-control/example_03.hpp
@@ -25,8 +25,7 @@
 #include "ROL_ParameterList.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_LAPACK.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 #include <algorithm>
@@ -346,7 +345,7 @@ class Constraint_BurgersControl : public ROL::Constraint_SimOpt<Real> {
       std::vector<Real> Du(du);
       std::vector<Real> D(d);
       // Perform LDL factorization
-      Teuchos::LAPACK<int,Real> lp;
+      ROL::LAPACK<int,Real> lp;
       std::vector<Real> Du2(nx_-2,0.0);
       std::vector<int> ipiv(nx_,0);
       int info;
diff --git a/packages/rol/example/burgers-control/example_04.cpp b/packages/rol/example/burgers-control/example_04.cpp
index 91ee60684ef3..228d0d4d4992 100644
--- a/packages/rol/example/burgers-control/example_04.cpp
+++ b/packages/rol/example/burgers-control/example_04.cpp
@@ -19,7 +19,7 @@
 #include "ROL_ParameterList.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 #include <algorithm>
@@ -36,7 +36,7 @@ typedef H1VectorPrimal<RealT> DualConstraintVector;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
   ROL::Ptr<std::ostream> outStream;
diff --git a/packages/rol/example/burgers-control/example_04.hpp b/packages/rol/example/burgers-control/example_04.hpp
index 21592216870c..dc145fe5c873 100644
--- a/packages/rol/example/burgers-control/example_04.hpp
+++ b/packages/rol/example/burgers-control/example_04.hpp
@@ -18,7 +18,7 @@
 #include "ROL_Constraint_SimOpt.hpp"
 #include "ROL_Objective_SimOpt.hpp"
 
-#include "Teuchos_LAPACK.hpp"
+#include "ROL_LAPACK.hpp"
 
 template<class Real>
 class L2VectorPrimal;
@@ -78,7 +78,7 @@ class BurgersFEM {
     else {
       u.assign(r.begin(),r.end());
       // Perform LDL factorization
-      Teuchos::LAPACK<int,Real> lp;
+      ROL::LAPACK<int,Real> lp;
       std::vector<Real> du2(r.size()-2,0.0);
       std::vector<int> ipiv(r.size(),0);
       int info;
diff --git a/packages/rol/example/burgers-control/example_05.cpp b/packages/rol/example/burgers-control/example_05.cpp
index dda048acab8b..dd5060d7dffc 100644
--- a/packages/rol/example/burgers-control/example_05.cpp
+++ b/packages/rol/example/burgers-control/example_05.cpp
@@ -23,7 +23,7 @@ Real random(const ROL::Ptr<const Teuchos::Comm<int> > &comm) {
 
 int main(int argc, char* argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
   ROL::Ptr<const Teuchos::Comm<int> > comm
     = ROL::toPtr(Teuchos::DefaultComm<int>::getComm());
 
diff --git a/packages/rol/example/burgers-control/example_05.hpp b/packages/rol/example/burgers-control/example_05.hpp
index d991720c4d3e..7ca3185b7f96 100644
--- a/packages/rol/example/burgers-control/example_05.hpp
+++ b/packages/rol/example/burgers-control/example_05.hpp
@@ -8,8 +8,8 @@
 // @HEADER
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_LAPACK.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_LAPACK.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Teuchos_Comm.hpp"
 #include "Teuchos_DefaultComm.hpp"
 #include "Teuchos_CommHelpers.hpp"
@@ -137,7 +137,7 @@ class Constraint_BurgersControl : public ROL::Constraint_SimOpt<Real> {
               const std::vector<Real> &r, const bool transpose = false) {
     u.assign(r.begin(),r.end());
     // Perform LDL factorization
-    Teuchos::LAPACK<int,Real> lp;
+    ROL::LAPACK<int,Real> lp;
     std::vector<Real> du2(nx_-2,0.0);
     std::vector<int> ipiv(nx_,0);
     int info;
diff --git a/packages/rol/example/burgers-control/example_06.cpp b/packages/rol/example/burgers-control/example_06.cpp
index ed346656625f..0a0b583f6479 100644
--- a/packages/rol/example/burgers-control/example_06.cpp
+++ b/packages/rol/example/burgers-control/example_06.cpp
@@ -25,7 +25,7 @@
 #include "ROL_MonteCarloGenerator.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Teuchos_Comm.hpp"
 #include "Teuchos_DefaultComm.hpp"
 #include "Teuchos_CommHelpers.hpp"
@@ -45,7 +45,7 @@ typedef H1VectorPrimal<RealT> DualConstraintVector;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   auto comm = ROL::toPtr(Teuchos::DefaultComm<int>::getComm());
 
diff --git a/packages/rol/example/burgers-control/example_06.hpp b/packages/rol/example/burgers-control/example_06.hpp
index 0e1764cbcd18..3d421991fc7b 100644
--- a/packages/rol/example/burgers-control/example_06.hpp
+++ b/packages/rol/example/burgers-control/example_06.hpp
@@ -19,7 +19,7 @@
 #include "ROL_Objective_SimOpt.hpp"
 #include "ROL_TeuchosBatchManager.hpp"
 
-#include "Teuchos_LAPACK.hpp"
+#include "ROL_LAPACK.hpp"
 
 template<class Real>
 class L2VectorPrimal;
@@ -79,7 +79,7 @@ class BurgersFEM {
     else {
       u.assign(r.begin(),r.end());
       // Perform LDL factorization
-      Teuchos::LAPACK<int,Real> lp;
+      ROL::LAPACK<int,Real> lp;
       std::vector<Real> du2(r.size()-2,0.0);
       std::vector<int> ipiv(r.size(),0);
       int info;
diff --git a/packages/rol/example/burgers-control/example_07.cpp b/packages/rol/example/burgers-control/example_07.cpp
index 8055082d4e4f..4aca28aaddc0 100644
--- a/packages/rol/example/burgers-control/example_07.cpp
+++ b/packages/rol/example/burgers-control/example_07.cpp
@@ -18,7 +18,7 @@
 #include "ROL_ParameterList.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Teuchos_Comm.hpp"
 #include "Teuchos_DefaultComm.hpp"
 #include "Teuchos_CommHelpers.hpp"
@@ -39,7 +39,7 @@ typedef H1VectorPrimal<RealT> DualConstraintVector;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
   ROL::Ptr<const Teuchos::Comm<int>> comm
     = ROL::toPtr(Teuchos::DefaultComm<int>::getComm());
 
diff --git a/packages/rol/example/burgers-control/example_07.hpp b/packages/rol/example/burgers-control/example_07.hpp
index 11d007e7c118..c8145a09d10e 100644
--- a/packages/rol/example/burgers-control/example_07.hpp
+++ b/packages/rol/example/burgers-control/example_07.hpp
@@ -19,7 +19,7 @@
 #include "ROL_Objective_SimOpt.hpp"
 #include "ROL_TeuchosBatchManager.hpp"
 
-#include "Teuchos_LAPACK.hpp"
+#include "ROL_LAPACK.hpp"
 
 template<class Real>
 class L2VectorPrimal;
@@ -79,7 +79,7 @@ class BurgersFEM {
     else {
       u.assign(r.begin(),r.end());
       // Perform LDL factorization
-      Teuchos::LAPACK<int,Real> lp;
+      ROL::LAPACK<int,Real> lp;
       std::vector<Real> du2(r.size()-2,0.0);
       std::vector<int> ipiv(r.size(),0);
       int info;
diff --git a/packages/rol/example/burgers-control/example_08.cpp b/packages/rol/example/burgers-control/example_08.cpp
index e9a6d9356455..9275fa3b365f 100644
--- a/packages/rol/example/burgers-control/example_08.cpp
+++ b/packages/rol/example/burgers-control/example_08.cpp
@@ -20,7 +20,7 @@
 #include "ROL_ParameterList.hpp"
 #include "ROL_Stream.hpp"
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Teuchos_Comm.hpp"
 #include "Teuchos_DefaultComm.hpp"
 #include "Teuchos_CommHelpers.hpp"
@@ -41,7 +41,7 @@ typedef H1VectorPrimal<RealT> DualConstraintVector;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
   ROL::Ptr<const Teuchos::Comm<int> > comm
     = ROL::toPtr(Teuchos::DefaultComm<int>::getComm());
 
diff --git a/packages/rol/example/burgers-control/example_08.hpp b/packages/rol/example/burgers-control/example_08.hpp
index 792c9ea9b128..406c88c2efff 100644
--- a/packages/rol/example/burgers-control/example_08.hpp
+++ b/packages/rol/example/burgers-control/example_08.hpp
@@ -19,7 +19,7 @@
 #include "ROL_Objective_SimOpt.hpp"
 #include "ROL_TeuchosBatchManager.hpp"
 
-#include "Teuchos_LAPACK.hpp"
+#include "ROL_LAPACK.hpp"
 
 template<class Real>
 class L2VectorPrimal;
@@ -79,7 +79,7 @@ class BurgersFEM {
     else {
       u.assign(r.begin(),r.end());
       // Perform LDL factorization
-      Teuchos::LAPACK<int,Real> lp;
+      ROL::LAPACK<int,Real> lp;
       std::vector<Real> du2(r.size()-2,0.0);
       std::vector<int> ipiv(r.size(),0);
       int info;
diff --git a/packages/rol/example/burgers-control/example_10.cpp b/packages/rol/example/burgers-control/example_10.cpp
index cf3ca60b6936..0873656c34ff 100644
--- a/packages/rol/example/burgers-control/example_10.cpp
+++ b/packages/rol/example/burgers-control/example_10.cpp
@@ -13,7 +13,7 @@ typedef double RealT;
 
 int main(int argc, char* argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
   ROL::Ptr<const Teuchos::Comm<int>> comm
     = ROL::toPtr(Teuchos::DefaultComm<int>::getComm());
 
diff --git a/packages/rol/example/burgers-control/example_10.hpp b/packages/rol/example/burgers-control/example_10.hpp
index 56bb360df432..041d39cfd9b1 100644
--- a/packages/rol/example/burgers-control/example_10.hpp
+++ b/packages/rol/example/burgers-control/example_10.hpp
@@ -7,8 +7,8 @@
 // *****************************************************************************
 // @HEADER
 
-#include "Teuchos_LAPACK.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_LAPACK.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Teuchos_Comm.hpp"
 #include "Teuchos_DefaultComm.hpp"
 #include "Teuchos_CommHelpers.hpp"
@@ -139,7 +139,7 @@ class Constraint_BurgersControl : public ROL::Constraint_SimOpt<Real> {
               const std::vector<Real> &r, const bool transpose = false) {
     u.assign(r.begin(),r.end());
     // Perform LDL factorization
-    Teuchos::LAPACK<int,Real> lp;
+    ROL::LAPACK<int,Real> lp;
     std::vector<Real> du2(nx_-2,0.0);
     std::vector<int> ipiv(nx_,0);
     int info;
diff --git a/packages/rol/example/complex/CMakeLists.txt b/packages/rol/example/complex/CMakeLists.txt
index 4567ee5814d5..0cd3b1e081dd 100644
--- a/packages/rol/example/complex/CMakeLists.txt
+++ b/packages/rol/example/complex/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   example_01
   SOURCES example_01.cpp
   ARGS PrintItAll
@@ -8,7 +8,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-#TRIBITS_ADD_EXECUTABLE_AND_TEST(
+#ROL_ADD_EXECUTABLE_AND_TEST(
 #  example_02
 #  SOURCES example_02.cpp
 #  ARGS PrintItAll
diff --git a/packages/rol/example/complex/example_01.cpp b/packages/rol/example/complex/example_01.cpp
index 0d8f68479a42..59018049c112 100644
--- a/packages/rol/example/complex/example_01.cpp
+++ b/packages/rol/example/complex/example_01.cpp
@@ -19,7 +19,7 @@
 
 #include <cmath>
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "ROL_OptimizationSolver.hpp"
 #include "ROL_ComplexStdVector.hpp"
@@ -33,7 +33,7 @@ int main(int argc, char *argv[]) {
   using namespace  ROL;
   using RealT     = double;              
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/example/complex/example_02.cpp b/packages/rol/example/complex/example_02.cpp
index d41800ee9ae4..b051dbee5681 100644
--- a/packages/rol/example/complex/example_02.cpp
+++ b/packages/rol/example/complex/example_02.cpp
@@ -17,7 +17,7 @@
 
 #include <cmath>
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "ROL_LinearConstraint.hpp"
 #include "ROL_OptimizationSolver.hpp"
@@ -34,7 +34,7 @@ int main(int argc, char *argv[]) {
   using namespace  ROL;
   using RealT     = double;              
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/example/dense-hessian/CMakeLists.txt b/packages/rol/example/dense-hessian/CMakeLists.txt
index 3c3a6e1ad8ff..2bc93f03016a 100644
--- a/packages/rol/example/dense-hessian/CMakeLists.txt
+++ b/packages/rol/example/dense-hessian/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   DenseHessian
   SOURCES example_01.cpp
   ARGS PrintItAll
diff --git a/packages/rol/example/dense-hessian/example_01.cpp b/packages/rol/example/dense-hessian/example_01.cpp
index f85a31115e0a..f8d91b6c6c6c 100644
--- a/packages/rol/example/dense-hessian/example_01.cpp
+++ b/packages/rol/example/dense-hessian/example_01.cpp
@@ -20,7 +20,7 @@
 #include "ROL_ScaledStdVector.hpp"
 #include "ROL_Stream.hpp"
 #include "ROL_HelperFunctions.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 
@@ -28,7 +28,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/example/diode-circuit-simopt/CMakeLists.txt b/packages/rol/example/diode-circuit-simopt/CMakeLists.txt
index 6b0e9fce4de4..348a1a69c7c2 100644
--- a/packages/rol/example/diode-circuit-simopt/CMakeLists.txt
+++ b/packages/rol/example/diode-circuit-simopt/CMakeLists.txt
@@ -1,4 +1,4 @@
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   example_01
   SOURCES example_01.cpp
   ARGS PrintItAll
@@ -7,7 +7,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
 )
 
-TRIBITS_COPY_FILES_TO_BINARY_DIR(
+ROL_COPY_FILES_TO_BINARY_DIR(
   DiodeCircuitSimOptDataCopy
   SOURCE_FILES
     input.xml
diff --git a/packages/rol/example/diode-circuit-simopt/example_01.cpp b/packages/rol/example/diode-circuit-simopt/example_01.cpp
index f59a64f4a24f..a8026444e3c2 100644
--- a/packages/rol/example/diode-circuit-simopt/example_01.cpp
+++ b/packages/rol/example/diode-circuit-simopt/example_01.cpp
@@ -21,7 +21,7 @@
 #include "ROL_ParameterList.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 #include <fstream>
@@ -31,7 +31,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/example/diode-circuit-simopt/example_01.hpp b/packages/rol/example/diode-circuit-simopt/example_01.hpp
index 8c660f45caf8..631cb2b8afdd 100644
--- a/packages/rol/example/diode-circuit-simopt/example_01.hpp
+++ b/packages/rol/example/diode-circuit-simopt/example_01.hpp
@@ -7,9 +7,7 @@
 #include "ROL_ConstraintStatusTest.hpp"
 #include "ROL_Types.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
-#include "Teuchos_LAPACK.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 #include <algorithm>
diff --git a/packages/rol/example/diode-circuit/CMakeLists.txt b/packages/rol/example/diode-circuit/CMakeLists.txt
index 89625ee04510..4286bd5b6a6e 100644
--- a/packages/rol/example/diode-circuit/CMakeLists.txt
+++ b/packages/rol/example/diode-circuit/CMakeLists.txt
@@ -1,11 +1,11 @@
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   example_01
   SOURCES example_01.cpp
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_COPY_FILES_TO_BINARY_DIR(
+ROL_COPY_FILES_TO_BINARY_DIR(
   DiodeCircuitDataCopy
   SOURCE_FILES
     input.xml
diff --git a/packages/rol/example/diode-circuit/example_01.cpp b/packages/rol/example/diode-circuit/example_01.cpp
index 710e58db4db9..6e6c1268b760 100644
--- a/packages/rol/example/diode-circuit/example_01.cpp
+++ b/packages/rol/example/diode-circuit/example_01.cpp
@@ -19,7 +19,7 @@
 #include "ROL_ParameterList.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <string>
 #include <iostream>
@@ -30,7 +30,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/example/dual-spaces/rosenbrock-1/CMakeLists.txt b/packages/rol/example/dual-spaces/rosenbrock-1/CMakeLists.txt
index 9c4ff4ded014..081687308a21 100644
--- a/packages/rol/example/dual-spaces/rosenbrock-1/CMakeLists.txt
+++ b/packages/rol/example/dual-spaces/rosenbrock-1/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   example_01
   SOURCES example_01.cpp
   ARGS PrintItAll
@@ -8,7 +8,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   example_02
   SOURCES example_02.cpp
   ARGS PrintItAll
diff --git a/packages/rol/example/dual-spaces/rosenbrock-1/example_01.cpp b/packages/rol/example/dual-spaces/rosenbrock-1/example_01.cpp
index d320dda757e7..fd1d67ee872f 100644
--- a/packages/rol/example/dual-spaces/rosenbrock-1/example_01.cpp
+++ b/packages/rol/example/dual-spaces/rosenbrock-1/example_01.cpp
@@ -16,7 +16,7 @@
 #include "ROL_Rosenbrock.hpp"
 #include "ROL_Solver.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include <iostream>
 
 typedef double RealT;
@@ -230,7 +230,7 @@ Real apply( const ROL::Vector<Real> &x ) const {
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/example/dual-spaces/rosenbrock-1/example_02.cpp b/packages/rol/example/dual-spaces/rosenbrock-1/example_02.cpp
index 71d7eaf302a3..1df14d77088d 100644
--- a/packages/rol/example/dual-spaces/rosenbrock-1/example_02.cpp
+++ b/packages/rol/example/dual-spaces/rosenbrock-1/example_02.cpp
@@ -17,7 +17,7 @@
 #include "ROL_Solver.hpp"
 #include "ROL_Bounds.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include <iostream>
 
 typedef double RealT;
@@ -273,7 +273,7 @@ Real reduce( const ROL::Elementwise::ReductionOp<Real> &r ) const {
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/example/dual-spaces/simple-eq-constr-1/CMakeLists.txt b/packages/rol/example/dual-spaces/simple-eq-constr-1/CMakeLists.txt
index a55542121ea9..e44048ab9bdb 100644
--- a/packages/rol/example/dual-spaces/simple-eq-constr-1/CMakeLists.txt
+++ b/packages/rol/example/dual-spaces/simple-eq-constr-1/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   example_01
   SOURCES example_01.cpp
   ARGS PrintItAll
diff --git a/packages/rol/example/dual-spaces/simple-eq-constr-1/example_01.cpp b/packages/rol/example/dual-spaces/simple-eq-constr-1/example_01.cpp
index ff02259613a1..905a53455481 100644
--- a/packages/rol/example/dual-spaces/simple-eq-constr-1/example_01.cpp
+++ b/packages/rol/example/dual-spaces/simple-eq-constr-1/example_01.cpp
@@ -15,7 +15,7 @@
 #include "ROL_SimpleEqConstrained.hpp"
 #include "ROL_Solver.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 //#include <fenv.h>
@@ -414,7 +414,7 @@ int main(int argc, char *argv[]) {
   typedef std::vector<RealT> vector;
   typedef vector::size_type  luint;
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/example/dual-spaces/simple-eq-constr-2/CMakeLists.txt b/packages/rol/example/dual-spaces/simple-eq-constr-2/CMakeLists.txt
index a55542121ea9..e44048ab9bdb 100644
--- a/packages/rol/example/dual-spaces/simple-eq-constr-2/CMakeLists.txt
+++ b/packages/rol/example/dual-spaces/simple-eq-constr-2/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   example_01
   SOURCES example_01.cpp
   ARGS PrintItAll
diff --git a/packages/rol/example/dual-spaces/simple-eq-constr-2/example_01.cpp b/packages/rol/example/dual-spaces/simple-eq-constr-2/example_01.cpp
index 959217bea20a..b95955a3900f 100644
--- a/packages/rol/example/dual-spaces/simple-eq-constr-2/example_01.cpp
+++ b/packages/rol/example/dual-spaces/simple-eq-constr-2/example_01.cpp
@@ -17,7 +17,7 @@
 #include "ROL_ConstraintStatusTest.hpp"
 #include "ROL_CompositeStep.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 
@@ -414,7 +414,7 @@ int main(int argc, char *argv[]) {
 
     
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/example/gross-pitaevskii/CMakeLists.txt b/packages/rol/example/gross-pitaevskii/CMakeLists.txt
index e64c62af03a3..e3b45467ca45 100644
--- a/packages/rol/example/gross-pitaevskii/CMakeLists.txt
+++ b/packages/rol/example/gross-pitaevskii/CMakeLists.txt
@@ -1,6 +1,6 @@
 
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
     example_01
     SOURCES example_01.cpp
     ARGS PrintItAll
@@ -9,7 +9,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
     ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
     example_02
     SOURCES example_02.cpp 
     ARGS "default" "exactsolve" 
@@ -19,7 +19,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   )
 
 
-  TRIBITS_COPY_FILES_TO_BINARY_DIR(
+  ROL_COPY_FILES_TO_BINARY_DIR(
     GrossPitaevskiiDataCopy
     SOURCE_FILES
      parameters.xml
diff --git a/packages/rol/example/gross-pitaevskii/example_01.cpp b/packages/rol/example/gross-pitaevskii/example_01.cpp
index 4707723c7538..f54637138f31 100644
--- a/packages/rol/example/gross-pitaevskii/example_01.cpp
+++ b/packages/rol/example/gross-pitaevskii/example_01.cpp
@@ -42,7 +42,7 @@ typedef double RealT;
 int main(int argc, char **argv) {
 
     // Set up MPI
-    Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+    ROL::GlobalMPISession mpiSession(&argc, &argv);
 
     // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
     int iprint     = argc - 1;
diff --git a/packages/rol/example/gross-pitaevskii/example_01.hpp b/packages/rol/example/gross-pitaevskii/example_01.hpp
index f96864a60365..6f41d38b4234 100644
--- a/packages/rol/example/gross-pitaevskii/example_01.hpp
+++ b/packages/rol/example/gross-pitaevskii/example_01.hpp
@@ -36,7 +36,7 @@
 #include <iostream>
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "ROL_ParameterList.hpp"
 #include "ROL_StdVector.hpp"
@@ -111,7 +111,7 @@ class Objective_GrossPitaevskii : public Objective<Real> {
 
   public: 
 
-  Objective_GrossPitaevskii(const Real &g, const Vector<Real> &pV) : g_(g),  
+  Objective_GrossPitaevskii(const Real &g, const Vector<Real> &pV) : g_(g),
     Vp_(getVector(pV))  {
     nx_ = Vp_->size(); 
     dx_ = (1.0/(1.0+nx_));
diff --git a/packages/rol/example/gross-pitaevskii/example_02.cpp b/packages/rol/example/gross-pitaevskii/example_02.cpp
index b4a7da2d6f98..6a6c696d096a 100644
--- a/packages/rol/example/gross-pitaevskii/example_02.cpp
+++ b/packages/rol/example/gross-pitaevskii/example_02.cpp
@@ -57,7 +57,7 @@ int main(int argc, char **argv) {
 
     
     // Set up MPI
-    Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+    ROL::GlobalMPISession mpiSession(&argc, &argv);
 
     // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
     int iprint     = argc - 1;
diff --git a/packages/rol/example/gross-pitaevskii/example_02.hpp b/packages/rol/example/gross-pitaevskii/example_02.hpp
index 007e9fb18712..bdf06e9a4d30 100644
--- a/packages/rol/example/gross-pitaevskii/example_02.hpp
+++ b/packages/rol/example/gross-pitaevskii/example_02.hpp
@@ -48,7 +48,7 @@
 #include <iostream>
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "ROL_ParameterList.hpp"
 #include "ROL_StdVector.hpp"
diff --git a/packages/rol/example/gross-pitaevskii/numerics/FiniteDifference.hpp b/packages/rol/example/gross-pitaevskii/numerics/FiniteDifference.hpp
index 25197527f7a8..223f3ecfeb69 100644
--- a/packages/rol/example/gross-pitaevskii/numerics/FiniteDifference.hpp
+++ b/packages/rol/example/gross-pitaevskii/numerics/FiniteDifference.hpp
@@ -8,7 +8,7 @@
 // @HEADER
 
 #include "ROL_StdVector.hpp"
-#include "Teuchos_LAPACK.hpp"
+#include "ROL_LAPACK.hpp"
 
 using namespace ROL;
 
@@ -18,7 +18,7 @@ class FiniteDifference {
     private:
         int n_;
         double dx2_;
-        Teuchos::LAPACK<int,Real> lapack_;
+        ROL::LAPACK<int,Real> lapack_;
 
         // subdiagonal is -1/dx^2 
         std::vector<Real> dl_;
diff --git a/packages/rol/example/intrepid/CMakeLists.txt b/packages/rol/example/intrepid/CMakeLists.txt
index f9642fb76f41..3d1ae2ff755d 100644
--- a/packages/rol/example/intrepid/CMakeLists.txt
+++ b/packages/rol/example/intrepid/CMakeLists.txt
@@ -2,7 +2,7 @@
 IF(${PROJECT_NAME}_ENABLE_IntrepidCore AND 
    ${PROJECT_NAME}_ENABLE_Shards AND
    ${PROJECT_NAME}_ENABLE_Sacado )
-  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  ROL_ADD_EXECUTABLE_AND_TEST(
     example_01
     SOURCES example_01.cpp
     ARGS PrintItAll
diff --git a/packages/rol/example/intrepid/example_01.cpp b/packages/rol/example/intrepid/example_01.cpp
index 15de9d1b8a48..f024783f7b8d 100644
--- a/packages/rol/example/intrepid/example_01.cpp
+++ b/packages/rol/example/intrepid/example_01.cpp
@@ -38,7 +38,7 @@ int main(int argc, char *argv[]) {
   
   
      
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/example/json/CMakeLists.txt b/packages/rol/example/json/CMakeLists.txt
index 46b4996982dc..884a1ed0bd04 100644
--- a/packages/rol/example/json/CMakeLists.txt
+++ b/packages/rol/example/json/CMakeLists.txt
@@ -14,7 +14,7 @@ IF(TPL_ENABLE_JSONCPP)
 
     tribits_include_directories(${JSONCPP_INCLUDE_DIRS})
 
-    TRIBITS_ADD_EXECUTABLE_AND_TEST(
+    ROL_ADD_EXECUTABLE_AND_TEST(
         example_01
         SOURCES example_01.cpp 
         IMPORTEDLIBS jsoncpp
@@ -24,7 +24,7 @@ IF(TPL_ENABLE_JSONCPP)
         ADD_DIR_TO_NAME
     )
 
-    TRIBITS_COPY_FILES_TO_BINARY_DIR(
+    ROL_COPY_FILES_TO_BINARY_DIR(
 	JsonDataCopy
 	SOURCE_FILES
 	 parameters.json
diff --git a/packages/rol/example/json/example_01.cpp b/packages/rol/example/json/example_01.cpp
index 5dac07ac2b61..f59ba116de47 100644
--- a/packages/rol/example/json/example_01.cpp
+++ b/packages/rol/example/json/example_01.cpp
@@ -32,14 +32,14 @@
 #include "example_01.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
+#include "ROL_XMLReader.hpp"
 
 typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/example/l1-penalty/CMakeLists.txt b/packages/rol/example/l1-penalty/CMakeLists.txt
index e661372401ea..e3adb2536606 100644
--- a/packages/rol/example/l1-penalty/CMakeLists.txt
+++ b/packages/rol/example/l1-penalty/CMakeLists.txt
@@ -1,7 +1,7 @@
 tribits_include_directories(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/teuchos/src/vector)
 tribits_include_directories(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/teuchos/src/function)
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   example_01
   SOURCES example_01.cpp
   ARGS PrintItAll
@@ -10,7 +10,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
 )
 
-TRIBITS_COPY_FILES_TO_BINARY_DIR(
+ROL_COPY_FILES_TO_BINARY_DIR(
   L1PenaltyDataCopy
   SOURCE_FILES
     input.xml
diff --git a/packages/rol/example/l1-penalty/example_01.cpp b/packages/rol/example/l1-penalty/example_01.cpp
index 0ae3094e5716..edea19fd481c 100644
--- a/packages/rol/example/l1-penalty/example_01.cpp
+++ b/packages/rol/example/l1-penalty/example_01.cpp
@@ -7,7 +7,7 @@
 // *****************************************************************************
 // @HEADER
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "compressed_sensing.hpp"
 #include "ROL_OptimizationSolver.hpp"
 #include <iostream>
@@ -25,7 +25,7 @@ using TeuchosVectorT = ROL::TeuchosVector<OrdinalT,RealT>;
 
 int main( int argc, char* argv[] ) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   ROL::Ptr<std::ostream> os_ptr;
   ROL::nullstream bhs;
diff --git a/packages/rol/example/l1-penalty/l1_penalty_objective.hpp b/packages/rol/example/l1-penalty/l1_penalty_objective.hpp
index a82610ea090b..047a2245c494 100644
--- a/packages/rol/example/l1-penalty/l1_penalty_objective.hpp
+++ b/packages/rol/example/l1-penalty/l1_penalty_objective.hpp
@@ -23,7 +23,7 @@
    min_{x,r,s} f(x) + gamma * sum(r + s) 
    subject to  c(x,r,s) = g(x) - (r - s) = 0          
                r,s >= 0
-Teuchos::GlobalMPISession::GlobalMPISession
+ROL::GlobalMPISession::GlobalMPISession
 */
 
 namespace ROL {
diff --git a/packages/rol/example/lincon-test/CMakeLists.txt b/packages/rol/example/lincon-test/CMakeLists.txt
index a55542121ea9..e44048ab9bdb 100644
--- a/packages/rol/example/lincon-test/CMakeLists.txt
+++ b/packages/rol/example/lincon-test/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   example_01
   SOURCES example_01.cpp
   ARGS PrintItAll
diff --git a/packages/rol/example/lincon-test/example_01.cpp b/packages/rol/example/lincon-test/example_01.cpp
index 6dc3deb995eb..f2bc18f6fbb7 100644
--- a/packages/rol/example/lincon-test/example_01.cpp
+++ b/packages/rol/example/lincon-test/example_01.cpp
@@ -20,6 +20,7 @@
 #include "ROL_StdObjective.hpp"
 #include "ROL_StdConstraint.hpp"
 #include "ROL_Bounds.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <cmath>
 #include <iostream>
@@ -104,7 +105,7 @@ class ROL_LinearConstraint : public ROL::StdConstraint<Real> {
 typedef double RealT;
 
 int main(int argc, char *argv[]) {
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/example/minimax/CMakeLists.txt b/packages/rol/example/minimax/CMakeLists.txt
index 4fecc9385bab..9efac56e9d79 100644
--- a/packages/rol/example/minimax/CMakeLists.txt
+++ b/packages/rol/example/minimax/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   example_01
   SOURCES example_01.cpp
   ARGS PrintItAll
@@ -8,7 +8,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
 )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   example_02
   SOURCES example_02.cpp
   ARGS PrintItAll
@@ -17,7 +17,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
 )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   example_03
   SOURCES example_03.cpp
   ARGS PrintItAll
@@ -26,7 +26,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
 )
 
-TRIBITS_COPY_FILES_TO_BINARY_DIR(
+ROL_COPY_FILES_TO_BINARY_DIR(
   MinimaxDataCopy
   SOURCE_FILES
     input.xml
diff --git a/packages/rol/example/minimax/example_01.cpp b/packages/rol/example/minimax/example_01.cpp
index 3850dc0690d9..71437f71dd63 100644
--- a/packages/rol/example/minimax/example_01.cpp
+++ b/packages/rol/example/minimax/example_01.cpp
@@ -19,9 +19,7 @@
 #include "ROL_Minimax1.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
-#include "Teuchos_LAPACK.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 #include <algorithm>
@@ -30,7 +28,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint = argc - 1;
@@ -70,7 +68,7 @@ int main(int argc, char *argv[]) {
     // Run algorithm.
     algo.run(x, obj, true, *outStream);
 
-    // Compute error 
+    // Compute error
     ROL::Ptr<ROL::Vector<RealT>> diff = x.clone();
     diff->set(x);
     diff->axpy(-1.0,z);
diff --git a/packages/rol/example/minimax/example_02.cpp b/packages/rol/example/minimax/example_02.cpp
index 5579e4f9e1ae..7c70e9f51f5b 100644
--- a/packages/rol/example/minimax/example_02.cpp
+++ b/packages/rol/example/minimax/example_02.cpp
@@ -19,9 +19,7 @@
 #include "ROL_Minimax2.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
-#include "Teuchos_LAPACK.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 #include <algorithm>
@@ -30,7 +28,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint = argc - 1;
@@ -71,7 +69,7 @@ int main(int argc, char *argv[]) {
     // Run algorithm.
     algo.run(x, obj, true, *outStream);
 
-    // Compute error 
+    // Compute error
     ROL::Ptr<ROL::Vector<RealT>> diff = x.clone();
     diff->set(x);
     diff->axpy(-1.0,z);
diff --git a/packages/rol/example/minimax/example_03.cpp b/packages/rol/example/minimax/example_03.cpp
index 7357a2687963..b72aefebb198 100644
--- a/packages/rol/example/minimax/example_03.cpp
+++ b/packages/rol/example/minimax/example_03.cpp
@@ -19,9 +19,7 @@
 #include "ROL_Minimax3.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
-#include "Teuchos_LAPACK.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 #include <algorithm>
@@ -30,7 +28,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint = argc - 1;
@@ -72,7 +70,7 @@ int main(int argc, char *argv[]) {
     // Run algorithm.
     algo.run(x, obj, true, *outStream);
 
-    // Compute error 
+    // Compute error
     ROL::Ptr<ROL::Vector<RealT>> diff = x.clone();
     diff->set(x);
     diff->axpy(-1.0,z);
diff --git a/packages/rol/example/oed/CMakeLists.txt b/packages/rol/example/oed/CMakeLists.txt
index 776f28715f5c..d4a825c7af6f 100644
--- a/packages/rol/example/oed/CMakeLists.txt
+++ b/packages/rol/example/oed/CMakeLists.txt
@@ -1,11 +1,11 @@
 
-TRIBITS_ADD_EXECUTABLE(
+ROL_ADD_EXECUTABLE(
   PolynomialRegression1D
   SOURCES example_01.cpp
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_COPY_FILES_TO_BINARY_DIR(
+ROL_COPY_FILES_TO_BINARY_DIR(
   OEDExampleDataCopy
   SOURCE_FILES
     input.xml plotResults.m pointsGL.txt weightsGL.txt
diff --git a/packages/rol/example/oed/example_01.cpp b/packages/rol/example/oed/example_01.cpp
index 93b600d543ca..c315fa2390c6 100644
--- a/packages/rol/example/oed/example_01.cpp
+++ b/packages/rol/example/oed/example_01.cpp
@@ -20,7 +20,7 @@
 #include "ROL_OED_Factory.hpp"
 #include "ROL_OED_StdMomentOperator.hpp"
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 
@@ -94,7 +94,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
@@ -175,7 +175,9 @@ int main(int argc, char *argv[]) {
     ROL::Ptr<ROL::Problem<RealT>> problem = factory->get(*parlist,sampler);
     problem->setProjectionAlgorithm(*parlist);
     problem->finalize(false,true,*outStream);
-    problem->check(true,*outStream);
+    ROL::Ptr<ROL::Vector<RealT>> test = factory->getDesign()->clone();
+    test->randomize(1,2);
+    problem->check(true,*outStream,test,0.1);
 
     // Setup ROL solver
     std::clock_t timer = std::clock();
diff --git a/packages/rol/example/parabolic-control/CMakeLists.txt b/packages/rol/example/parabolic-control/CMakeLists.txt
index 39027ca21f5a..553819cde797 100644
--- a/packages/rol/example/parabolic-control/CMakeLists.txt
+++ b/packages/rol/example/parabolic-control/CMakeLists.txt
@@ -1,41 +1,41 @@
 
-TRIBITS_ADD_EXECUTABLE(
+ROL_ADD_EXECUTABLE(
   example_01
   SOURCES example_01.cpp 
   ADD_DIR_TO_NAME
 )
 
-TRIBITS_ADD_EXECUTABLE(
+ROL_ADD_EXECUTABLE(
   example_02
   SOURCES example_02.cpp 
   ADD_DIR_TO_NAME
 )
 
-TRIBITS_ADD_EXECUTABLE(
+ROL_ADD_EXECUTABLE(
   example_03
   SOURCES example_03.cpp 
   ADD_DIR_TO_NAME
 )
 
-TRIBITS_ADD_EXECUTABLE(
+ROL_ADD_EXECUTABLE(
   example_04
   SOURCES example_04.cpp 
   ADD_DIR_TO_NAME
 )
 
-TRIBITS_ADD_EXECUTABLE(
+ROL_ADD_EXECUTABLE(
   example_05
   SOURCES example_05.cpp 
   ADD_DIR_TO_NAME
 )
 
-TRIBITS_ADD_EXECUTABLE(
+ROL_ADD_EXECUTABLE(
   example_06
   SOURCES example_06.cpp 
   ADD_DIR_TO_NAME
 )
 
-TRIBITS_COPY_FILES_TO_BINARY_DIR(
+ROL_COPY_FILES_TO_BINARY_DIR(
   ParabolicControlDataCopy
   SOURCE_FILES
     input.xml
diff --git a/packages/rol/example/parabolic-control/example_01.cpp b/packages/rol/example/parabolic-control/example_01.cpp
index 2193ac51604f..9e18a91aee4c 100644
--- a/packages/rol/example/parabolic-control/example_01.cpp
+++ b/packages/rol/example/parabolic-control/example_01.cpp
@@ -8,7 +8,7 @@
 // @HEADER
 
 /*! \file  example_02.cpp
-    \brief Shows how to solve a linear-quadratic parabolic control problem 
+    \brief Shows how to solve a linear-quadratic parabolic control problem
            with bound constraints.
 */
 
@@ -18,9 +18,8 @@
 #include "ROL_StatusTest.hpp"
 #include "ROL_Types.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
-#include "Teuchos_LAPACK.hpp"
+#include "ROL_GlobalMPISession.hpp"
+#include "ROL_LAPACK.hpp"
 
 #include <iostream>
 #include <fstream>
@@ -49,18 +48,18 @@ class Objective_PoissonControl : public ROL::Objective<Real> {
   Real dt_;
 
   ROL::Ptr<const vector> getVector( const V& x ) {
-    
-    return dynamic_cast<const SV&>(x).getVector(); 
+
+    return dynamic_cast<const SV&>(x).getVector();
   }
 
   ROL::Ptr<vector> getVector( V& x ) {
-    
-    return dynamic_cast<SV&>(x).getVector();  
+
+    return dynamic_cast<SV&>(x).getVector();
   }
 
 public:
 
-  Objective_PoissonControl(std::vector<Real> &u0, Real alpha = 1.e-4, uint nx = 128, uint nt = 100, Real T = 1) 
+  Objective_PoissonControl(std::vector<Real> &u0, Real alpha = 1.e-4, uint nx = 128, uint nt = 100, Real T = 1)
     : u0_(u0), alpha_(alpha), nx_(nx), nt_(nt), T_(T) {
     dx_ = 1.0/((Real)nx-1.0);
     dt_ = T/((Real)nt-1);
@@ -88,7 +87,7 @@ class Objective_PoissonControl : public ROL::Objective<Real> {
     d[nx_-1] = dx_/3.0 + dt_/dx_;
     std::vector<Real> o(nx_-1,dx_/6.0 - dt_/dx_);
     // Perform LDL factorization
-    Teuchos::LAPACK<int,Real> lp;
+    ROL::LAPACK<int,Real> lp;
     int info;
     int ldb  = nx_;
     int nhrs = 1;
@@ -117,7 +116,7 @@ class Objective_PoissonControl : public ROL::Objective<Real> {
     d[nx_-1] = dx_/3.0 + dt_/dx_;
     std::vector<Real> o(nx_-1,dx_/6.0 - dt_/dx_);
     // Perform LDL factorization
-    Teuchos::LAPACK<int,Real> lp;
+    ROL::LAPACK<int,Real> lp;
     int info;
     int ldb  = nx_;
     int nhrs = 1;
@@ -153,7 +152,7 @@ class Objective_PoissonControl : public ROL::Objective<Real> {
     d[nx_-1] = dx_/3.0 + dt_/dx_;
     std::vector<Real> o(nx_-1,dx_/6.0 - dt_/dx_);
     // Perform LDL factorization
-    Teuchos::LAPACK<int,Real> lp;
+    ROL::LAPACK<int,Real> lp;
     int info;
     int ldb  = nx_;
     int nhrs = 1;
@@ -187,7 +186,7 @@ class Objective_PoissonControl : public ROL::Objective<Real> {
     d[nx_-1] = dx_/3.0 + dt_/dx_;
     std::vector<Real> o(nx_-1,dx_/6.0 - dt_/dx_);
     // Perform LDL factorization
-    Teuchos::LAPACK<int,Real> lp;
+    ROL::LAPACK<int,Real> lp;
     int info;
     int ldb  = nx_;
     int nhrs = 1;
@@ -225,7 +224,7 @@ class Objective_PoissonControl : public ROL::Objective<Real> {
 
   Real value( const ROL::Vector<Real> &z, Real &tol ) {
 
-    
+
     ROL::Ptr<const vector> zp = getVector(z);
 
     // SOLVE STATE EQUATION
@@ -266,7 +265,7 @@ class Objective_PoissonControl : public ROL::Objective<Real> {
 
   void gradient( ROL::Vector<Real> &g, const ROL::Vector<Real> &z, Real &tol ) {
 
-    
+
     ROL::Ptr<const vector> zp = getVector(z);
     ROL::Ptr<vector> gp = getVector(g);
 
@@ -285,7 +284,7 @@ class Objective_PoissonControl : public ROL::Objective<Real> {
 
   void hessVec( ROL::Vector<Real> &hv, const ROL::Vector<Real> &v, const ROL::Vector<Real> &z, Real &tol ) {
 
-    
+
     ROL::Ptr<const vector> vp = getVector(v);
     ROL::Ptr<vector> hvp = getVector(hv);
 
@@ -313,7 +312,7 @@ int main(int argc, char *argv[]) {
 
   typedef typename vector::size_type luint;
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
@@ -409,7 +408,7 @@ int main(int argc, char *argv[]) {
       file_tr << (*y_ptr)[i] << "\n";
     }
     file_tr.close();
-   
+
     ROL::Ptr<V> diff = x.clone();
     diff->set(x);
     diff->axpy(-1.0,y);
diff --git a/packages/rol/example/parabolic-control/example_02.cpp b/packages/rol/example/parabolic-control/example_02.cpp
index 4b0571dacadf..528c78f75dea 100644
--- a/packages/rol/example/parabolic-control/example_02.cpp
+++ b/packages/rol/example/parabolic-control/example_02.cpp
@@ -8,7 +8,7 @@
 // @HEADER
 
 /*! \file  example_02.cpp
-    \brief Shows how to solve a linear-quadratic parabolic control problem 
+    \brief Shows how to solve a linear-quadratic parabolic control problem
            with bound constraints.
 */
 
@@ -18,9 +18,8 @@
 #include "ROL_StatusTest.hpp"
 #include "ROL_Types.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
-#include "Teuchos_LAPACK.hpp"
+#include "ROL_GlobalMPISession.hpp"
+#include "ROL_LAPACK.hpp"
 
 #include <iostream>
 #include <fstream>
@@ -37,7 +36,7 @@ class Objective_ParabolicControl : public ROL::Objective<Real> {
   typedef ROL::Vector<Real>    V;
   typedef ROL::StdVector<Real> SV;
 
-  typedef typename vector::size_type  uint;  
+  typedef typename vector::size_type  uint;
 
 private:
   std::vector<Real> u0_;
@@ -53,12 +52,12 @@ class Objective_ParabolicControl : public ROL::Objective<Real> {
 /***************************************************************/
 
   ROL::Ptr<const vector> getVector( const V& x ) {
-     
+
     return dynamic_cast<const SV&>(x).getVector();
   }
 
   ROL::Ptr<vector> getVector( V& x ) {
-    
+
     return dynamic_cast<SV&>(x).getVector();
   }
 
@@ -87,7 +86,7 @@ class Objective_ParabolicControl : public ROL::Objective<Real> {
     o.resize(nx_-1,dx_/6.0 - dt_/dx_);
   }
 
-  void compute_residual(std::vector<Real> &r, 
+  void compute_residual(std::vector<Real> &r,
                   const std::vector<Real> &up, const std::vector<Real> &u, const Real z) {
     r.clear();
     r.resize(nx_,0.0);
@@ -123,11 +122,11 @@ class Objective_ParabolicControl : public ROL::Objective<Real> {
     }
   }
 
-  void linear_solve(std::vector<Real> &u, std::vector<Real> &d, std::vector<Real> &o, 
+  void linear_solve(std::vector<Real> &u, std::vector<Real> &d, std::vector<Real> &o,
               const std::vector<Real> &r) {
     u.assign(r.begin(),r.end());
     // Perform LDL factorization
-    Teuchos::LAPACK<int,Real> lp;
+    ROL::LAPACK<int,Real> lp;
     int info;
     int nx   = static_cast<int>(nx_);
     int ldb  = nx;
@@ -165,13 +164,13 @@ class Objective_ParabolicControl : public ROL::Objective<Real> {
       utmp.assign(u.begin(),u.end());
       update(utmp,s,-alpha);
       compute_residual(r,up,utmp,z);
-      rnorm = compute_norm(r); 
+      rnorm = compute_norm(r);
       while ( rnorm > (1.0-1.e-4*alpha)*tmp && alpha > std::sqrt(ROL::ROL_EPSILON<Real>()) ) {
         alpha /= 2.0;
         utmp.assign(u.begin(),u.end());
         update(utmp,s,-alpha);
         compute_residual(r,up,utmp,z);
-        rnorm = compute_norm(r); 
+        rnorm = compute_norm(r);
       }
       // Update iterate
       u.assign(utmp.begin(),utmp.end());
@@ -186,7 +185,7 @@ class Objective_ParabolicControl : public ROL::Objective<Real> {
 
 public:
 
-  Objective_ParabolicControl(Real alpha = 1.e-4, uint nx = 128, uint nt = 100, Real T = 1) 
+  Objective_ParabolicControl(Real alpha = 1.e-4, uint nx = 128, uint nt = 100, Real T = 1)
     : alpha_(alpha), nx_(nx), nt_(nt), T_(T) {
     u0_.resize(nx_,0.0);
     dx_ = 1.0/((Real)nx-1.0);
@@ -231,7 +230,7 @@ class Objective_ParabolicControl : public ROL::Objective<Real> {
       }
       else {
         apply_mass(r,P[t]);
-      } 
+      }
       // Solve solve adjoint system at current time step
       linear_solve(p,d,o,r);
       // Update State Storage
@@ -239,7 +238,7 @@ class Objective_ParabolicControl : public ROL::Objective<Real> {
     }
   }
 
-  void solve_state_sensitivity(std::vector<std::vector<Real> > &pV, 
+  void solve_state_sensitivity(std::vector<std::vector<Real> > &pV,
                          const std::vector<std::vector<Real> > &U, const std::vector<Real> &z) {
     // Initialize State Storage
     pV.clear();
@@ -268,7 +267,7 @@ class Objective_ParabolicControl : public ROL::Objective<Real> {
     }
   }
 
-  void solve_adjoint_sensitivity(std::vector<std::vector<Real> > &Q, 
+  void solve_adjoint_sensitivity(std::vector<std::vector<Real> > &Q,
                            const std::vector<std::vector<Real> > &U, const std::vector<std::vector<Real> > &P,
                            const std::vector<std::vector<Real> > &pV, const std::vector<Real> &z) {
     // Initialize State Storage
@@ -314,7 +313,7 @@ class Objective_ParabolicControl : public ROL::Objective<Real> {
 
   Real value( const ROL::Vector<Real> &z, Real &tol ) {
 
-    
+
     ROL::Ptr<const vector> zp = getVector(z);
 
     // SOLVE STATE EQUATION
@@ -355,7 +354,7 @@ class Objective_ParabolicControl : public ROL::Objective<Real> {
 
   void gradient( ROL::Vector<Real> &g, const ROL::Vector<Real> &z, Real &tol ) {
 
-    
+
     ROL::Ptr<const vector> zp = getVector(z);
     ROL::Ptr<vector> gp = getVector(g);
 
@@ -373,7 +372,7 @@ class Objective_ParabolicControl : public ROL::Objective<Real> {
 
   void hessVec( ROL::Vector<Real> &hv, const ROL::Vector<Real> &v, const ROL::Vector<Real> &z, Real &tol ) {
 
-    
+
     ROL::Ptr<const vector> zp = getVector(z);
     ROL::Ptr<const vector> vp = getVector(v);
     ROL::Ptr<vector> hvp = getVector(hv);
@@ -405,10 +404,10 @@ int main(int argc, char *argv[]) {
   typedef std::vector<RealT>         vector;
   typedef typename vector::size_type luint;
 
-    
-  
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+
+
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
@@ -504,7 +503,7 @@ int main(int argc, char *argv[]) {
       file_tr << (*y_ptr)[i] << "\n";
     }
     file_tr.close();
-   
+
     ROL::Ptr<ROL::Vector<RealT> > diff = x.clone();
     diff->set(x);
     diff->axpy(-1.0,y);
diff --git a/packages/rol/example/parabolic-control/example_03.cpp b/packages/rol/example/parabolic-control/example_03.cpp
index 24d9945fa44d..97c2d659215e 100644
--- a/packages/rol/example/parabolic-control/example_03.cpp
+++ b/packages/rol/example/parabolic-control/example_03.cpp
@@ -8,7 +8,7 @@
 // @HEADER
 
 /*! \file  example_03.cpp
-    \brief Shows how to solve a linear-quadratic parabolic control problem 
+    \brief Shows how to solve a linear-quadratic parabolic control problem
            with bound constraints.
 */
 
@@ -18,9 +18,8 @@
 #include "ROL_StatusTest.hpp"
 #include "ROL_Types.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
-#include "Teuchos_LAPACK.hpp"
+#include "ROL_GlobalMPISession.hpp"
+#include "ROL_LAPACK.hpp"
 
 #include <iostream>
 #include <fstream>
@@ -59,7 +58,7 @@ class Objective_ParabolicControl : public ROL::Objective<Real> {
   ROL::Ptr<const vector> getVector( const V& x ) {
     return dynamic_cast<const SV&>(x).getVector();
   }
- 
+
   ROL::Ptr<vector> getVector( V& x ) {
     return dynamic_cast<SV&>(x).getVector();
   }
@@ -117,7 +116,7 @@ class Objective_ParabolicControl : public ROL::Objective<Real> {
     }
   }
 
-  void add_pde_hessian(std::vector<Real> &r, const std::vector<Real> &u, const std::vector<Real> &p, 
+  void add_pde_hessian(std::vector<Real> &r, const std::vector<Real> &u, const std::vector<Real> &p,
                  const std::vector<Real> &s, Real alpha = 1.0) {
     // Contribution from nonlinearity
     Real phi = 0.0, fx = 0.0, px = 0.0, sx = 0.0, x = 0.0, w = 0.0;
@@ -179,7 +178,7 @@ class Objective_ParabolicControl : public ROL::Objective<Real> {
         break;
       }
     }
-    return val; 
+    return val;
   }
 
   Real evaluate_nonlinearity(const Real x, const std::vector<Real> &u, const int deriv = 0) {
@@ -196,7 +195,7 @@ class Objective_ParabolicControl : public ROL::Objective<Real> {
     }
   }
 
-  void compute_residual(std::vector<Real> &r, 
+  void compute_residual(std::vector<Real> &r,
                   const std::vector<Real> &up, const std::vector<Real> &u, const Real z) {
     r.clear();
     r.resize(nx_,0.0);
@@ -206,22 +205,22 @@ class Objective_ParabolicControl : public ROL::Objective<Real> {
         // Contribution from mass and stiffness term
         r[i] = dx_/6.0*(2.0*u[i]+u[i+1]) + dt_*eps2_/dx_*(u[i]-u[i+1]);
         // Contribution from previous state
-        r[i]-= dx_/6.0*(2.0*up[i]+up[i+1]); 
+        r[i]-= dx_/6.0*(2.0*up[i]+up[i+1]);
       }
       else if ( i==nx_-1 ) {
         // Contribution from mass and stiffness term
         r[i] = dx_/6.0*(u[i-1]+2.0*u[i]) + dt_*eps2_/dx_*(u[i]-u[i-1]);
         // Contribution from previous state
-        r[i]-= dx_/6.0*(2.0*up[i]+up[i-1]); 
+        r[i]-= dx_/6.0*(2.0*up[i]+up[i-1]);
         // Contribution from control
         r[i]-= dt_*z;
       }
       else {
         // Contribution from mass and stiffness term
-        r[i] = dx_/6.0*(u[i-1]+4.0*u[i]+u[i+1]) 
+        r[i] = dx_/6.0*(u[i-1]+4.0*u[i]+u[i+1])
              + dt_*eps2_/dx_*(2.0*u[i]-u[i-1]-u[i+1]);
         // Contribution from previous state
-        r[i]-= dx_/6.0*(up[i-1]+4.0*up[i]+up[i+1]); 
+        r[i]-= dx_/6.0*(up[i-1]+4.0*up[i]+up[i+1]);
       }
 
       // Contribution from nonlinearity
@@ -256,11 +255,11 @@ class Objective_ParabolicControl : public ROL::Objective<Real> {
     }
   }
 
-  void linear_solve(std::vector<Real> &u, std::vector<Real> &d, std::vector<Real> &o, 
+  void linear_solve(std::vector<Real> &u, std::vector<Real> &d, std::vector<Real> &o,
               const std::vector<Real> &r) {
     u.assign(r.begin(),r.end());
     // Perform LDL factorization
-    Teuchos::LAPACK<int,Real> lp;
+    ROL::LAPACK<int,Real> lp;
     int info;
     int nx  = static_cast<int>(nx_);
     int ldb = nx;
@@ -297,13 +296,13 @@ class Objective_ParabolicControl : public ROL::Objective<Real> {
       utmp.assign(u.begin(),u.end());
       update(utmp,s,-alpha);
       compute_residual(r,up,utmp,z);
-      rnorm = compute_norm(r); 
+      rnorm = compute_norm(r);
       while ( rnorm > (1.0-1.e-4*alpha)*tmp && alpha > std::sqrt(ROL::ROL_EPSILON<Real>()) ) {
         alpha /= 2.0;
         utmp.assign(u.begin(),u.end());
         update(utmp,s,-alpha);
         compute_residual(r,up,utmp,z);
-        rnorm = compute_norm(r); 
+        rnorm = compute_norm(r);
       }
       // Update iterate
       u.assign(utmp.begin(),utmp.end());
@@ -318,9 +317,9 @@ class Objective_ParabolicControl : public ROL::Objective<Real> {
 
 public:
 
-  Objective_ParabolicControl(Real alpha = 1.e-4, Real eps = 1.0, uint nx = 128, uint nt = 100, Real T = 1) 
+  Objective_ParabolicControl(Real alpha = 1.e-4, Real eps = 1.0, uint nx = 128, uint nt = 100, Real T = 1)
     : alpha_(alpha), nx_(nx), nt_(nt), T_(T) {
-    eps2_ = eps*eps;    
+    eps2_ = eps*eps;
 
     u0_.resize(this->nx_,0.0);
     dx_ = 1.0/((Real)nx-1.0);
@@ -334,10 +333,10 @@ class Objective_ParabolicControl : public ROL::Objective<Real> {
 
     wts_.resize(4,0.0);
     wts_[0] = 0.3478548451374538;
-    wts_[1] = 0.6521451548625461; 
-    wts_[2] = 0.6521451548625461; 
+    wts_[1] = 0.6521451548625461;
+    wts_[2] = 0.6521451548625461;
     wts_[3] = 0.3478548451374538;
- 
+
     Real sum = wts_[0]+wts_[1]+wts_[2]+wts_[3];
     wts_[0] *= 2.0/sum;
     wts_[1] *= 2.0/sum;
@@ -383,7 +382,7 @@ class Objective_ParabolicControl : public ROL::Objective<Real> {
       }
       else {
         apply_mass(r,P[t]);
-      } 
+      }
       // Solve solve adjoint system at current time step
       linear_solve(p,d,o,r);
       // Update State Storage
@@ -391,7 +390,7 @@ class Objective_ParabolicControl : public ROL::Objective<Real> {
     }
   }
 
-  void solve_state_sensitivity(std::vector<std::vector<Real> > &sV, 
+  void solve_state_sensitivity(std::vector<std::vector<Real> > &sV,
                          const std::vector<std::vector<Real> > &U, const std::vector<Real> &z) {
     // Initialize State Storage
     sV.clear();
@@ -420,7 +419,7 @@ class Objective_ParabolicControl : public ROL::Objective<Real> {
     }
   }
 
-  void solve_adjoint_sensitivity(std::vector<std::vector<Real> > &Q, 
+  void solve_adjoint_sensitivity(std::vector<std::vector<Real> > &Q,
                            const std::vector<std::vector<Real> > &U, const std::vector<std::vector<Real> > &P,
                            const std::vector<std::vector<Real> > &sV, const std::vector<Real> &z) {
     // Initialize State Storage
@@ -454,7 +453,7 @@ class Objective_ParabolicControl : public ROL::Objective<Real> {
 
   Real value( const ROL::Vector<Real> &z, Real &tol ) {
 
-    
+
     ROL::Ptr<const vector> zp = getVector(z);
 
     // SOLVE STATE EQUATION
@@ -495,7 +494,7 @@ class Objective_ParabolicControl : public ROL::Objective<Real> {
 
   void gradient( ROL::Vector<Real> &g, const ROL::Vector<Real> &z, Real &tol ) {
 
-    
+
     ROL::Ptr<const vector> zp = getVector(z);
     ROL::Ptr<vector> gp = getVector(g);
 
@@ -512,7 +511,7 @@ class Objective_ParabolicControl : public ROL::Objective<Real> {
   }
 
   void hessVec( ROL::Vector<Real> &hv, const ROL::Vector<Real> &v, const ROL::Vector<Real> &z, Real &tol ) {
-    
+
     ROL::Ptr<const vector> zp = getVector(z);
     ROL::Ptr<const vector> vp = getVector(v);
     ROL::Ptr<vector> hvp = getVector(hv);
@@ -545,12 +544,12 @@ int main(int argc, char *argv[]) {
   typedef std::vector<RealT>     vector;
   typedef ROL::Vector<RealT>     V;
   typedef ROL::StdVector<RealT>  SV;
- 
+
   typedef typename vector::size_type luint;
 
-    
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
@@ -571,7 +570,7 @@ int main(int argc, char *argv[]) {
     luint nt     = 100;   // Set temporal discretization.
     RealT T     = 1.0;   // Set end time.
     RealT alpha = 1.e-3; // Set penalty parameter.
-    RealT eps   = 5.e-1; // Set conductivity 
+    RealT eps   = 5.e-1; // Set conductivity
     Objective_ParabolicControl<RealT> obj(alpha,eps,nx,nt,T);
     // Initialize iteration vectors.
     ROL::Ptr<vector> x_ptr = ROL::makePtr<vector>(nt, 1.0);
@@ -648,7 +647,7 @@ int main(int argc, char *argv[]) {
       file_tr << (*y_ptr)[i] << "\n";
     }
     file_tr.close();
-   
+
     ROL::Ptr<V> diff = x.clone();
     diff->set(x);
     diff->axpy(-1.0,y);
diff --git a/packages/rol/example/parabolic-control/example_04.cpp b/packages/rol/example/parabolic-control/example_04.cpp
index 74e1c69e57f1..c242469a54cd 100644
--- a/packages/rol/example/parabolic-control/example_04.cpp
+++ b/packages/rol/example/parabolic-control/example_04.cpp
@@ -28,9 +28,8 @@
 #include "ROL_Bounds.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
-#include "Teuchos_LAPACK.hpp"
+#include "ROL_GlobalMPISession.hpp"
+#include "ROL_LAPACK.hpp"
 
 #include <iostream>
 #include <algorithm>
@@ -42,8 +41,8 @@ class Constraint_ParabolicControl : public ROL::Constraint_SimOpt<Real> {
   typedef std::vector<Real>    vector;
   typedef ROL::Vector<Real>    V;
   typedef ROL::StdVector<Real> SV;
-  
-  typedef typename vector::size_type uint; 
+
+  typedef typename vector::size_type uint;
 
 private:
   std::vector<Real> u0_;
@@ -98,7 +97,7 @@ class Constraint_ParabolicControl : public ROL::Constraint_SimOpt<Real> {
     jv[nx_-1] += eps2_*dt_*4.0*std::pow(u[nx_-1],3.0)*v[nx_-1];
   }
 
-  void apply_pde_hessian(std::vector<Real> &r, const std::vector<Real> &u, const std::vector<Real> &p, 
+  void apply_pde_hessian(std::vector<Real> &r, const std::vector<Real> &u, const std::vector<Real> &p,
                          const std::vector<Real> &s) {
     r.clear();
     r.resize(nx_,0.0);
@@ -106,7 +105,7 @@ class Constraint_ParabolicControl : public ROL::Constraint_SimOpt<Real> {
     r[nx_-1] = eps2_*dt_*12.0*std::pow(u[nx_-1],2.0)*p[nx_-1]*s[nx_-1];
   }
 
-  void compute_residual(std::vector<Real> &r, const std::vector<Real> &up, 
+  void compute_residual(std::vector<Real> &r, const std::vector<Real> &up,
                         const std::vector<Real> &u, const Real z) {
     r.clear();
     r.resize(nx_,0.0);
@@ -138,11 +137,11 @@ class Constraint_ParabolicControl : public ROL::Constraint_SimOpt<Real> {
     }
   }
 
-  void linear_solve(std::vector<Real> &u, std::vector<Real> &d, std::vector<Real> &o, 
+  void linear_solve(std::vector<Real> &u, std::vector<Real> &d, std::vector<Real> &o,
               const std::vector<Real> &r) {
     u.assign(r.begin(),r.end());
     // Perform LDL factorization
-    Teuchos::LAPACK<int,Real> lp;
+    ROL::LAPACK<int,Real> lp;
     int nx = static_cast<int>(nx_);
     int info;
     int ldb  = nx;
@@ -179,13 +178,13 @@ class Constraint_ParabolicControl : public ROL::Constraint_SimOpt<Real> {
       utmp.assign(u.begin(),u.end());
       update(utmp,s,-alpha);
       compute_residual(r,up,utmp,z);
-      rnorm = compute_norm(r); 
+      rnorm = compute_norm(r);
       while ( rnorm > (1.0-1.e-4*alpha)*tmp && alpha > std::sqrt(ROL::ROL_EPSILON<Real>()) ) {
         alpha /= 2.0;
         utmp.assign(u.begin(),u.end());
         update(utmp,s,-alpha);
         compute_residual(r,up,utmp,z);
-        rnorm = compute_norm(r); 
+        rnorm = compute_norm(r);
       }
       // Update iterate
       u.assign(utmp.begin(),utmp.end());
@@ -196,13 +195,13 @@ class Constraint_ParabolicControl : public ROL::Constraint_SimOpt<Real> {
   }
 
   ROL::Ptr<const vector> getVector( const V& x ) {
-    
+
     return dynamic_cast<const SV&>(x).getVector();
   }
 
   ROL::Ptr<vector> getVector( V& x ) {
-    
-    return dynamic_cast<SV&>(x).getVector(); 
+
+    return dynamic_cast<SV&>(x).getVector();
   }
 
 
@@ -213,7 +212,7 @@ class Constraint_ParabolicControl : public ROL::Constraint_SimOpt<Real> {
 
 public:
 
-  Constraint_ParabolicControl(Real eps = 1.0, int nx = 128, int nt = 100, Real T = 1) 
+  Constraint_ParabolicControl(Real eps = 1.0, int nx = 128, int nt = 100, Real T = 1)
     : eps1_(eps), eps2_(1.0), nx_(nx), nt_(nt), T_(T) {
     u0_.resize(nx_,0.0);
     dx_ = 1.0/((Real)nx-1.0);
@@ -269,7 +268,7 @@ class Constraint_ParabolicControl : public ROL::Constraint_SimOpt<Real> {
 
   void applyJacobian_1(ROL::Vector<Real> &jv, const ROL::Vector<Real> &v, const ROL::Vector<Real> &u,
                        const ROL::Vector<Real> &z, Real &tol) {
-    
+
     ROL::Ptr<vector> jvp = getVector(jv);
     ROL::Ptr<const vector> vp = getVector(v);
     ROL::Ptr<const vector> up = getVector(u);
@@ -299,12 +298,12 @@ class Constraint_ParabolicControl : public ROL::Constraint_SimOpt<Real> {
 
   void applyInverseJacobian_1(ROL::Vector<Real> &jv, const ROL::Vector<Real> &v, const ROL::Vector<Real> &u,
                               const ROL::Vector<Real> &z, Real &tol) {
- 
+
     ROL::Ptr<vector> jvp = getVector(jv);
     ROL::Ptr<const vector> vp = getVector(v);
     ROL::Ptr<const vector> up = getVector(u);
     ROL::Ptr<const vector> zp = getVector(z);
-   
+
     // Initialize State Storage
     std::vector<Real> M(u0_);
     std::vector<Real> sold(u0_);
@@ -325,7 +324,7 @@ class Constraint_ParabolicControl : public ROL::Constraint_SimOpt<Real> {
       compute_pde_jacobian(d,o,unew);
       // Get Right Hand Side
       if ( t > 0 ) {
-        apply_mass(M,sold); 
+        apply_mass(M,sold);
         update(vnew,M);
       }
       // Solve solve adjoint system at current time step
@@ -367,9 +366,9 @@ class Constraint_ParabolicControl : public ROL::Constraint_SimOpt<Real> {
     }
   }
 
-  void applyInverseAdjointJacobian_1(ROL::Vector<Real> &jv, const ROL::Vector<Real> &v, 
+  void applyInverseAdjointJacobian_1(ROL::Vector<Real> &jv, const ROL::Vector<Real> &v,
                                      const ROL::Vector<Real> &u, const ROL::Vector<Real> &z, Real &tol) {
-    
+
     ROL::Ptr<vector> jvp = getVector(jv);
     ROL::Ptr<const vector> vp = getVector(v);
     ROL::Ptr<const vector> up = getVector(u);
@@ -395,7 +394,7 @@ class Constraint_ParabolicControl : public ROL::Constraint_SimOpt<Real> {
       compute_pde_jacobian(d,o,unew);
       // Get Right Hand Side
       if ( t < nt_ ) {
-        apply_mass(M,sold); 
+        apply_mass(M,sold);
         update(vnew,M);
       }
       // Solve solve adjoint system at current time step
@@ -434,7 +433,7 @@ class Constraint_ParabolicControl : public ROL::Constraint_SimOpt<Real> {
     }
   }
 
-  void applyAdjointHessian_11(ROL::Vector<Real> &hwv, const ROL::Vector<Real> &w, 
+  void applyAdjointHessian_11(ROL::Vector<Real> &hwv, const ROL::Vector<Real> &w,
                               const ROL::Vector<Real> &v, const ROL::Vector<Real> &u,
                               const ROL::Vector<Real> &z, Real &tol) {
 
@@ -465,19 +464,19 @@ class Constraint_ParabolicControl : public ROL::Constraint_SimOpt<Real> {
     }
   }
 
-  void applyAdjointHessian_12(ROL::Vector<Real> &hwv, const ROL::Vector<Real> &w, 
+  void applyAdjointHessian_12(ROL::Vector<Real> &hwv, const ROL::Vector<Real> &w,
                               const ROL::Vector<Real> &v, const ROL::Vector<Real> &u,
                               const ROL::Vector<Real> &z, Real &tol) {
     hwv.zero();
   }
 
-  void applyAdjointHessian_21(ROL::Vector<Real> &hwv, const ROL::Vector<Real> &w, 
+  void applyAdjointHessian_21(ROL::Vector<Real> &hwv, const ROL::Vector<Real> &w,
                               const ROL::Vector<Real> &v, const ROL::Vector<Real> &u,
                               const ROL::Vector<Real> &z, Real &tol) {
     hwv.zero();
   }
 
-  void applyAdjointHessian_22(ROL::Vector<Real> &hwv, const ROL::Vector<Real> &w, 
+  void applyAdjointHessian_22(ROL::Vector<Real> &hwv, const ROL::Vector<Real> &w,
                               const ROL::Vector<Real> &v, const ROL::Vector<Real> &u,
                               const ROL::Vector<Real> &z, Real &tol) {
     hwv.zero();
@@ -490,8 +489,8 @@ class Objective_ParabolicControl : public ROL::Objective_SimOpt<Real> {
   typedef std::vector<Real>    vector;
   typedef ROL::Vector<Real>    V;
   typedef ROL::StdVector<Real> SV;
-  
-  typedef typename vector::size_type uint; 
+
+  typedef typename vector::size_type uint;
 
 private:
   Real alpha_;
@@ -529,7 +528,7 @@ class Objective_ParabolicControl : public ROL::Objective_SimOpt<Real> {
   }
 
   Real compute_dot(const std::vector<Real> &r, const std::vector<Real> &s) {
-    Real ip = 0.0; 
+    Real ip = 0.0;
     for (uint i = 0; i < r.size(); i++) {
       ip += r[i]*s[i];
     }
@@ -557,13 +556,13 @@ class Objective_ParabolicControl : public ROL::Objective_SimOpt<Real> {
   }
 
   ROL::Ptr<const vector> getVector( const V& x ) {
-    
+
     return dynamic_cast<const SV&>(x).getVector();
   }
 
   ROL::Ptr<vector> getVector( V& x ) {
-    
-    return dynamic_cast<SV&>(x).getVector(); 
+
+    return dynamic_cast<SV&>(x).getVector();
   }
 
 /*************************************************************/
@@ -572,7 +571,7 @@ class Objective_ParabolicControl : public ROL::Objective_SimOpt<Real> {
 
 public:
 
-  Objective_ParabolicControl(Real alpha = 1.e-4, int nx = 128, int nt = 100, Real T = 1) 
+  Objective_ParabolicControl(Real alpha = 1.e-4, int nx = 128, int nt = 100, Real T = 1)
     : alpha_(alpha), nx_(nx), nt_(nt), T_(T) {
     dx_ = 1.0/((Real)nx-1.0);
     dt_ = T/((Real)nt-1.0);
@@ -587,8 +586,8 @@ class Objective_ParabolicControl : public ROL::Objective_SimOpt<Real> {
     std::vector<Real> uT(nx_,0.0);
     for (uint n = 0; n < nx_; n++) {
       uT[n] = (*up)[(nt_-1)*nx_ + n] - evaluate_target((Real)n*dx_);
-    } 
-    Real val = 0.5*compute_weighted_dot(uT,uT); 
+    }
+    Real val = 0.5*compute_weighted_dot(uT,uT);
 
     // Add Norm of Control
     for (uint t = 0; t < nt_; t++) {
@@ -607,7 +606,7 @@ class Objective_ParabolicControl : public ROL::Objective_SimOpt<Real> {
     std::vector<Real> uT(nx_,0.0);
     for (uint n = 0; n < nx_; n++) {
       uT[n] = (*up)[(nt_-1)*nx_ + n] - evaluate_target((Real)n*dx_);
-    } 
+    }
     std::vector<Real> M(nx_,0.0);
     apply_mass(M,uT);
     for (uint n = 0; n < nx_; n++) {
@@ -621,14 +620,14 @@ class Objective_ParabolicControl : public ROL::Objective_SimOpt<Real> {
     ROL::Ptr<vector> gp = getVector(g);
     ROL::Ptr<const vector> up = getVector(u);
     ROL::Ptr<const vector> zp = getVector(z);
- 
+
     // Compute gradient
     for (uint n = 0; n < nt_; n++) {
       (*gp)[n] = dt_*alpha_*(*zp)[n];
     }
   }
 
-  void hessVec_11( ROL::Vector<Real> &hv, const ROL::Vector<Real> &v, const ROL::Vector<Real> &u, 
+  void hessVec_11( ROL::Vector<Real> &hv, const ROL::Vector<Real> &v, const ROL::Vector<Real> &u,
                    const ROL::Vector<Real> &z, Real &tol ) {
 
     ROL::Ptr<vector> hvp = getVector(hv);
@@ -640,7 +639,7 @@ class Objective_ParabolicControl : public ROL::Objective_SimOpt<Real> {
     std::vector<Real> vT(nx_,0.0);
     for (uint n = 0; n < nx_; n++) {
       vT[n] = (*vp)[(nt_-1)*nx_ + n];
-    } 
+    }
     std::vector<Real> M(nx_,0.0);
     apply_mass(M,vT);
     for (uint n = 0; n < nx_; n++) {
@@ -648,17 +647,17 @@ class Objective_ParabolicControl : public ROL::Objective_SimOpt<Real> {
     }
   }
 
-  void hessVec_12( ROL::Vector<Real> &hv, const ROL::Vector<Real> &v, const ROL::Vector<Real> &u, 
+  void hessVec_12( ROL::Vector<Real> &hv, const ROL::Vector<Real> &v, const ROL::Vector<Real> &u,
                    const ROL::Vector<Real> &z, Real &tol ) {
     hv.zero();
   }
 
-  void hessVec_21( ROL::Vector<Real> &hv, const ROL::Vector<Real> &v, const ROL::Vector<Real> &u, 
+  void hessVec_21( ROL::Vector<Real> &hv, const ROL::Vector<Real> &v, const ROL::Vector<Real> &u,
                    const ROL::Vector<Real> &z, Real &tol ) {
     hv.zero();
   }
 
-  void hessVec_22( ROL::Vector<Real> &hv, const ROL::Vector<Real> &v, const ROL::Vector<Real> &u, 
+  void hessVec_22( ROL::Vector<Real> &hv, const ROL::Vector<Real> &v, const ROL::Vector<Real> &u,
                    const ROL::Vector<Real> &z, Real &tol ) {
 
     ROL::Ptr<vector> hvp = getVector(hv);
@@ -680,10 +679,10 @@ int main(int argc, char *argv[]) {
   typedef std::vector<RealT>    vector;
   typedef ROL::Vector<RealT>    V;
   typedef ROL::StdVector<RealT> SV;
-   
+
   typedef typename vector::size_type luint;
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
@@ -705,7 +704,7 @@ int main(int argc, char *argv[]) {
     luint nt     = 40;    // Set temporal discretization.
     RealT T     = 1.0;   // Set end time.
     RealT alpha = 1.e-3; // Set penalty parameter.
-    RealT eps   = 5.e-1; // Set conductivity 
+    RealT eps   = 5.e-1; // Set conductivity
     Objective_ParabolicControl<RealT> obj(alpha,nx,nt,T);
     Constraint_ParabolicControl<RealT> con(eps,nx,nt,T);
 
@@ -803,7 +802,7 @@ int main(int argc, char *argv[]) {
     xz.zero();
     std::clock_t timer_pdas = std::clock();
     algo->run(xz, robj, icon, true, *outStream);
-    *outStream << "Primal Dual Active Set required " << (std::clock()-timer_pdas)/(RealT)CLOCKS_PER_SEC 
+    *outStream << "Primal Dual Active Set required " << (std::clock()-timer_pdas)/(RealT)CLOCKS_PER_SEC
                << " seconds.\n";
 
     // Projected Newton.
@@ -817,7 +816,7 @@ int main(int argc, char *argv[]) {
     xz.zero();
     std::clock_t timer_tr = std::clock();
     algo->run(xz, robj, icon, true, *outStream);
-    *outStream << "Projected Newton required " << (std::clock()-timer_tr)/(RealT)CLOCKS_PER_SEC 
+    *outStream << "Projected Newton required " << (std::clock()-timer_tr)/(RealT)CLOCKS_PER_SEC
                << " seconds.\n";
 
     // Composite step.
@@ -832,7 +831,7 @@ int main(int argc, char *argv[]) {
     x.zero();
     std::clock_t timer_cs = std::clock();
     algo->run(x, g, l, c, obj, con, true, *outStream);
-    *outStream << "Composite Step required " << (std::clock()-timer_cs)/(RealT)CLOCKS_PER_SEC 
+    *outStream << "Composite Step required " << (std::clock()-timer_cs)/(RealT)CLOCKS_PER_SEC
                << " seconds.\n";
   }
   catch (std::logic_error& err) {
diff --git a/packages/rol/example/parabolic-control/example_05.cpp b/packages/rol/example/parabolic-control/example_05.cpp
index 16e4ed570cf6..50a94ec7511d 100644
--- a/packages/rol/example/parabolic-control/example_05.cpp
+++ b/packages/rol/example/parabolic-control/example_05.cpp
@@ -45,9 +45,8 @@
 #include "ROL_Bounds.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
-#include "Teuchos_LAPACK.hpp"
+#include "ROL_GlobalMPISession.hpp"
+#include "ROL_LAPACK.hpp"
 
 #include <iostream>
 #include <algorithm>
@@ -284,7 +283,7 @@ class Constraint_ParabolicControl : public ROL::Constraint_SimOpt<Real> {
               const std::vector<Real> &r) {
     u.assign(r.begin(),r.end());
     // Perform LDL factorization
-    Teuchos::LAPACK<int,Real> lp;
+    ROL::LAPACK<int,Real> lp;
     int nx = static_cast<int>(nx_);
     int info;
     int ldb  = nx;
@@ -857,7 +856,7 @@ int main(int argc, char *argv[]) {
   
       
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/example/parabolic-control/example_06.cpp b/packages/rol/example/parabolic-control/example_06.cpp
index 96cf72bc366e..a3dda137bf71 100644
--- a/packages/rol/example/parabolic-control/example_06.cpp
+++ b/packages/rol/example/parabolic-control/example_06.cpp
@@ -43,9 +43,8 @@
 #include "ROL_Reduced_Objective_SimOpt.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
-#include "Teuchos_LAPACK.hpp"
+#include "ROL_GlobalMPISession.hpp"
+#include "ROL_LAPACK.hpp"
 
 #include <iostream>
 #include <algorithm>
@@ -311,7 +310,7 @@ class Constraint_ParabolicControl : public ROL::Constraint_SimOpt<Real> {
               const std::vector<Real> &r) {
     u.assign(r.begin(),r.end());
     // Perform LDL factorization
-    Teuchos::LAPACK<int,Real> lp;
+    ROL::LAPACK<int,Real> lp;
     int nx = static_cast<int>(nx_);
     int info;
     int ldb  = nx;
@@ -929,7 +928,7 @@ int main(int argc, char *argv[]) {
 
     
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/example/paraboloid-circle/CMakeLists.txt b/packages/rol/example/paraboloid-circle/CMakeLists.txt
index 0fed4ec6d6cc..2008db0fccf1 100644
--- a/packages/rol/example/paraboloid-circle/CMakeLists.txt
+++ b/packages/rol/example/paraboloid-circle/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   example_01
   SOURCES example_01.cpp
   ARGS PrintItAll
diff --git a/packages/rol/example/paraboloid-circle/example_01.cpp b/packages/rol/example/paraboloid-circle/example_01.cpp
index b44e066b54b5..6914df75aac7 100644
--- a/packages/rol/example/paraboloid-circle/example_01.cpp
+++ b/packages/rol/example/paraboloid-circle/example_01.cpp
@@ -19,7 +19,7 @@
 #include "ROL_CompositeStep.hpp"
 #include "ROL_ConstraintStatusTest.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 
@@ -28,7 +28,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/example/poisson-control/CMakeLists.txt b/packages/rol/example/poisson-control/CMakeLists.txt
index 41a8304c5f83..de8719d821b2 100644
--- a/packages/rol/example/poisson-control/CMakeLists.txt
+++ b/packages/rol/example/poisson-control/CMakeLists.txt
@@ -1,7 +1,7 @@
 TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/teuchos/src/function)
 TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/teuchos/src/sol)
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   example_01
   SOURCES example_01.cpp
   ARGS PrintItAll
@@ -10,7 +10,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
 )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   example_02
   SOURCES example_02.cpp
   ARGS PrintItAll
@@ -19,7 +19,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
 )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   example_03
   SOURCES example_03.cpp
   ARGS PrintItAll
@@ -28,7 +28,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
 )
 
-TRIBITS_COPY_FILES_TO_BINARY_DIR(
+ROL_COPY_FILES_TO_BINARY_DIR(
   PoissonControlDataCopy
   SOURCE_FILES
     input.xml
diff --git a/packages/rol/example/poisson-control/example_01.cpp b/packages/rol/example/poisson-control/example_01.cpp
index dd5949d480f0..29eacf77ec4e 100644
--- a/packages/rol/example/poisson-control/example_01.cpp
+++ b/packages/rol/example/poisson-control/example_01.cpp
@@ -23,8 +23,7 @@
 #include "ROL_Types.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 #include <algorithm>
@@ -71,12 +70,12 @@ int main(int argc, char *argv[]) {
   typedef std::vector<RealT>     vector;
   typedef ROL::Vector<RealT>     V;
   typedef ROL::StdVector<RealT>  SV;
- 
+
   typedef typename vector::size_type luint;
-  
-    
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+
+
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
@@ -104,7 +103,7 @@ int main(int argc, char *argv[]) {
 
     for ( luint i = 0; i < dim; i++ ) {
       if ( i < dim/3.0  ||  i > 2*dim/3.0 ) {
-        (*l_ptr)[i] = 0.0; 
+        (*l_ptr)[i] = 0.0;
         (*u_ptr)[i] = 0.25;
       }
       else {
@@ -175,7 +174,7 @@ int main(int argc, char *argv[]) {
       file_tr << (*y_ptr)[i] << "\n";
     }
     file_tr.close();
-   
+
     ROL::Ptr<V> error = x.clone();
     error->set(x);
     error->axpy(-1.0,y);
diff --git a/packages/rol/example/poisson-control/example_02.cpp b/packages/rol/example/poisson-control/example_02.cpp
index 3e095d20212c..0d3218b45e7d 100644
--- a/packages/rol/example/poisson-control/example_02.cpp
+++ b/packages/rol/example/poisson-control/example_02.cpp
@@ -25,14 +25,14 @@
 // Teuchos includes
 #include "Teuchos_Time.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Teuchos_Comm.hpp"
 #include "Teuchos_DefaultComm.hpp"
 #include "Teuchos_CommHelpers.hpp"
 
 int main( int argc, char *argv[] ) {  
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   auto comm = ROL::toPtr( Teuchos::DefaultComm<int>::getComm() );
 
diff --git a/packages/rol/example/poisson-control/example_03.cpp b/packages/rol/example/poisson-control/example_03.cpp
index 15e55cf4774c..00451c700322 100644
--- a/packages/rol/example/poisson-control/example_03.cpp
+++ b/packages/rol/example/poisson-control/example_03.cpp
@@ -22,14 +22,14 @@
 // Teuchos includes
 #include "Teuchos_Time.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "Teuchos_Comm.hpp"
 #include "Teuchos_DefaultComm.hpp"
 #include "Teuchos_CommHelpers.hpp"
 
 int main( int argc, char *argv[] ) {  
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   auto comm = ROL::toPtr( Teuchos::DefaultComm<int>::getComm() );
 
diff --git a/packages/rol/example/poisson-inversion/CMakeLists.txt b/packages/rol/example/poisson-inversion/CMakeLists.txt
index 6f63d854758a..37188bf68a7a 100644
--- a/packages/rol/example/poisson-inversion/CMakeLists.txt
+++ b/packages/rol/example/poisson-inversion/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   example_01
   SOURCES example_01.cpp
   ARGS PrintItAll
@@ -8,7 +8,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
 )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   example_02
   SOURCES example_02.cpp
   ARGS PrintItAll
diff --git a/packages/rol/example/poisson-inversion/example_01.cpp b/packages/rol/example/poisson-inversion/example_01.cpp
index 2918cfd986e9..7271ea224c3d 100644
--- a/packages/rol/example/poisson-inversion/example_01.cpp
+++ b/packages/rol/example/poisson-inversion/example_01.cpp
@@ -21,7 +21,7 @@
 #include "ROL_TrustRegionStep.hpp"
 #include "ROL_StatusTest.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 #include <algorithm>
@@ -30,7 +30,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/example/poisson-inversion/example_02.cpp b/packages/rol/example/poisson-inversion/example_02.cpp
index 1f30a7f620a3..3c05a771ba5a 100644
--- a/packages/rol/example/poisson-inversion/example_02.cpp
+++ b/packages/rol/example/poisson-inversion/example_02.cpp
@@ -22,9 +22,10 @@
 #include "ROL_Types.hpp"
 #include "ROL_HelperFunctions.hpp"
 #include "ROL_Stream.hpp"
+#include "ROL_LAPACK.hpp"
 
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <fstream>
 #include <iostream>
@@ -155,7 +156,7 @@ class Objective_PoissonInversion : public ROL::Objective<Real> {
     u[    0] = z[    0]/hu_ * u0_;
     u[nu_-1] = z[nz_-1]/hu_ * u1_;
     // Solve Tridiagonal System Using LAPACK's SPD Tridiagonal Solver
-    Teuchos::LAPACK<int,Real> lp;
+    ROL::LAPACK<int,Real> lp;
     int info;
     int ldb  = nu_;
     int nhrs = 1;
@@ -182,7 +183,7 @@ class Objective_PoissonInversion : public ROL::Objective<Real> {
     p.resize(nu_,0.0);
     apply_mass(p,r);    
     // Solve Tridiagonal System Using LAPACK's SPD Tridiagonal Solver
-    Teuchos::LAPACK<int,Real> lp;
+    ROL::LAPACK<int,Real> lp;
     int info;
     int ldb  = nu_;
     int nhrs = 1;
@@ -206,7 +207,7 @@ class Objective_PoissonInversion : public ROL::Objective<Real> {
     w.resize(nu_,0.0);
     apply_linearized_control_operator(w,z,v,u);
     // Solve Tridiagonal System Using LAPACK's SPD Tridiagonal Solver
-    Teuchos::LAPACK<int,Real> lp;
+    ROL::LAPACK<int,Real> lp;
     int info;
     int ldb  = nu_;
     int nhrs = 1;
@@ -236,7 +237,7 @@ class Objective_PoissonInversion : public ROL::Objective<Real> {
       q[i] -= res[i];
     }
     // Solve Tridiagonal System Using LAPACK's SPD Tridiagonal Solver
-    Teuchos::LAPACK<int,Real> lp;
+    ROL::LAPACK<int,Real> lp;
     int info;
     int ldb  = nu_;
     int nhrs = 1;
@@ -434,7 +435,7 @@ int main(int argc, char *argv[]) {
 
     
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/example/putting/CMakeLists.txt b/packages/rol/example/putting/CMakeLists.txt
index 3f8b317e835b..a0731a46ac0d 100644
--- a/packages/rol/example/putting/CMakeLists.txt
+++ b/packages/rol/example/putting/CMakeLists.txt
@@ -1,11 +1,11 @@
 
-TRIBITS_ADD_EXECUTABLE(
+ROL_ADD_EXECUTABLE(
   example_01
   SOURCES example_01.cpp
   ADD_DIR_TO_NAME
 )
 
-TRIBITS_COPY_FILES_TO_BINARY_DIR(
+ROL_COPY_FILES_TO_BINARY_DIR(
   PuttingControlDataCopy
   SOURCE_FILES
     input.xml plot_trajectory.m
diff --git a/packages/rol/example/putting/example_01.cpp b/packages/rol/example/putting/example_01.cpp
index fd992e7f5bf2..a0b8d0122fa1 100644
--- a/packages/rol/example/putting/example_01.cpp
+++ b/packages/rol/example/putting/example_01.cpp
@@ -17,7 +17,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint = argc - 1;
diff --git a/packages/rol/example/putting/example_01.hpp b/packages/rol/example/putting/example_01.hpp
index b99e091e72f3..6daa56f0fe83 100644
--- a/packages/rol/example/putting/example_01.hpp
+++ b/packages/rol/example/putting/example_01.hpp
@@ -13,8 +13,7 @@
 */
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 #include <algorithm>
diff --git a/packages/rol/example/quadratic/CMakeLists.txt b/packages/rol/example/quadratic/CMakeLists.txt
index bc9d601d4122..aca1217d604f 100644
--- a/packages/rol/example/quadratic/CMakeLists.txt
+++ b/packages/rol/example/quadratic/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   example_01
   SOURCES example_01.cpp
   ARGS PrintItAll
@@ -8,7 +8,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_COPY_FILES_TO_BINARY_DIR(
+ROL_COPY_FILES_TO_BINARY_DIR(
   QuadraticDataCopy
   SOURCE_FILES
     input.xml
diff --git a/packages/rol/example/quadratic/example_01.cpp b/packages/rol/example/quadratic/example_01.cpp
index 5e751a00e79b..7dfd7c6c7e23 100644
--- a/packages/rol/example/quadratic/example_01.cpp
+++ b/packages/rol/example/quadratic/example_01.cpp
@@ -21,8 +21,7 @@
 #include "ROL_StdVector.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 
@@ -83,7 +82,7 @@ class precond : public ROL::LinearOperator<Real> {
 };
 
 int main(int argc, char *argv[]) {
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
@@ -97,11 +96,11 @@ int main(int argc, char *argv[]) {
   int errorFlag  = 0;
 
   // *** Example body.
- 
+
   try {
 
     // Set up problem data
-    int dim = 10; // Set problem dimension. 
+    int dim = 10; // Set problem dimension.
     ROL::Ptr<std::vector<RealT> > x_ptr = ROL::makePtr<std::vector<RealT>>(dim, 0.0);
     ROL::Ptr<std::vector<RealT> > g_ptr = ROL::makePtr<std::vector<RealT>>(dim, 0.0);
     ROL::Ptr<std::vector<RealT> > d_ptr = ROL::makePtr<std::vector<RealT>>(dim, 0.0);
@@ -133,7 +132,7 @@ int main(int argc, char *argv[]) {
 
     // Test objective
     obj->checkGradient(*x, *d, true, *outStream);
-    *outStream << "\n"; 
+    *outStream << "\n";
 
     obj->checkHessVec(*x, *v, true, *outStream);
     *outStream << "\n";
diff --git a/packages/rol/example/rosenbrock/CMakeLists.txt b/packages/rol/example/rosenbrock/CMakeLists.txt
index 0fed4ec6d6cc..2008db0fccf1 100644
--- a/packages/rol/example/rosenbrock/CMakeLists.txt
+++ b/packages/rol/example/rosenbrock/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   example_01
   SOURCES example_01.cpp
   ARGS PrintItAll
diff --git a/packages/rol/example/rosenbrock/example_01.cpp b/packages/rol/example/rosenbrock/example_01.cpp
index b4cd6bdb4743..0ddff59a41f3 100644
--- a/packages/rol/example/rosenbrock/example_01.cpp
+++ b/packages/rol/example/rosenbrock/example_01.cpp
@@ -19,7 +19,7 @@
 #include "ROL_LineSearchStep.hpp"
 #include "ROL_StatusTest.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 
@@ -27,7 +27,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/example/sacado/Accessor_CPP17.hpp b/packages/rol/example/sacado/Accessor_CPP17.hpp
new file mode 100644
index 000000000000..4387429934ea
--- /dev/null
+++ b/packages/rol/example/sacado/Accessor_CPP17.hpp
@@ -0,0 +1,89 @@
+#ifndef ACCESSOR_HPP
+#define ACCESSOR_HPP
+
+#include <type_traits>
+#include <utility>
+
+// --- C++17 Trait Definitions ---
+
+// 1. Trait to check if T is a specialization of Accessor (from previous answer)
+template <typename T>
+struct is_accessor_specialization_trait_cpp17 : std::false_type {};
+template <typename VectorType>
+struct is_accessor_specialization_trait_cpp17<Accessor<VectorType>> : std::true_type {};
+
+// 2. Trait to check for member types
+template <typename T, typename = void>
+struct has_required_member_types : std::false_type {};
+
+template <typename T>
+
+                                        typename T::value_type,
+                                        typename T::size_type
+                                    >> : std::true_type {};
+
+// 3. Trait to check for get_value method (if member types exist)
+template <typename T, bool HasTypes, typename = void> // bool HasTypes is a precondition
+struct has_correct_get_value_method : std::false_type {};
+
+template <typename T>
+struct has_correct_get_value_method<T, /* HasTypes = */ true,
+    std::void_t<decltype(
+        std::declval<const T&>().get_value(
+            std::declval<const typename T::container_type&>(),
+            std::declval<typename T::size_type>()
+        )
+    )>
+> : std::is_same<
+        decltype(
+            std::declval<const T&>().get_value(
+                std::declval<const typename T::container_type&>(),
+                std::declval<typename T::size_type>()
+            )
+        ),
+        typename T::value_type
+    > {};
+
+// 4. Trait to check for set_value method (if member types exist)
+template <typename T, bool HasTypes, typename = void> // bool HasTypes is a precondition
+struct has_correct_set_value_method : std::false_type {};
+
+template <typename T>
+struct has_correct_set_value_method<T, /* HasTypes = */ true,
+    std::void_t<decltype(
+        std::declval<const T&>().set_value(
+            std::declval<typename T::container_type&>(), // Note: non-const container
+            std::declval<typename T::size_type>(),
+            std::declval<typename T::value_type>()
+        )
+    )>
+> : std::is_same< // Check return type is void
+        decltype(
+            std::declval<const T&>().set_value(
+                std::declval<typename T::container_type&>(),
+                std::declval<typename T::size_type>(),
+                std::declval<typename T::value_type>()
+            )
+        ),
+        void
+    > {};
+
+// 5. Combine all checks for C++17
+template<typename T>
+struct is_compliant_accessor_trait {
+private:
+    static constexpr bool is_specialization = is_accessor_specialization_trait_cpp17<T>::value;
+    static constexpr bool has_types = has_required_member_types<T>::value;
+    // Only check methods if it's a specialization AND has the types, to avoid hard errors with declval on non-existent types
+    static constexpr bool has_get = has_correct_get_value_method<T, is_specialization && has_types>::value;
+    static constexpr bool has_set = has_correct_set_value_method<T, is_specialization && has_types>::value;
+
+public:
+    static constexpr bool value = is_specialization && has_types && has_get && has_set;
+};
+
+
+template <typename T>
+inline constexpr bool is_compliant_accessor_v = is_compliant_accessor_trait<T>::value;
+
+#endif // ACCESSOR_HPP
diff --git a/packages/rol/example/sacado/Accessor_CPP20.hpp b/packages/rol/example/sacado/Accessor_CPP20.hpp
new file mode 100644
index 000000000000..ca1dd043d19a
--- /dev/null
+++ b/packages/rol/example/sacado/Accessor_CPP20.hpp
@@ -0,0 +1,41 @@
+#include <concepts>
+#include <utility>
+
+// Helper to check if T is a specialization of Accessor (from previous answer)
+template <typename T>
+struct is_accessor_specialization_trait_cpp20 : std::false_type {};
+
+template <typename VectorType>
+struct is_accessor_specialization_trait_cpp20<Accessor<VectorType>> : std::true_type {};
+
+template <typename T>
+concept IsAccessorSpecialization = is_accessor_specialization_trait_cpp20<T>::value;
+
+// Concept to check for full compliance
+template<typename Acc>
+concept CompliantAccessor = IsAccessorSpecialization<Acc> && requires (
+    const Acc& accessor,
+    // Use placeholders that will be deduced based on Acc's member types
+    // If Acc::container_type doesn't exist, this part of the requires clause will fail.
+    typename Acc::container_type& mutable_container,
+    const typename Acc::container_type& const_container,
+    typename Acc::size_type index,
+    typename Acc::value_type val
+) {
+    // 1. Check for required member types
+    typename Acc::container_type;
+    typename Acc::value_type;
+    typename Acc::size_type;
+
+    // 2. Check for get_value member function
+    //    - const-correct
+    //    - takes const container_type&, size_type
+    //    - returns value_type
+    { accessor.get_value(const_container, index) } -> std::same_as<typename Acc::value_type>;
+
+    // 3. Check for set_value member function
+    //    - const-correct (the method itself is const)
+    //    - takes container_type&, size_type, value_type
+    //    - returns void
+    { accessor.set_value(mutable_container, index, val) } -> std::same_as<void>;
+};
diff --git a/packages/rol/example/sacado/CMakeLists.txt b/packages/rol/example/sacado/CMakeLists.txt
index 8ae98e36e4c6..ec9c01ff7240 100644
--- a/packages/rol/example/sacado/CMakeLists.txt
+++ b/packages/rol/example/sacado/CMakeLists.txt
@@ -2,7 +2,7 @@ TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/sacado/src/fu
 TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/util/numerics/)
 
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   example_01a
   SOURCES example_01a.cpp
   ARGS PrintItAll
@@ -11,7 +11,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
 )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   example_01b
   SOURCES example_01b.cpp
   ARGS PrintItAll
@@ -20,7 +20,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
 )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   example_02
   SOURCES example_02.cpp
   ARGS PrintItAll
@@ -29,7 +29,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
 )
 
-TRIBITS_COPY_FILES_TO_BINARY_DIR(
+ROL_COPY_FILES_TO_BINARY_DIR(
   SacadoDataCopy
   SOURCE_FILES
    parameters.xml
@@ -37,7 +37,7 @@ TRIBITS_COPY_FILES_TO_BINARY_DIR(
   DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}"
 )
 
-#  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+#  ROL_ADD_EXECUTABLE_AND_TEST(
 #    example_03
 #    SOURCES example_03.cpp
 #    ARGS PrintItAll
diff --git a/packages/rol/example/sacado/ROL_Sacado_Traits.hpp b/packages/rol/example/sacado/ROL_Sacado_Traits.hpp
new file mode 100644
index 000000000000..9603ac887ca4
--- /dev/null
+++ b/packages/rol/example/sacado/ROL_Sacado_Traits.hpp
@@ -0,0 +1,90 @@
+#ifndef ROL_SACADO_TRAITS_HPP
+#define ROL_SACADO_TRAITS_HPP
+
+#include <type_traits>
+#include <utility>
+
+#include "Sacado.hpp"
+#include "ROL_Objective.hpp"
+
+constexpr bool is_Objective( ... ) { return false; }
+constexpr bool is_Constraint( ... ) { return false; }
+
+template<class Real>
+constexpr bool is_Objective( const ROL::Objective<Real>& ) { return true; }
+
+template<class Real>
+constexpr bool is_Constraint( const ROL::Constraint<Real>& ) { return true; }
+
+constexpr bool is_DFad( ... ) { return false; }
+constexpr bool is_SFad( ... ) { return false; }
+
+template<typename T>
+constexpr bool is_DFad( const Sacado::Fad::DFad<T>& ) { return true; }
+
+template<typename T, int Num>
+constexpr bool is_SFad( const Sacado::Fad::SFad<T,Num>& ) { return true; }
+
+struct InvalidType {};
+
+constexpr bool InvalidType Fad_value( ... );
+
+template<typename T>
+constexpr T Fad_value( const Sacado::Fad::DFad<T>& );
+
+template<typename T, int Num>
+constexpr T Fad_value( const Sacado::Fad::SFad<T,Num>& );
+
+template<typename T>
+using Fad_value_t = decltype( Fad_value( std::declval<T>() ) );
+
+template<typename VectorType>
+struct Accessor {};
+
+template<typename Real>
+struct Accessor<std::vector<Real>> {
+  using container_type = std::vector<Real>;
+  using value_type     = typename std::vector<Real>::value_type;
+  using size_type      = typename std::vector<Real>::size_type;
+  inline value_type get_value( const container_type& x, size_type i ) const {
+    return x[i];
+  }
+  inline void set_value( container_type& x, size_type i, value_type xi ) const {
+    x[i] = xi;
+  }
+};
+
+// 1. Define the trait structure
+template <typename T, typename = void> // SFINAE helper for more complex traits, not strictly needed here but good practice
+struct is_accessor_specialization : std::false_type {};
+
+// 2. Define the partial specialization for Accessor<VectorType>
+template <typename VectorType>
+struct is_accessor_specialization<Accessor<VectorType>> : std::true_type {};
+
+// 3. Define the C++17 style _v variable template
+template <typename T>
+inline constexpr bool has_accessor_v = is_accessor_specialization<T>::value;
+
+// --- Example Usage (C++17) ---
+
+// SFINAE example for a function
+template <typename T,
+          typename std::enable_if_t<has_accessor_v<T>, int> = 0>
+void process_if_accessor(const T& acc_specialization) {
+    // This function will only be available in overload resolution if has_accessor_v<T> is true.
+    // You can use acc_specialization here.
+    // For demonstration:
+    if constexpr (has_accessor_v<T>) { // if constexpr is C++17
+        // Additional compile-time checks or type usage
+    }
+}
+
+
+template<typename Real>
+struct ObjectiveValue {
+  template<typename ScalarT>
+  ScalarT value  
+};
+
+#endif // ROL_SACADO_TRAITS_HPP
diff --git a/packages/rol/example/sacado/example_01a.cpp b/packages/rol/example/sacado/example_01a.cpp
index d67a8ca9fb99..31ba118943a0 100644
--- a/packages/rol/example/sacado/example_01a.cpp
+++ b/packages/rol/example/sacado/example_01a.cpp
@@ -30,7 +30,7 @@
 #include "ROL_ParameterList.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "example_01a.hpp"
 
@@ -40,7 +40,7 @@ typedef double RealT;
 
 int main(int argc, char **argv)
 {
-    Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+    ROL::GlobalMPISession mpiSession(&argc, &argv);
 
     // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
     int iprint     = argc - 1;
diff --git a/packages/rol/example/sacado/example_01b.cpp b/packages/rol/example/sacado/example_01b.cpp
index 29137ef0721a..44c939a13ae3 100644
--- a/packages/rol/example/sacado/example_01b.cpp
+++ b/packages/rol/example/sacado/example_01b.cpp
@@ -23,7 +23,7 @@
 #include "ROL_ParameterList.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "example_01b.hpp"
 
@@ -33,7 +33,7 @@ typedef double RealT;
 
 int main(int argc, char **argv)
 {
-    Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+    ROL::GlobalMPISession mpiSession(&argc, &argv);
 
     // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
     int iprint     = argc - 1;
diff --git a/packages/rol/example/sacado/example_02.cpp b/packages/rol/example/sacado/example_02.cpp
index 1546b7ff69e4..3059acf45eb3 100644
--- a/packages/rol/example/sacado/example_02.cpp
+++ b/packages/rol/example/sacado/example_02.cpp
@@ -30,7 +30,7 @@
 #include "ROL_ParameterList.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "example_02.hpp"
 
@@ -42,7 +42,7 @@ int main(int argc, char **argv)
 {
 
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/example/simple-eq-constr/CMakeLists.txt b/packages/rol/example/simple-eq-constr/CMakeLists.txt
index 0fed4ec6d6cc..2008db0fccf1 100644
--- a/packages/rol/example/simple-eq-constr/CMakeLists.txt
+++ b/packages/rol/example/simple-eq-constr/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   example_01
   SOURCES example_01.cpp
   ARGS PrintItAll
diff --git a/packages/rol/example/simple-eq-constr/example_01.cpp b/packages/rol/example/simple-eq-constr/example_01.cpp
index f2b67d98e3a5..5e942965c81d 100644
--- a/packages/rol/example/simple-eq-constr/example_01.cpp
+++ b/packages/rol/example/simple-eq-constr/example_01.cpp
@@ -18,7 +18,7 @@
 #include "ROL_CompositeStep.hpp"
 #include "ROL_ConstraintStatusTest.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 
@@ -27,7 +27,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/example/stream-buffer/CMakeLists.txt b/packages/rol/example/stream-buffer/CMakeLists.txt
index 0fed4ec6d6cc..2008db0fccf1 100644
--- a/packages/rol/example/stream-buffer/CMakeLists.txt
+++ b/packages/rol/example/stream-buffer/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   example_01
   SOURCES example_01.cpp
   ARGS PrintItAll
diff --git a/packages/rol/example/stream-buffer/example_01.cpp b/packages/rol/example/stream-buffer/example_01.cpp
index 3605ee6681ed..5f82d22b1417 100644
--- a/packages/rol/example/stream-buffer/example_01.cpp
+++ b/packages/rol/example/stream-buffer/example_01.cpp
@@ -18,7 +18,7 @@
 #include "ROL_Problem.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 #include <sstream>
@@ -43,7 +43,7 @@ class mybuffer : public std::basic_streambuf<charT,Traits> {
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a
   // (dummy) command-line argument is provided.
diff --git a/packages/rol/example/tempus/CMakeLists.txt b/packages/rol/example/tempus/CMakeLists.txt
index 1ea1cdb4676d..a8f0a9cae066 100644
--- a/packages/rol/example/tempus/CMakeLists.txt
+++ b/packages/rol/example/tempus/CMakeLists.txt
@@ -1,6 +1,6 @@
 TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/thyra/src/vector)
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   example_01
   SOURCES example_01.cpp
   ARGS PrintItAll
@@ -9,7 +9,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
 )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   example_02
   SOURCES example_02.cpp
   ARGS PrintItAll
@@ -18,7 +18,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
 )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   example_sincos
   SOURCES example_sincos.cpp
   ARGS PrintItAll
@@ -27,7 +27,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
 )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   example_parabolic_thyravec
   SOURCES example_parabolic_thyravec.cpp
   ARGS PrintItAll
@@ -36,7 +36,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
 )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   example_parabolic_modeleval
   SOURCES example_parabolic_modeleval.cpp
   ARGS PrintItAll
@@ -45,7 +45,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
 )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   example_parabolic_modeleval_forward-adjoint
   SOURCES example_parabolic_modeleval_forward-adjoint.cpp
   ARGS PrintItAll
@@ -54,7 +54,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
 )
 
-TRIBITS_COPY_FILES_TO_BINARY_DIR(
+ROL_COPY_FILES_TO_BINARY_DIR(
   ForwardEulerSinCosDataCopy
   SOURCE_FILES
     example_01.xml example_02.xml example_sincos.xml example_parabolic.xml
diff --git a/packages/rol/example/tempus/example_02.cpp b/packages/rol/example/tempus/example_02.cpp
index 518f30300cd7..c990f4ee4e13 100644
--- a/packages/rol/example/tempus/example_02.cpp
+++ b/packages/rol/example/tempus/example_02.cpp
@@ -21,7 +21,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint = argc - 1;
diff --git a/packages/rol/example/tempus/example_02.hpp b/packages/rol/example/tempus/example_02.hpp
index 25830331ec7a..140974553265 100644
--- a/packages/rol/example/tempus/example_02.hpp
+++ b/packages/rol/example/tempus/example_02.hpp
@@ -17,11 +17,11 @@
 #include "ROL_Constraint_SimOpt.hpp"
 #include "ROL_Objective_SimOpt.hpp"
 #include "ROL_Reduced_Objective_SimOpt.hpp"
+#include "ROL_ParameterList.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
-#include "Teuchos_LAPACK.hpp"
+#include "ROL_GlobalMPISession.hpp"
+#include "ROL_LAPACK.hpp"
 
 #include <iostream>
 #include <algorithm>
@@ -341,7 +341,7 @@ class Constraint_BurgersControl : public ROL::Constraint_SimOpt<Real> {
       std::vector<Real> Du(du);
       std::vector<Real> D(d);
       // Perform LDL factorization
-      Teuchos::LAPACK<int,Real> lp;
+      ROL::LAPACK<int,Real> lp;
       std::vector<Real> Du2(nx_-2,0.0);
       std::vector<int> ipiv(nx_,0);
       int info;
@@ -392,8 +392,8 @@ class Constraint_BurgersControl : public ROL::Constraint_SimOpt<Real> {
 
 public:
 
-  Constraint_BurgersControl(int nx = 128, int nt = 100, Real T = 1, 
-                                    Real nu = 1.e-2, Real u0 = 0.0, Real u1 = 0.0, Real f = 0.0) 
+  Constraint_BurgersControl(int nx = 128, int nt = 100, Real T = 1,
+                                    Real nu = 1.e-2, Real u0 = 0.0, Real u1 = 0.0, Real f = 0.0)
     : nx_((unsigned)nx), nt_((unsigned)nt), T_(T), nu_(nu), u0_(u0), u1_(u1), f_(f) {
     dx_ = 1.0/((Real)nx+1.0);
     dt_ = T/((Real)nt);
@@ -479,7 +479,7 @@ class Constraint_BurgersControl : public ROL::Constraint_SimOpt<Real> {
     value(c,u,z,tol);
   }
 
-  void applyJacobian_1(ROL::Vector<Real> &jv, const ROL::Vector<Real> &v, const ROL::Vector<Real> &u, 
+  void applyJacobian_1(ROL::Vector<Real> &jv, const ROL::Vector<Real> &v, const ROL::Vector<Real> &u,
                        const ROL::Vector<Real> &z, Real &tol) {
     ROL::Ptr<std::vector<Real> > jvp =
       dynamic_cast<ROL::StdVector<Real>&>(jv).getVector();
@@ -500,7 +500,7 @@ class Constraint_BurgersControl : public ROL::Constraint_SimOpt<Real> {
       apply_pde_jacobian(J,vold,uold,vnew,unew);
       for (unsigned n = 0; n < nx_; n++) {
         (*jvp)[t*nx_+n] = J[n];
-      }  
+      }
       vold.assign(vnew.begin(),vnew.end());
       uold.assign(unew.begin(),unew.end());
     }
@@ -571,7 +571,7 @@ class Constraint_BurgersControl : public ROL::Constraint_SimOpt<Real> {
     }
   }
 
-  void applyAdjointJacobian_1(ROL::Vector<Real> &ajv, const ROL::Vector<Real> &v, const ROL::Vector<Real> &u, 
+  void applyAdjointJacobian_1(ROL::Vector<Real> &ajv, const ROL::Vector<Real> &v, const ROL::Vector<Real> &u,
                               const ROL::Vector<Real> &z, Real &tol) {
     ROL::Ptr<std::vector<Real> > jvp =
       dynamic_cast<ROL::StdVector<Real>&>(ajv).getVector();
@@ -591,7 +591,7 @@ class Constraint_BurgersControl : public ROL::Constraint_SimOpt<Real> {
       apply_pde_jacobian(J,vold,unew,vnew,unew,true);
       for (unsigned n = 0; n < nx_; n++) {
         (*jvp)[(t-1)*nx_+n] = J[n];
-      }  
+      }
       vold.assign(vnew.begin(),vnew.end());
     }
   }
@@ -670,7 +670,7 @@ class Constraint_BurgersControl : public ROL::Constraint_SimOpt<Real> {
       dynamic_cast<const ROL::StdVector<Real>&>(v).getVector();
     std::vector<Real> snew(nx_,0.0);
     std::vector<Real> wnew(nx_,0.0);
-    std::vector<Real> wold(nx_,0.0); 
+    std::vector<Real> wold(nx_,0.0);
     std::vector<Real> vnew(nx_,0.0);
     for (unsigned t = nt_; t > 0; t--) {
       for (unsigned n = 0; n < nx_; n++) {
@@ -684,7 +684,7 @@ class Constraint_BurgersControl : public ROL::Constraint_SimOpt<Real> {
       wold.assign(wnew.begin(),wnew.end());
     }
   }
-  
+
   void applyAdjointHessian_12(ROL::Vector<Real> &ahwv, const ROL::Vector<Real> &w, const ROL::Vector<Real> &v,
                               const ROL::Vector<Real> &u, const ROL::Vector<Real> &z, Real &tol) {
     ahwv.zero();
@@ -763,7 +763,7 @@ class Objective_BurgersControl : public ROL::Objective_SimOpt<Real> {
 
 public:
 
-  Objective_BurgersControl(Real alpha = 1.e-4, int nx = 128, int nt = 100, Real T = 1.0) 
+  Objective_BurgersControl(Real alpha = 1.e-4, int nx = 128, int nt = 100, Real T = 1.0)
     : alpha_(alpha), nx_((unsigned)nx), nt_((unsigned)nt), T_(T) {
     dx_ = 1.0/((Real)nx+1.0);
     dt_ = T/((Real)nt);
@@ -841,7 +841,7 @@ class Objective_BurgersControl : public ROL::Objective_SimOpt<Real> {
     }
   }
 
-  void hessVec_11( ROL::Vector<Real> &hv, const ROL::Vector<Real> &v, 
+  void hessVec_11( ROL::Vector<Real> &hv, const ROL::Vector<Real> &v,
                    const ROL::Vector<Real> &u, const ROL::Vector<Real> &z, Real &tol ) {
     ROL::Ptr<std::vector<Real> > hvp =
       dynamic_cast<ROL::StdVector<Real>&>(hv).getVector();
@@ -863,17 +863,17 @@ class Objective_BurgersControl : public ROL::Objective_SimOpt<Real> {
     }
   }
 
-  void hessVec_12( ROL::Vector<Real> &hv, const ROL::Vector<Real> &v, 
+  void hessVec_12( ROL::Vector<Real> &hv, const ROL::Vector<Real> &v,
                    const ROL::Vector<Real> &u, const ROL::Vector<Real> &z, Real &tol ) {
     hv.zero();
   }
 
-  void hessVec_21( ROL::Vector<Real> &hv, const ROL::Vector<Real> &v, 
+  void hessVec_21( ROL::Vector<Real> &hv, const ROL::Vector<Real> &v,
                    const ROL::Vector<Real> &u, const ROL::Vector<Real> &z, Real &tol ) {
     hv.zero();
   }
 
-  void hessVec_22( ROL::Vector<Real> &hv, const ROL::Vector<Real> &v, 
+  void hessVec_22( ROL::Vector<Real> &hv, const ROL::Vector<Real> &v,
                    const ROL::Vector<Real> &u, const ROL::Vector<Real> &z, Real &tol ) {
     ROL::Ptr<std::vector<Real> > hvp = ROL::constPtrCast<std::vector<Real> >(
       (dynamic_cast<const ROL::StdVector<Real>&>(hv)).getVector());
diff --git a/packages/rol/example/tempus/example_parabolic_modeleval.cpp b/packages/rol/example/tempus/example_parabolic_modeleval.cpp
index 927c31e5a7bc..c9fa7981689d 100644
--- a/packages/rol/example/tempus/example_parabolic_modeleval.cpp
+++ b/packages/rol/example/tempus/example_parabolic_modeleval.cpp
@@ -30,7 +30,7 @@
 
 #include "example_parabolic_modeleval.hpp"
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tempus_IntegratorBasic.hpp"
 
@@ -52,7 +52,7 @@ int main(int argc, char *argv[]) {
   using RealT = double;
   using luint  = std::vector<RealT>::size_type;
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   ROL::Ptr<std::ostream> outStream = ROL::makeStreamPtr( std::cout, argc > 1 );
diff --git a/packages/rol/example/tempus/example_parabolic_modeleval_forward-adjoint.cpp b/packages/rol/example/tempus/example_parabolic_modeleval_forward-adjoint.cpp
index 63b76c949c0a..d494fafdefa7 100644
--- a/packages/rol/example/tempus/example_parabolic_modeleval_forward-adjoint.cpp
+++ b/packages/rol/example/tempus/example_parabolic_modeleval_forward-adjoint.cpp
@@ -30,7 +30,7 @@
 
 #include "example_parabolic_modeleval_forward-adjoint.hpp"
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tempus_IntegratorBasic.hpp"
 
@@ -52,7 +52,7 @@ int main(int argc, char *argv[]) {
   using RealT = double;
   using luint  = std::vector<RealT>::size_type;
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   ROL::Ptr<std::ostream> outStream = ROL::makeStreamPtr( std::cout, argc > 1 );
diff --git a/packages/rol/example/tempus/example_parabolic_thyravec.cpp b/packages/rol/example/tempus/example_parabolic_thyravec.cpp
index 38c386f39f50..6b76f9606188 100644
--- a/packages/rol/example/tempus/example_parabolic_thyravec.cpp
+++ b/packages/rol/example/tempus/example_parabolic_thyravec.cpp
@@ -30,7 +30,7 @@
 
 #include "example_parabolic_thyravec.hpp"
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "ROL_Stream.hpp"
 #include "ROL_ParameterList.hpp"
 #include "ROL_ReducedDynamicObjective.hpp"
@@ -47,7 +47,7 @@ int main(int argc, char *argv[]) {
   using RealT = double;
   using luint  = std::vector<RealT>::size_type;
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   ROL::Ptr<std::ostream> outStream = ROL::makeStreamPtr( std::cout, argc > 1 );
diff --git a/packages/rol/example/tempus/example_sincos.cpp b/packages/rol/example/tempus/example_sincos.cpp
index 35bd6bd5531a..4ea09756d8f7 100644
--- a/packages/rol/example/tempus/example_sincos.cpp
+++ b/packages/rol/example/tempus/example_sincos.cpp
@@ -7,7 +7,6 @@
 // *****************************************************************************
 // @HEADER
 
-#include "Teuchos_XMLParameterListHelpers.hpp"
 #include "Teuchos_DefaultComm.hpp"
 
 #include "example_sincos.hpp"
@@ -21,11 +20,11 @@
 int main(int argc, char *argv[])
 {
   using Teuchos::RCP;
-  using Teuchos::ParameterList;
+  using ROL::ParameterList;
   using Teuchos::sublist;
   using Teuchos::getParametersFromXmlFile;
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // Save ROL output to rol.out
   std::ofstream fout;
diff --git a/packages/rol/example/tempus/example_sincos.hpp b/packages/rol/example/tempus/example_sincos.hpp
index f2bb53037600..68e1427cfca3 100644
--- a/packages/rol/example/tempus/example_sincos.hpp
+++ b/packages/rol/example/tempus/example_sincos.hpp
@@ -12,6 +12,7 @@
 
 #include "Thyra_ModelEvaluator.hpp" // Interface
 #include "Thyra_StateFuncModelEvaluatorBase.hpp" // Implementation
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Teuchos_ParameterListAcceptorDefaultBase.hpp"
 #include "Teuchos_ParameterList.hpp"
diff --git a/packages/rol/example/tensor-opt/CMakeLists.txt b/packages/rol/example/tensor-opt/CMakeLists.txt
index 0eb61ecb6dc3..4b19049101e5 100644
--- a/packages/rol/example/tensor-opt/CMakeLists.txt
+++ b/packages/rol/example/tensor-opt/CMakeLists.txt
@@ -1,6 +1,6 @@
 TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/teuchos/src/vector)
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   example_01
   SOURCES example_01.cpp
   ARGS PrintItAll
@@ -9,7 +9,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
 )
 
-TRIBITS_COPY_FILES_TO_BINARY_DIR(
+ROL_COPY_FILES_TO_BINARY_DIR(
   TensorOptDataCopy
   SOURCE_FILES
     example_01.xml
diff --git a/packages/rol/example/tensor-opt/example_01.cpp b/packages/rol/example/tensor-opt/example_01.cpp
index e869cb578dd2..0dea0d8e423e 100644
--- a/packages/rol/example/tensor-opt/example_01.cpp
+++ b/packages/rol/example/tensor-opt/example_01.cpp
@@ -41,9 +41,8 @@
 //#pragma GCC diagnostic ignored "-Wshadow"
 //#pragma GCC diagnostic ignored "-Wundef"
 
-#include "Teuchos_ParameterList.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
-
+#include "ROL_ParameterList.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "ROL_Vector.hpp"
 #include "ROL_CArrayVector.hpp"
 #include "ROL_Bounds.hpp"
@@ -802,7 +801,7 @@ class SemidefiniteProgramming
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/example/topology-optimization/CMakeLists.txt b/packages/rol/example/topology-optimization/CMakeLists.txt
index 9e905131e6cf..b8fb999bc0fd 100644
--- a/packages/rol/example/topology-optimization/CMakeLists.txt
+++ b/packages/rol/example/topology-optimization/CMakeLists.txt
@@ -1,17 +1,12 @@
 
-TRIBITS_ADD_EXECUTABLE(
-  example_01
-  SOURCES example_01.cpp 
-  ADD_DIR_TO_NAME
-)
-TRIBITS_ADD_EXECUTABLE(
+ROL_ADD_EXECUTABLE(
   example_02
-  SOURCES example_02.cpp 
+  SOURCES example_02.cpp
   ADD_DIR_TO_NAME
 )
 
 
-TRIBITS_COPY_FILES_TO_BINARY_DIR(
+ROL_COPY_FILES_TO_BINARY_DIR(
   TopologyOptimizationDataCopy
   SOURCE_FILES
     input.xml
@@ -24,13 +19,21 @@ string( TOLOWER ${CMAKE_BUILD_TYPE} lower_build_type )
 
 if( ${lower_build_type} STREQUAL "release" )
 
-  TRIBITS_ADD_TEST(
+  ROL_ADD_EXECUTABLE_AND_TEST(
     example_01
+    SOURCES example_01.cpp
     ARGS PrintItAll
     NUM_MPI_PROCS 1
-    CATEGORIES CONTINUOUS
     PASS_REGULAR_EXPRESSION "TEST PASSED"
     ADD_DIR_TO_NAME
   )
 
+else()
+
+  ROL_ADD_EXECUTABLE(
+    example_01
+    SOURCES example_01.cpp
+    ADD_DIR_TO_NAME
+  )
+
 endif()
diff --git a/packages/rol/example/topology-optimization/example_01.cpp b/packages/rol/example/topology-optimization/example_01.cpp
index e0d2a8f6cd73..685e5495984a 100644
--- a/packages/rol/example/topology-optimization/example_01.cpp
+++ b/packages/rol/example/topology-optimization/example_01.cpp
@@ -18,8 +18,7 @@
 #include "ROL_StatusTest.hpp"
 #include "ROL_Types.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_LAPACK.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <fstream>
 #include <iostream>
@@ -32,9 +31,8 @@
 #include "ROL_Reduced_Objective_SimOpt.hpp"
 #include "ROL_StdBoundConstraint.hpp"
 #include "ROL_ParameterList.hpp"
-
-#include "Teuchos_SerialDenseVector.hpp"
-#include "Teuchos_SerialDenseSolver.hpp"
+#include "ROL_LAPACK.hpp"
+#include "ROL_LinearAlgebra.hpp"
 
 template<class Real>
 class FEM {
@@ -48,7 +46,7 @@ class FEM {
   int prob_;
   Real E_;
   Real nu_;
-  Teuchos::SerialDenseMatrix<int,Real> KE_;
+  ROL::LA::Matrix<Real> KE_;
 
 public:
 
@@ -175,7 +173,7 @@ class FEM {
     return false;
   }
 
-  void set_boundary_conditions(Teuchos::SerialDenseVector<int, Real> &U) {
+  void set_boundary_conditions(ROL::LA::Vector<Real> &U) {
     for (int i=0; i<U.length(); i++) {
       if ( check_on_boundary(i) ) {
         U(i) = 0.0;
@@ -199,7 +197,7 @@ class FEM {
     }
   }
 
-  void build_force(Teuchos::SerialDenseVector<int, Real> &F) {
+  void build_force(ROL::LA::Vector<Real> &F) {
     F.resize(numU());
     F.putScalar(0.0);
     switch(prob_) {
@@ -208,7 +206,7 @@ class FEM {
     }
   }
 
-  void build_jacobian(Teuchos::SerialDenseMatrix<int, Real> &K, const std::vector<Real> &z,
+  void build_jacobian(ROL::LA::Matrix<Real> &K, const std::vector<Real> &z,
                       bool transpose = false) {
     // Fill jacobian
     K.shape(2*(nx_+1)*(ny_+1),2*(nx_+1)*(ny_+1));
@@ -243,7 +241,7 @@ class FEM {
     }
   }
 
-  void build_jacobian(Teuchos::SerialDenseMatrix<int, Real> &K, const std::vector<Real> &z, 
+  void build_jacobian(ROL::LA::Matrix<Real> &K, const std::vector<Real> &z, 
                       const std::vector<Real> &v, bool transpose = false) {
     // Fill jacobian
     K.shape(2*(nx_+1)*(ny_+1),2*(nx_+1)*(ny_+1));
@@ -447,24 +445,22 @@ class Constraint_TopOpt : public ROL::Constraint_SimOpt<Real> {
     ROL::Ptr<const vector> zp = getVector(z);
 
     // Assemble Jacobian
-    Teuchos::SerialDenseMatrix<int, Real> K;
+    ROL::LA::Matrix<Real> K;
     FEM_->build_jacobian(K,*zp);
     // Assemble RHS
-    Teuchos::SerialDenseVector<int, Real> F(K.numRows());
+    ROL::LA::Vector<Real> F(K.numRows());
     FEM_->build_force(F);
     // Solve
-    Teuchos::SerialDenseVector<int, Real> U(K.numCols());
-    Teuchos::SerialDenseSolver<int, Real> solver;
-    solver.setMatrix( Teuchos::rcpFromRef(K) );
-    solver.setVectors( Teuchos::rcpFromRef(U), Teuchos::rcpFromRef(F) );
-    solver.factorWithEquilibration(true);
-    solver.factor();
-    solver.solve();
-    FEM_->set_boundary_conditions(U);
+    ROL::Ptr<ROL::LAPACK<int, Real>> lapack = ROL::makePtr<ROL::LAPACK<int,Real>>();
+    int info;
+    int n = K.numRows();
+    lapack->POTRF('U',n,K.values(),n,&info);
+    lapack->POTRS('U',n,1,K.values(),n,F.values(),n,&info);
+    FEM_->set_boundary_conditions(F);
     // Retrieve solution
-    up->resize(U.length(),0.0);
-    for (uint i=0; i<static_cast<uint>(U.length()); i++) {
-      (*up)[i] = U(i);
+    up->resize(F.length(),0.0);
+    for (uint i=0; i<static_cast<uint>(F.length()); i++) {
+      (*up)[i] = F(i);
     }
     // Compute residual
     this->value(c,u,z,tol);
@@ -513,25 +509,23 @@ class Constraint_TopOpt : public ROL::Constraint_SimOpt<Real> {
     ROL::Ptr<const vector> zp = getVector(z);
 
     // Assemble Jacobian
-    Teuchos::SerialDenseMatrix<int, Real> K;
+    ROL::LA::Matrix<Real> K;
     FEM_->build_jacobian(K,*zp);
     // Solve
-    Teuchos::SerialDenseVector<int, Real> U(K.numCols());
-    Teuchos::SerialDenseVector<int, Real> F(vp->size());
+    ROL::LA::Vector<Real> F(vp->size());
     for (uint i=0; i<vp->size(); i++) {
       F(i) = (*vp)[i];
     }
-    Teuchos::SerialDenseSolver<int, Real> solver;
-    solver.setMatrix(Teuchos::rcpFromRef(K));
-    solver.setVectors(Teuchos::rcpFromRef(U),Teuchos::rcpFromRef(F));
-    solver.factorWithEquilibration(true);
-    solver.factor();
-    solver.solve();
-    FEM_->set_boundary_conditions(U);
+    ROL::Ptr<ROL::LAPACK<int, Real>> lapack = ROL::makePtr<ROL::LAPACK<int,Real>>();
+    int info;
+    int n = K.numRows();
+    lapack->POTRF('U',n,K.values(),n,&info);
+    lapack->POTRS('U',n,1,K.values(),n,F.values(),n,&info);
+    FEM_->set_boundary_conditions(F);
     // Retrieve solution
-    ijvp->resize(U.length(),0.0);
-    for (uint i=0; i<static_cast<uint>(U.length()); i++) {
-      (*ijvp)[i] = U(i);
+    ijvp->resize(F.length(),0.0);
+    for (uint i=0; i<static_cast<uint>(F.length()); i++) {
+      (*ijvp)[i] = F(i);
     }
   }
 
@@ -580,25 +574,23 @@ class Constraint_TopOpt : public ROL::Constraint_SimOpt<Real> {
      ROL::Ptr<const vector> zp = getVector(z);
 
     // Assemble Jacobian
-    Teuchos::SerialDenseMatrix<int, Real> K;
+    ROL::LA::Matrix<Real> K;
     FEM_->build_jacobian(K,*zp);
     // Solve
-    Teuchos::SerialDenseVector<int, Real> U(K.numCols());
-    Teuchos::SerialDenseVector<int, Real> F(vp->size());
+    ROL::LA::Vector<Real> F(vp->size());
     for (uint i=0; i<vp->size(); i++) {
       F(i) = (*vp)[i];
     }
-    Teuchos::SerialDenseSolver<int, Real> solver;
-    solver.setMatrix(Teuchos::rcpFromRef(K));
-    solver.setVectors(Teuchos::rcpFromRef(U), Teuchos::rcpFromRef(F));
-    solver.factorWithEquilibration(true);
-    solver.factor();
-    solver.solve();
-    FEM_->set_boundary_conditions(U);
+    ROL::Ptr<ROL::LAPACK<int, Real>> lapack = ROL::makePtr<ROL::LAPACK<int,Real>>();
+    int info;
+    int n = K.numRows();
+    lapack->POTRF('U',n,K.values(),n,&info);
+    lapack->POTRS('U',n,1,K.values(),n,F.values(),n,&info);
+    FEM_->set_boundary_conditions(F);
     // Retrieve solution
-    iajvp->resize(U.length(),0.0);
-    for (int i=0; i<U.length(); i++) {
-      (*iajvp)[i] = U(i);
+    iajvp->resize(F.length(),0.0);
+    for (int i=0; i<F.length(); i++) {
+      (*iajvp)[i] = F(i);
     }
   }
 
@@ -1000,7 +992,7 @@ typedef double RealT;
 int main(int argc, char *argv[]) {
 
   typedef typename std::vector<RealT>::size_type luint;
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
@@ -1152,7 +1144,7 @@ int main(int argc, char *argv[]) {
       for (luint i=0; i<nx; i++) {
         for (luint lj=0; lj<ny; lj++) {
           val = (*z_ptr)[i+lj*nx];
-          file << i << "  " << lj << "  " << val << "\n"; 
+          file << i << "  " << lj << "  " << val << "\n";
         }
       }
       file.close();
diff --git a/packages/rol/example/topology-optimization/example_02.cpp b/packages/rol/example/topology-optimization/example_02.cpp
index b45ba0efd7c4..0e92fac264cd 100644
--- a/packages/rol/example/topology-optimization/example_02.cpp
+++ b/packages/rol/example/topology-optimization/example_02.cpp
@@ -13,8 +13,7 @@
 */
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_LAPACK.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 #include <algorithm>
@@ -26,9 +25,8 @@
 #include "ROL_Problem.hpp"
 #include "ROL_Solver.hpp"
 #include "ROL_ParameterList.hpp"
-
-#include "Teuchos_SerialDenseVector.hpp"
-#include "Teuchos_SerialDenseSolver.hpp"
+#include "ROL_LAPACK.hpp"
+#include "ROL_LinearAlgebra.hpp"
 
 template<class Real>
 class FEM {
@@ -42,7 +40,7 @@ class FEM {
   int prob_;
   Real E_;
   Real nu_;
-  Teuchos::SerialDenseMatrix<int,Real> KE_;
+  ROL::LA::Matrix<Real> KE_;
 
 public:
 
@@ -169,7 +167,7 @@ class FEM {
     return false;
   }
 
-  void set_boundary_conditions(Teuchos::SerialDenseVector<int, Real> &U) {
+  void set_boundary_conditions(ROL::LA::Vector<Real> &U) {
     for (int i=0; i<U.length(); i++) {
       if ( check_on_boundary(i) ) {
         U(i) = 0.0;
@@ -193,7 +191,7 @@ class FEM {
     }
   }
 
-  void build_force(Teuchos::SerialDenseVector<int, Real> &F) {
+  void build_force(ROL::LA::Vector<Real> &F) {
     F.resize(numU());
     F.putScalar(0.0);
     switch(prob_) {
@@ -202,7 +200,7 @@ class FEM {
     }
   }
 
-  void build_jacobian(Teuchos::SerialDenseMatrix<int, Real> &K, const std::vector<Real> &z,
+  void build_jacobian(ROL::LA::Matrix<Real> &K, const std::vector<Real> &z,
                       bool transpose = false) {
     // Fill jacobian
     K.shape(2*(nx_+1)*(ny_+1),2*(nx_+1)*(ny_+1));
@@ -237,7 +235,7 @@ class FEM {
     }
   }
 
-  void build_jacobian(Teuchos::SerialDenseMatrix<int, Real> &K, const std::vector<Real> &z, 
+  void build_jacobian(ROL::LA::Matrix<Real> &K, const std::vector<Real> &z, 
                       const std::vector<Real> &v, bool transpose = false) {
     // Fill jacobian
     K.shape(2*(nx_+1)*(ny_+1),2*(nx_+1)*(ny_+1));
@@ -433,24 +431,22 @@ class Constraint_TopOpt : public ROL::Constraint_SimOpt<Real> {
     ROL::Ptr<const vector> zp = getVector(z);
 
     // Assemble Jacobian
-    Teuchos::SerialDenseMatrix<int, Real> K;
+    ROL::LA::Matrix<Real> K;
     FEM_->build_jacobian(K,*zp);
     // Assemble RHS
-    Teuchos::SerialDenseVector<int, Real> F(K.numRows());
+    ROL::LA::Vector<Real> F(K.numRows());
     FEM_->build_force(F);
     // Solve
-    Teuchos::SerialDenseVector<int, Real> U(K.numCols());
-    Teuchos::SerialDenseSolver<int, Real> solver;
-    solver.setMatrix( Teuchos::rcpFromRef(K) );
-    solver.setVectors( Teuchos::rcpFromRef(U), Teuchos::rcpFromRef(F) );
-    solver.factorWithEquilibration(true);
-    solver.factor();
-    solver.solve();
-    FEM_->set_boundary_conditions(U);
+    ROL::Ptr<ROL::LAPACK<int, Real>> lapack = ROL::makePtr<ROL::LAPACK<int,Real>>();
+    int info;
+    int n = K.numRows();
+    lapack->POTRF('U',n,K.values(),n,&info);
+    lapack->POTRS('U',n,1,K.values(),n,F.values(),n,&info);
+    FEM_->set_boundary_conditions(F);
     // Retrieve solution
-    up->resize(U.length(),0.0);
-    for (uint i=0; i<static_cast<uint>(U.length()); i++) {
-      (*up)[i] = U(i);
+    up->resize(F.length(),0.0);
+    for (uint i=0; i<static_cast<uint>(F.length()); i++) {
+      (*up)[i] = F(i);
     }
     // Compute residual
     this->value(c,u,z,tol);
@@ -492,25 +488,23 @@ class Constraint_TopOpt : public ROL::Constraint_SimOpt<Real> {
     ROL::Ptr<const vector> zp = getVector(z);
 
     // Assemble Jacobian
-    Teuchos::SerialDenseMatrix<int, Real> K;
+    ROL::LA::Matrix<Real> K;
     FEM_->build_jacobian(K,*zp);
     // Solve
-    Teuchos::SerialDenseVector<int, Real> U(K.numCols());
-    Teuchos::SerialDenseVector<int, Real> F(vp->size());
+    ROL::LA::Vector<Real> F(vp->size());
     for (uint i=0; i<vp->size(); i++) {
       F(i) = (*vp)[i];
     }
-    Teuchos::SerialDenseSolver<int, Real> solver;
-    solver.setMatrix(Teuchos::rcpFromRef(K));
-    solver.setVectors(Teuchos::rcpFromRef(U),Teuchos::rcpFromRef(F));
-    solver.factorWithEquilibration(true);
-    solver.factor();
-    solver.solve();
-    FEM_->set_boundary_conditions(U);
+    ROL::Ptr<ROL::LAPACK<int, Real>> lapack = ROL::makePtr<ROL::LAPACK<int,Real>>();
+    int info;
+    int n = K.numRows();
+    lapack->POTRF('U',n,K.values(),n,&info);
+    lapack->POTRS('U',n,1,K.values(),n,F.values(),n,&info);
+    FEM_->set_boundary_conditions(F);
     // Retrieve solution
-    ijvp->resize(U.length(),0.0);
-    for (uint i=0; i<static_cast<uint>(U.length()); i++) {
-      (*ijvp)[i] = U(i);
+    ijvp->resize(F.length(),0.0);
+    for (uint i=0; i<static_cast<uint>(F.length()); i++) {
+      (*ijvp)[i] = F(i);
     }
   }
 
@@ -553,25 +547,23 @@ class Constraint_TopOpt : public ROL::Constraint_SimOpt<Real> {
      ROL::Ptr<const vector> zp = getVector(z);
 
     // Assemble Jacobian
-    Teuchos::SerialDenseMatrix<int, Real> K;
+    ROL::LA::Matrix<Real> K;
     FEM_->build_jacobian(K,*zp);
     // Solve
-    Teuchos::SerialDenseVector<int, Real> U(K.numCols());
-    Teuchos::SerialDenseVector<int, Real> F(vp->size());
+    ROL::LA::Vector<Real> F(vp->size());
     for (uint i=0; i<vp->size(); i++) {
       F(i) = (*vp)[i];
     }
-    Teuchos::SerialDenseSolver<int, Real> solver;
-    solver.setMatrix(Teuchos::rcpFromRef(K));
-    solver.setVectors(Teuchos::rcpFromRef(U), Teuchos::rcpFromRef(F));
-    solver.factorWithEquilibration(true);
-    solver.factor();
-    solver.solve();
-    FEM_->set_boundary_conditions(U);
+    ROL::Ptr<ROL::LAPACK<int, Real>> lapack = ROL::makePtr<ROL::LAPACK<int,Real>>();
+    int info;
+    int n = K.numRows();
+    lapack->POTRF('U',n,K.values(),n,&info);
+    lapack->POTRS('U',n,1,K.values(),n,F.values(),n,&info);
+    FEM_->set_boundary_conditions(F);
     // Retrieve solution
-    iajvp->resize(U.length(),0.0);
-    for (int i=0; i<U.length(); i++) {
-      (*iajvp)[i] = U(i);
+    iajvp->resize(F.length(),0.0);
+    for (int i=0; i<F.length(); i++) {
+      (*iajvp)[i] = F(i);
     }
   }
 
@@ -872,7 +864,7 @@ typedef double RealT;
 int main(int argc, char *argv[]) {
 
   typedef typename std::vector<RealT>::size_type luint;
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/example/wave-calibration/CMakeLists.txt b/packages/rol/example/wave-calibration/CMakeLists.txt
index f5b6f4068548..a239696142e0 100644
--- a/packages/rol/example/wave-calibration/CMakeLists.txt
+++ b/packages/rol/example/wave-calibration/CMakeLists.txt
@@ -1,6 +1,6 @@
 
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
     example_01
     SOURCES example_01.cpp
     ARGS PrintItAll
diff --git a/packages/rol/example/wave-calibration/example_01.cpp b/packages/rol/example/wave-calibration/example_01.cpp
index 1f6224f3db9b..fe072f864548 100644
--- a/packages/rol/example/wave-calibration/example_01.cpp
+++ b/packages/rol/example/wave-calibration/example_01.cpp
@@ -19,7 +19,7 @@
 #include "ROL_TrustRegionStep.hpp"
 #include "ROL_Algorithm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <fstream>
 #include <iostream>
@@ -129,7 +129,7 @@ void fft(CArray& x)
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/example/zakharov/CMakeLists.txt b/packages/rol/example/zakharov/CMakeLists.txt
index 147c5366742e..3bb95b7e23ef 100644
--- a/packages/rol/example/zakharov/CMakeLists.txt
+++ b/packages/rol/example/zakharov/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   example_01
   SOURCES example_01.cpp
   ARGS PrintItAll
@@ -8,7 +8,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   example_02
   SOURCES example_02.cpp
   ARGS PrintItAll
@@ -17,7 +17,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_COPY_FILES_TO_BINARY_DIR(
+ROL_COPY_FILES_TO_BINARY_DIR(
   ZakharovDataCopy
   SOURCE_FILES
     parameters.xml
diff --git a/packages/rol/example/zakharov/example_01.cpp b/packages/rol/example/zakharov/example_01.cpp
index 3fb0814414c7..a9c0e5ef4652 100644
--- a/packages/rol/example/zakharov/example_01.cpp
+++ b/packages/rol/example/zakharov/example_01.cpp
@@ -22,7 +22,7 @@
 #include "ROL_ParameterList.hpp"
 #include "ROL_ParameterListConverters.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 
@@ -30,13 +30,11 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  using namespace Teuchos;
-
   typedef std::vector<RealT>          vector;  
   typedef ROL::Vector<RealT>          V;      // Abstract vector
   typedef ROL::StdVector<RealT>       SV;     // Concrete vector containing std::vector data
 
-  GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   auto outStream = ROL::makeStreamPtr( std::cout, argc > 1 );
diff --git a/packages/rol/example/zakharov/example_02.cpp b/packages/rol/example/zakharov/example_02.cpp
index da0a915c1632..e60bab78c6aa 100644
--- a/packages/rol/example/zakharov/example_02.cpp
+++ b/packages/rol/example/zakharov/example_02.cpp
@@ -21,19 +21,17 @@
 #include "ROL_Zakharov.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  using namespace Teuchos;
-
   typedef std::vector<RealT>          vector;  
   typedef ROL::Vector<RealT>          V;      // Abstract vector
   typedef ROL::StdVector<RealT>       SV;     // Concrete vector containing std::vector data
 
-  GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   auto outStream = ROL::makeStreamPtr( std::cout, argc > 1 );
diff --git a/packages/rol/pyrol/CMakeLists.txt b/packages/rol/pyrol/CMakeLists.txt
index 8efacf8edb59..d3a4ab3f156c 100644
--- a/packages/rol/pyrol/CMakeLists.txt
+++ b/packages/rol/pyrol/CMakeLists.txt
@@ -27,6 +27,8 @@ TRIBITS_ADD_OPTION_AND_DEFINE(PYROL_BINDER_USE_ONE_FILE
   "Enable the use of one file by Binder."
   OFF )
 
+SET(PYROL_BINDER_NUM_FILES "100" CACHE STRING "Number of files to use when PYROL_USE_ONE_FILE=OFF.")
+
 TRIBITS_ADD_OPTION_AND_DEFINE(PYROL_BINDER_CMAKE_ERROR
   PYROL_CMAKE_ERROR
   "Stop the configuration if Binder fails."
@@ -402,6 +404,7 @@ EXECUTE_PROCESS(COMMAND
   )
 MESSAGE(STATUS "pybind11 CMake path: ${pybind11_DIR}")
 
+find_package(Python3 REQUIRED Development.Module)
 find_package(pybind11 REQUIRED)
 
 EXECUTE_PROCESS(COMMAND
diff --git a/packages/rol/pyrol/pyproject.toml b/packages/rol/pyrol/pyproject.toml
index 0138249c8c0e..ea070d73b48f 100644
--- a/packages/rol/pyrol/pyproject.toml
+++ b/packages/rol/pyrol/pyproject.toml
@@ -2,28 +2,28 @@
 requires = ["scikit-build-core>=0.3.3","pybind11 @ git+https://github.com/pybind/pybind11.git@c6c9a9e59b2b64393de0432aa6867ed27367912a"]
 build-backend = "scikit_build_core.build"
 
-
 [project]
-name = "pyrol"
-version = "0.0.1"
-description="A Python interface to the Rapid Optimization Library (ROL)"
-readme = "README.md"
+name = "rol-python"
+version = "VERSION"
+description="Python interface to the Rapid Optimization Library (ROL)"
+readme = "packages/rol/README.md"
+license = "BSD-3-Clause"
 authors = [
   { name = "Christian Glusa" },
   { name = "Kim Liegeois" },
   { name = "Aurya Javeed" },
 ]
-requires-python = ">=3.7"
+requires-python = ">=3.8"
 dependencies = ["numpy"]
 classifiers = [
   "Development Status :: 4 - Beta",
-  "License :: OSI Approved :: MIT License",
-  "Programming Language :: Python :: 3 :: Only",
-  "Programming Language :: Python :: 3.7",
   "Programming Language :: Python :: 3.8",
   "Programming Language :: Python :: 3.9",
   "Programming Language :: Python :: 3.10",
   "Programming Language :: Python :: 3.11",
+  "Programming Language :: Python :: 3.12",
+  "Programming Language :: Python :: 3.13",
+  "Programming Language :: Python :: 3.14",
 ]
 
 [tool.scikit-build]
@@ -47,4 +47,4 @@ ROL_ENABLE_PYROL = "ON"
 PYROL_ENABLE_BINDER = "OFF"
 PYROL_PIP_INSTALL = "ON"
 CMAKE_INSTALL_RPATH="$ORIGIN/lib64;$ORIGIN/lib;$ORIGIN;@loader_path/lib64;@loader_path/lib;@loader_path"
-CMAKE_INTERPROCEDURAL_OPTIMIZATION="OFF"
+CMAKE_INTERPROCEDURAL_OPTIMIZATION="OFF"
\ No newline at end of file
diff --git a/packages/rol/pyrol/python/__init__.py b/packages/rol/pyrol/python/__init__.py
index 1dbc5274c14d..f7ca6557e70d 100644
--- a/packages/rol/pyrol/python/__init__.py
+++ b/packages/rol/pyrol/python/__init__.py
@@ -18,9 +18,17 @@ def wrapper(scalarType=getDefaultScalarType()):
 
 defaultScalarType = getDefaultScalarType()
 
-supported_objects = {"Bounds", "Constraint", "LinearOperator",
-                     "LinearConstraint", "Objective", "Problem", "Solver",
-                     "Vector", "getCout", "getParametersFromXmlFile"}
+supported_objects = {"Bounds",
+                     "Constraint",
+                     "getCout", 
+                     "getParametersFromXmlFile", 
+                     "getParametersFromYamlFile",
+                     "LinearConstraint", 
+                     "LinearOperator",
+                     "Objective", 
+                     "Problem", 
+                     "Solver",
+                     "Vector"}
 
 unsupported = importlib.import_module('.unsupported', 'pyrol')
 
diff --git a/packages/rol/pyrol/src/PyROL_ETI.hpp b/packages/rol/pyrol/src/PyROL_ETI.hpp
index 857443539003..aec26cb77a94 100644
--- a/packages/rol/pyrol/src/PyROL_ETI.hpp
+++ b/packages/rol/pyrol/src/PyROL_ETI.hpp
@@ -20,15 +20,17 @@
 #include <ROL_Objective.hpp>
 #include <ROL_Objective_SimOpt.hpp>
 #include <ROL_OED_Factory.hpp>
+#include <ROL_PrimalDualRisk.hpp>
 #include <ROL_Problem.hpp>
 #include <ROL_ReducedDynamicObjective.hpp>
 #include <ROL_Reduced_Objective_SimOpt.hpp>
-#include <ROL_RiskNeutralObjective.hpp>
 #include <ROL_SampleGenerator.hpp>
 #include <ROL_SerialConstraint.hpp>
 #include <ROL_SerialObjective.hpp>
 #include <ROL_SimConstraint.hpp>
 #include <ROL_Solver.hpp>
+#include <ROL_StochasticProblem.hpp>
+#include <ROL_UserInputGenerator.hpp>
 #include <ROL_ValidateFunction.hpp>
 #include <ROL_Vector.hpp>
 #include <ROL_Vector_SimOpt.hpp>
@@ -65,8 +67,11 @@
 
 #define BINDER_ROL_STOCHASTIC(SCALAR) \
   BINDER_ETI_ABSTRACT(MonteCarloGenerator<SCALAR>) \
+  BINDER_ETI_ABSTRACT(PrimalDualRisk<SCALAR>) \
   BINDER_ETI_ABSTRACT(RiskNeutralObjective<SCALAR>) \
-  BINDER_ETI_ABSTRACT(SampleGenerator<SCALAR>)
+  BINDER_ETI_ABSTRACT(SampleGenerator<SCALAR>) \
+  BINDER_ETI_ABSTRACT(StochasticProblem<SCALAR>) \
+  BINDER_ETI_ABSTRACT(UserInputGenerator<SCALAR>)
 
 #define BINDER_ROL_OED(SCALAR) \
   BINDER_ETI_ABSTRACT(Factory<SCALAR>)
@@ -86,7 +91,6 @@ namespace OED {
   BINDER_ROL_OED(double)
 }
 
-
 }
 
 #endif // PYROL_ETI
diff --git a/packages/rol/rol.png b/packages/rol/rol.png
index e30489faaec3..5b8232a2642f 100644
Binary files a/packages/rol/rol.png and b/packages/rol/rol.png differ
diff --git a/packages/rol/src/CMakeLists.txt b/packages/rol/src/CMakeLists.txt
index a32dcd9528c7..2424bc99ba85 100644
--- a/packages/rol/src/CMakeLists.txt
+++ b/packages/rol/src/CMakeLists.txt
@@ -1,5 +1,6 @@
 
 INCLUDE(TribitsLibraryMacros)
+INCLUDE(ROLUtils)
 
 #
 # A) Package-specific configuration options
@@ -51,6 +52,9 @@ GET_PROPERTY( LAPACK_DIR    GLOBAL PROPERTY LAPACK_DIR  )
 GET_PROPERTY( BLAS_STRING GLOBAL PROPERTY BLAS_IMPL )
 GET_PROPERTY( BLAS_DIR    GLOBAL PROPERTY BLAS_DIR  )
 
+GET_PROPERTY( MPI_STRING GLOBAL PROPERTY MPI_IMPL )
+GET_PROPERTY( MPI_DIR    GLOBAL PROPERTY MPI_DIR  )
+
 MESSAGE( "-- ROL has been configured to use the build options:"              )
 MESSAGE( "--   ROL::Ptr           is implemented by ${PTR_STRING}"           )
 MESSAGE( "--   ROL::ParameterList is implemented by ${PARAMETERLIST_STRING}" )
@@ -58,6 +62,7 @@ MESSAGE( "--   ROL::stacktrace    is implemented by ${STACKTRACE_STRING}"    )
 MESSAGE( "--   ROL::LinearAlgebra is implemented by ${LA_STRING}"            )
 MESSAGE( "--   ROL::LAPACK        is implemented by ${LAPACK_STRING}"        )
 MESSAGE( "--   ROL::BLAS          is implemented by ${BLAS_STRING}"          )
+MESSAGE( "--   ROL::MPI           is implemented by ${MPI_STRING}"           )
 
 TRIBITS_INCLUDE_DIRECTORIES( ${PTR_DIR} )
 APPEND_GLOB( HEADERS ${PTR_DIR}/*.hpp )
@@ -77,6 +82,9 @@ APPEND_GLOB( HEADERS ${LAPACK_DIR}/*.hpp )
 TRIBITS_INCLUDE_DIRECTORIES( ${BLAS_DIR} )
 APPEND_GLOB( HEADERS ${BLAS_DIR}/*.hpp )
 
+ROL_INCLUDE_DIRECTORIES( ${MPI_DIR} )
+APPEND_GLOB( HEADERS ${MPI_DIR}/*.hpp )
+
 #
 # vector headers
 #
diff --git a/packages/rol/src/algorithm/TypeP/ROL_TypeP_SpectralGradientAlgorithm_Def.hpp b/packages/rol/src/algorithm/TypeP/ROL_TypeP_SpectralGradientAlgorithm_Def.hpp
index 59e7662748d1..8ff4b8e19154 100644
--- a/packages/rol/src/algorithm/TypeP/ROL_TypeP_SpectralGradientAlgorithm_Def.hpp
+++ b/packages/rol/src/algorithm/TypeP/ROL_TypeP_SpectralGradientAlgorithm_Def.hpp
@@ -10,6 +10,8 @@
 #ifndef ROL_TYPEP_SPECTRALGRADIENTALGORITHM_DEF_HPP
 #define ROL_TYPEP_SPECTRALGRADIENTALGORITHM_DEF_HPP
 
+#include <deque>
+
 namespace ROL {
 namespace TypeP {
 
diff --git a/packages/rol/src/compatibility/eigen/ROL_LinearAlgebra.hpp b/packages/rol/src/compatibility/eigen/ROL_LinearAlgebra.hpp
index a72d658ef590..6f65749acb1f 100644
--- a/packages/rol/src/compatibility/eigen/ROL_LinearAlgebra.hpp
+++ b/packages/rol/src/compatibility/eigen/ROL_LinearAlgebra.hpp
@@ -42,81 +42,253 @@ using EMatrix = Eigen::Matrix<Real,Eigen::Dynamic,Eigen::Dynamic>;
 template<typename Real>
 class Vector{
   private:
-    EVector<Real> v_;
+    EVector<Real> v_;  // Owned data
+    Eigen::Map<EVector<Real>> map_;  // View of external data
+    mutable bool using_map_;  // Flag to track which member to use
+    
   public:
-    Vector(int size) : v_(size) {}
-    Vector() : v_() {}
-    Real* values(){ return const_cast<Real*>(v_.data()); }
-    void resize(int n) { v_.resize(n); }
-    void size(int n) { v_.resize(n); v_.setZero();}
+    Vector(int size) : v_(size), map_(v_.data(), size), using_map_(false) {}
+    Vector() : v_(), map_(nullptr, 0), using_map_(false) {}
+    
+    Vector(const Vector& other) : v_(), map_(nullptr, 0), using_map_(false) {
+      if (other.using_map_) {
+        // Copy data from the map into owned storage
+        v_ = other.map_;
+        map_ = Eigen::Map<EVector<Real>>(v_.data(), v_.size());
+      } else {
+        v_ = other.v_;  
+        map_ = Eigen::Map<EVector<Real>>(v_.data(), v_.size());
+      }
+    }
+    
+    Vector(DataAccess access, Real* data, int size) : v_(), map_(nullptr, 0), using_map_(true) {
+      if (access == View) {
+        // Create map to external data
+        map_ = Eigen::Map<EVector<Real>>(data, size);
+      } else {
+        // Copy data into owned storage
+        v_.resize(size);
+        using_map_ = false;
+        map_ = Eigen::Map<EVector<Real>>(v_.data(), size);
+        std::copy(data, data + size, v_.data());
+      }
+    }
+
+    Real* values() { 
+      return using_map_ ? const_cast<Real*>(map_.data()) : const_cast<Real*>(v_.data()); 
+    }
+    
+    void resize(int n) { 
+      if (!using_map_) {
+        v_.resize(n); 
+        map_ = Eigen::Map<EVector<Real>>(v_.data(), n);
+      }
+      // Cannot resize a mapped vector
+    }
+    
+    void size(int n) { 
+      if (!using_map_) {
+        v_.resize(n); 
+        v_.setZero();
+        map_ = Eigen::Map<EVector<Real>>(v_.data(), n);
+      }
+    }
+    
     Real& operator()(int i) {
-      return v_(i);
+      return using_map_ ? map_(i) : v_(i);
+    }
+    
+    Real& operator[](int i) { 
+      return using_map_ ? map_[i] : v_[i];
     }
-    Real& operator[](int i) { return v_[i];}
-    Real dot(Vector<Real> x) {
-      return v_.dot(x.v_);
+    
+    Real dot(const Vector<Real>& x) const {
+      auto& this_ref = using_map_ ? map_ : v_;
+      auto& x_ref = x.using_map_ ? x.map_ : x.v_;
+      return this_ref.dot(x_ref);
+    }
+    
+    void operator -=(const Vector<Real>& x) { 
+//      auto& this_ref = using_map_ ? map_ : v_;
+//      const auto& x_ref = x.using_map_ ? x.map_ : x.v_;
+//      this_ref -= x_ref;
+      map_ -= x.map_;
+    }
+    
+    void operator +=(const Vector<Real>& x) { 
+//      auto& this_ref = using_map_ ? map_ : v_;
+//      const auto& x_ref = x.using_map_ ? x.map_ : x.v_;
+//      this_ref += x_ref;
+      map_ += x.map_;
+    }
+    
+    void scale(Real alpha) { 
+      if (using_map_) {
+        map_ *= alpha;
+      } else {
+        v_ *= alpha;
+      }
+    }
+    
+    int numRows() const { 
+      return using_map_ ? map_.size() : v_.size(); 
+    }
+    
+    int stride() const { 
+      return numRows(); // For LAPACK compatibility - return size for vectors
     }
-    void operator -=(Vector<Real> x) { v_-= x.v_;}
-    void operator +=(Vector<Real> x) { v_+= x.v_;}
-    void scale(Real alpha) { v_ *= alpha; }
-    int numRows() { return v_.size(); }
-    int stride() { return v_.outerStride(); }
 };
 
 template<typename Real>
 class Matrix{
   private:
-    EMatrix<Real> M_;
+    EMatrix<Real> M_;  // Owned data
+    Eigen::Map<EMatrix<Real>> map_;  // View of external data
+    bool using_map_;  // Flag to track which member to use
+    
   public:
-    Matrix() : M_() {}
-    Matrix(int rows, int columns) : M_(rows, columns) {}
+    Matrix() : M_(), map_(nullptr, 0, 0), using_map_(false) {}
+    
+    Matrix(const Matrix& other) : M_(), map_(nullptr, 0, 0), using_map_(false) {
+      if (other.using_map_) {
+        // Copy data from the map into owned storage
+        M_ = other.map_;
+        map_ = Eigen::Map<EMatrix<Real>>(M_.data(), M_.rows(), M_.cols());
+      } else {
+        M_ = other.M_;  
+        map_ = Eigen::Map<EMatrix<Real>>(M_.data(), M_.rows(), M_.cols());
+      }
+    }
+    
+    Matrix(int rows, int columns) : M_(rows, columns), map_(M_.data(), rows, columns), using_map_(false) {
+      // Create owned matrix and map pointing to it
+    }
+    
+    // New constructor to support DataAccess with external data pointer
+    Matrix(DataAccess access, Real* data, int rows, int cols, int stride = 0) 
+      : M_(), map_(nullptr, 0, 0), using_map_(true) {
+      if (stride == 0) stride = rows;  // Default to column-major
+      
+      if (access == View) {
+        // Create map to external data
+        map_ = Eigen::Map<EMatrix<Real>>(data, rows, cols);
+      } else { // Copy
+        // Create owned storage and copy data
+        M_.resize(rows, cols);
+        using_map_ = false;
+        map_ = Eigen::Map<EMatrix<Real>>(M_.data(), rows, cols);
+        
+        // Copy data respecting stride (assume column-major external data)
+        for (int j = 0; j < cols; j++) {
+          for (int i = 0; i < rows; i++) {
+            M_(i, j) = data[i + j * stride];
+          }
+        }
+      }
+    }
+    
+    // Existing block constructor - modified to work with dual member approach
     Matrix(DataAccess access, Matrix<Real> A, int rows, int cols, int rowstart = 0, int colstart=0)
-    {
-      if(access == Copy)
-        M_ = A.M_.block(rowstart, colstart, rows, cols).eval();
-      else
-        M_ = A.M_.block(rowstart, colstart, rows, cols); // Does this DTRT?
-
+      : M_(), map_(nullptr, 0, 0), using_map_(false) {
+      if(access == Copy) {
+        if (A.using_map_) {
+          M_ = A.map_.block(rowstart, colstart, rows, cols).eval();
+        } else {
+          M_ = A.M_.block(rowstart, colstart, rows, cols).eval();
+        }
+        map_ = Eigen::Map<EMatrix<Real>>(M_.data(), M_.rows(), M_.cols());
+      } else {
+        // View case - this is tricky with Map, we'll create a copy for now
+        // A proper implementation would need nested Map support
+        if (A.using_map_) {
+          M_ = A.map_.block(rowstart, colstart, rows, cols).eval();
+        } else {
+          M_ = A.M_.block(rowstart, colstart, rows, cols).eval();
+        }
+        map_ = Eigen::Map<EMatrix<Real>>(M_.data(), M_.rows(), M_.cols());
+      }
     }
-    Real* values(){ return const_cast<Real*>(M_.data()); }
-    int stride() { return M_.outerStride(); }
-    void reshape(int m, int n) { M_.resize(m, n); }
-    Real normOne() { return M_.template lpNorm<1>(); }
-    Eigen::PartialPivLU<EMatrix<Real>> partialPivLu(bool inplace)
-    {
-      if(inplace)
-        return Eigen::PartialPivLU<Eigen::Ref<EMatrix<Real>>>(M_);
-      else
-        return M_.partialPivLu();
+    
+    Real* values() { 
+      return using_map_ ? const_cast<Real*>(map_.data()) : const_cast<Real*>(M_.data()); 
+    }
+    
+    int stride() { 
+      // Return leading dimension for LAPACK compatibility
+      return using_map_ ? map_.rows() : M_.rows();
+    }
+    void reshape(int m, int n) { 
+      if (!using_map_) {
+        M_.resize(m, n); 
+        map_ = Eigen::Map<EMatrix<Real>>(M_.data(), m, n);
+      }
+      // Cannot reshape a mapped matrix
+    }
+    
+    Real normOne() { 
+      return using_map_ ? map_.template lpNorm<1>() : M_.template lpNorm<1>(); 
     }
+    
+    Eigen::PartialPivLU<EMatrix<Real>> partialPivLu(bool inplace) {
+      if (using_map_) {
+        if(inplace)
+          return Eigen::PartialPivLU<Eigen::Ref<Eigen::Map<EMatrix<Real>>>>(map_);
+        else
+          return map_.partialPivLu();
+      } else {
+        if(inplace)
+          return Eigen::PartialPivLU<Eigen::Ref<EMatrix<Real>>>(M_);
+        else
+          return M_.partialPivLu();
+      }
+    }
+    
     Real& operator()(int i, int j) {
-      return M_(i, j);
+      return using_map_ ? map_(i, j) : M_(i, j);
     }
 
-    void multiply	(ETransp transa, ETransp transb, Real alpha, const Matrix&	A,
-                   const Matrix& B, Real 	beta) {
+    void multiply(ETransp transa, ETransp transb, Real alpha, const Matrix& A, const Matrix& B, Real beta) {
+      // Get the appropriate matrix references
+      auto& A_ref = A.using_map_ ? A.map_ : A.M_;
+      auto& B_ref = B.using_map_ ? B.map_ : B.M_;
+//      auto& this_ref = using_map_ ? map_ : M_;
 
       EMatrix<Real> AA;
       if(transa == NO_TRANS)
-        AA = A.M_;
+        AA = A_ref;
       else if(transa == TRANS)
-        AA = A.M_.transpose();
+        AA = A_ref.transpose();
       else
-        AA = A.M_.conjugate();
+        AA = A_ref.conjugate();
+        
       EMatrix<Real> BB;
-      if(transa == NO_TRANS)
-        BB = B.M_;
-      else if(transa == TRANS)
-        BB = B.M_.transpose();
+      if(transb == NO_TRANS)
+        BB = B_ref;
+      else if(transb == TRANS)
+        BB = B_ref.transpose();
       else
-        BB = B.M_.conjugate();
-      if(beta != Real(1))
-        M_ *= beta;
-      M_.noalias() += alpha * AA * BB;
+        BB = B_ref.conjugate();
+        
+      if(beta != Real(1)) map_ *= beta;
+      map_.noalias() += alpha * AA * BB;
     }
 
-    int numRows() { return M_.rows(); }
-    int numCols() { return M_.cols(); }
+    int numRows() const { 
+      return using_map_ ? map_.rows() : M_.rows(); 
+    }
+    
+    int numCols() const { 
+      return using_map_ ? map_.cols() : M_.cols(); 
+    }
+    
+    // Set all elements to scalar value
+    void putScalar(Real value) {
+      if (using_map_) {
+        map_.setConstant(value);
+      } else {
+        M_.setConstant(value);
+      }
+    }
 
 };
 
diff --git a/packages/rol/src/compatibility/simple/lapack/ROL_LAPACK.hpp b/packages/rol/src/compatibility/simple/lapack/ROL_LAPACK.hpp
index a9e47e63c2f2..fb9352ad90c6 100644
--- a/packages/rol/src/compatibility/simple/lapack/ROL_LAPACK.hpp
+++ b/packages/rol/src/compatibility/simple/lapack/ROL_LAPACK.hpp
@@ -67,6 +67,11 @@ functions that include the macro:
 #define DGEEV_F77   F77_BLAS_MANGLE(dgeev,  DGEEV )
 #define DGELS_F77   F77_BLAS_MANGLE(dgels,  DGELS )
 #define DGELSS_F77  F77_BLAS_MANGLE(dgelss, DGELSS)
+#define DGESVD_F77  F77_BLAS_MANGLE(dgesvd, DGESVD)
+#define DGGEV_F77   F77_BLAS_MANGLE(dggev,  DGGEV)
+#define DPOTRF_F77  F77_BLAS_MANGLE(dpotrf, DPOTRF)
+#define DPOTRS_F77  F77_BLAS_MANGLE(dpotrs, DPOTRS)
+#define DPORFS_F77  F77_BLAS_MANGLE(dporfs, DPORFS)
 #define DLATRS_F77  F77_BLAS_MANGLE(dlatrs, DLATRS)
 #define DGTTRS_F77  F77_BLAS_MANGLE(dgttrs, DGTTRS)
 #define DGTTRF_F77  F77_BLAS_MANGLE(dgttrf, DGTTRF)
@@ -79,11 +84,16 @@ functions that include the macro:
 #define DPTTRS_F77  F77_BLAS_MANGLE(dpttrs, DPTTRS)
 #define DPTTRF_F77  F77_BLAS_MANGLE(dpttrf, DPTTRF)
 
-namespace ROL { 
+namespace ROL {
   extern "C" {
     void PREFIX DGEEV_F77(ROL_fcd, ROL_fcd, const int* n, double* a, const int* lda, double* wr, double* wi, double* vl, const int* ldvl, double* vr, const int* ldvr, double* work, const int* lwork, int* info);
     void PREFIX DGELS_F77(ROL_fcd ch, const int* m, const int* n, const int* nrhs, double* a, const int* lda, double* b, const int* ldb, double* work, const int* lwork, int* info);
     void PREFIX DGELSS_F77(const int* m, const int* n, const int* nrhs, double* a, const int* lda, double* b, const int* ldb, double* s, const double* rcond, int* rank, double* work, const int* lwork, int* info);
+    void PREFIX DGESVD_F77(ROL_fcd, ROL_fcd, const int* m, const int* n, double* a, const int* lda, double* s, double* u, const int* ldu, double* v, const int* ldv, double* work, const int* lwork, int* info);
+    void PREFIX DGGEV_F77(ROL_fcd, ROL_fcd, const int *n, double *A, const int *lda, double *B, const int *ldb, double *alphar, double *alphai, double *beta, double *vl, const int *ldvl, double *vr, const int *ldvr, double *work, const int *lwork, int *info);
+    void PREFIX DPOTRF_F77(ROL_fcd, const int* n, double* a, const int* lda, int* info);
+    void PREFIX DPOTRS_F77(ROL_fcd, const int* n, const int* nrhs, const double* a, const int* lda, double*x , const int* ldx, int* info);
+    void PREFIX DPORFS_F77(ROL_fcd, const int* n, const int* nrhs, const double* a, const int* lda, const double* af, const int* ldaf, const double* b, const int* ldb, double* x, const int* ldx, double* ferr, double* berr, double* work, int* iwork, int* info);
     void PREFIX DLATRS_F77(ROL_fcd UPLO, ROL_fcd TRANS, ROL_fcd DIAG, ROL_fcd NORMIN, const int* N, double* A, const int* LDA, double* X, double* SCALE, double* CNORM, int* INFO);
     void PREFIX DGTTRS_F77(ROL_fcd, const int* n, const int* nrhs, const double* dl, const double* d, const double* du, const double* du2, const int* ipiv, double* x , const int* ldx, int* info);
     void PREFIX DGTTRF_F77(const int* n, double* dl, double* d, double* du, double* du2, int* ipiv, int* info);
@@ -99,7 +109,7 @@ namespace ROL {
   }
 
   template<typename Index, typename Real>
-  struct LAPACK { 
+  struct LAPACK {
     /// \brief Computes for an \c n by \c n real nonsymmetric matrix \c A, the eigenvalues and, optionally, the left and/or right eigenvectors.
     ///
     /// Real and imaginary parts of the eigenvalues are returned in
@@ -112,6 +122,21 @@ namespace ROL {
 
     void GELSS(const Index& m, const Index& n, const Index& nrhs, Real* A, const Index& lda, Real* B, const Index& ldb, Real* S, const Real& rcond, Index* rank, Real* WORK, const Index& lwork, Index* info) const;
 
+    void GESVD(const char& JOBU, const char& JOBVT, const Index& m, const Index& n, Real* A, const Index& lda, Real* S, Real* U, const Index& ldu, Real* V, const Index& ldv, Real* WORK, const Index& lwork, Real* /* RWORK */, Index* info) const;
+
+    /*! Computes for a pair of \c n by \c n nonsymmetric matrices (\c A,\c B) the generalized eigenvalues, and optionally, the left and/or right generalized eigenvectors.
+       \note (This is the function is only defined for \c Real = \c float or \c double.)
+    */
+    void GGEV(const char& JOBVL, const char& JOBVR, const Index& n, Real* A, const Index& lda, Real* B, const Index& ldb, Real *ALPHAR, Real *ALPHAI, Real* BETA, Real* VL, const Index& ldvl, Real* VR, const Index& ldvr, Real* WORK, const Index& lwork, Index* info) const;
+
+    //! Computes Cholesky factorization of a real symmetric positive definite matrix \c A.
+    void POTRF(const char& UPLO, const Index& n, Real* A, const Index& lda, Index* info) const;
+
+    //! Solves a system of linear equations \c A*X=B, where \c A is a symmetric positive definite matrix factored by POTRF and the \c nrhs solutions are returned in \c B.
+    void POTRS(const char& UPLO, const Index& n, const Index& nrhs, const Real* A, const Index& lda, Real* B, const Index& ldb, Index* info) const;
+
+    //! Improves the computed solution to a system of linear equations when the coefficient matrix is symmetric positive definite, and provides error bounds and backward error estimates for the solution.
+    void PORFS(const char& UPLO, const Index& n, const Index& nrhs, const Real* A, const Index& lda, const Real* AF, const Index& ldaf, const Real* B, const Index& ldb, Real* X, const Index& ldx, Real* FERR, Real* BERR, Real* WORK, Index* IWORK, Index* info) const;
 
     /// \brief Robustly solve a possibly singular triangular linear system.
     ///
@@ -165,7 +190,7 @@ namespace ROL {
   };
 
   template<>
-  struct LAPACK<int, double> { 
+  struct LAPACK<int, double> {
 
     void LATRS (const char& UPLO, const char& TRANS, const char& DIAG, const char& NORMIN,
                 const int& N, double* A, const int& LDA, double* X, double* SCALE, double* CNORM,
@@ -176,7 +201,7 @@ namespace ROL {
 
     void GTTRS(const char& TRANS, const int& n, const int& nrhs, const double* dl, const double* d,
                const double* du, const double* du2, const int* IPIV, double* B, const int& ldb, int* info) const {
-      DGTTRS_F77(CHAR_MACRO(TRANS), &n, &nrhs, dl, d, du, du2, IPIV, B, &ldb, info); 
+      DGTTRS_F77(CHAR_MACRO(TRANS), &n, &nrhs, dl, d, du, du2, IPIV, B, &ldb, info);
     }
 
     void GTTRF(const int& n, double* dl, double* d, double* du, double* du2, int* IPIV, int* info) const {
@@ -187,31 +212,31 @@ namespace ROL {
       DSTEQR_F77(CHAR_MACRO(COMPZ), &n, D, E, Z, &ldz, WORK, info);
     }
 
-    void TRTRS(const char& UPLO, const char& TRANS, const char& DIAG, const int& n, const int& nrhs, const double* A, const int& lda, double* B, const int& ldb, int* info) const { 
+    void TRTRS(const char& UPLO, const char& TRANS, const char& DIAG, const int& n, const int& nrhs, const double* A, const int& lda, double* B, const int& ldb, int* info) const {
       DTRTRS_F77(CHAR_MACRO(UPLO), CHAR_MACRO(TRANS), CHAR_MACRO(DIAG), &n, &nrhs, A, &lda, B, &ldb, info);
     }
 
     void GETRF(const int& m, const int& n, double* A, const int& lda, int* IPIV, int* info) const {
-      DGETRF_F77(&m, &n, A, &lda, IPIV, info); 
+      DGETRF_F77(&m, &n, A, &lda, IPIV, info);
     }
- 
+
     void GETRI(const int& n, double* A, const int& lda, const int* IPIV, double* WORK, const int& lwork, int* info) const {
-      DGETRI_F77(&n, A, &lda, IPIV, WORK, &lwork, info); 
+      DGETRI_F77(&n, A, &lda, IPIV, WORK, &lwork, info);
     }
 
     void GETRS(const char& TRANS, const int& n, const int& nrhs, const double* A, const int& lda, const int* IPIV, double* B, const int& ldb, int* info) const {
-      DGETRS_F77(CHAR_MACRO(TRANS), &n, &nrhs, A, &lda, IPIV, B, &ldb, info); 
+      DGETRS_F77(CHAR_MACRO(TRANS), &n, &nrhs, A, &lda, IPIV, B, &ldb, info);
     }
 
     void GEQRF( const int& m, const int& n, double* A, const int& lda, double* TAU, double* WORK, const int& lwork, int* info) const {
-      DGEQRF_F77(&m, &n, A, &lda, TAU, WORK, &lwork, info); 
+      DGEQRF_F77(&m, &n, A, &lda, TAU, WORK, &lwork, info);
     }
 
     void ORGQR(const int& m, const int& n, const int& k, double* A, const int& lda, const double* TAU, double* WORK, const int& lwork, int* info) const {
       DORGQR_F77( &m, &n, &k, A, &lda, TAU, WORK, &lwork, info);
     }
 
-    void PTTRS(const int& n, const int& nrhs, const double* d, const double* e, double* B, const int& ldb, int* info) const { 
+    void PTTRS(const int& n, const int& nrhs, const double* d, const double* e, double* B, const int& ldb, int* info) const {
       DPTTRS_F77(&n,&nrhs,d,e,B,&ldb,info);
     }
 
@@ -219,16 +244,36 @@ namespace ROL {
       DGEEV_F77(CHAR_MACRO(JOBVL), CHAR_MACRO(JOBVR), &n, A, &lda, WR, WI, VL, &ldvl, VR, &ldvr, WORK, &lwork, info);
     }
 
-    void GELS(const char& TRANS, const int& m, const int& n, const int& nrhs, double* A, const int& lda, double* B, const int& ldb, double* WORK, const int& lwork, int* info) const { 
-      DGELS_F77(CHAR_MACRO(TRANS), &m, &n, &nrhs, A, &lda, B, &ldb, WORK, &lwork, info); 
+    void GELS(const char& TRANS, const int& m, const int& n, const int& nrhs, double* A, const int& lda, double* B, const int& ldb, double* WORK, const int& lwork, int* info) const {
+      DGELS_F77(CHAR_MACRO(TRANS), &m, &n, &nrhs, A, &lda, B, &ldb, WORK, &lwork, info);
+    }
+
+    void GELSS(const int& m, const int& n, const int& nrhs, double* A, const int& lda, double* B, const int& ldb, double* S, const double& rcond, int* rank, double* WORK, const int& lwork, int* info) const {
+      DGELSS_F77(&m, &n, &nrhs, A, &lda, B, &ldb, S, &rcond, rank, WORK, &lwork, info);
+    }
+
+    void GESVD(const char& JOBU, const char& JOBVT, const int& m, const int& n, double* A, const int& lda, double* S, double* U, const int& ldu, double* V, const int& ldv, double* WORK, const int& lwork, double* /* RWORK */, int* info) const {
+      DGESVD_F77(CHAR_MACRO(JOBU), CHAR_MACRO(JOBVT), &m, &n, A, &lda, S, U, &ldu, V, &ldv, WORK, &lwork, info);
+    }
+
+    void GGEV(const char& JOBVL, const char& JOBVR, const int& n, double* A, const int& lda, double* B, const int& ldb, double* ALPHAR, double* ALPHAI, double* BETA, double* VL, const int& ldvl, double* VR, const int& ldvr, double* WORK, const int& lwork, int* info) const {
+      DGGEV_F77(CHAR_MACRO(JOBVL), CHAR_MACRO(JOBVR), &n, A, &lda, B, &ldb, ALPHAR, ALPHAI, BETA, VL, &ldvl, VR, &ldvr, WORK, &lwork, info);
+    }
+
+    void POTRF(const char& UPLO, const int& n, double* A, const int& lda, int* info) const {
+      DPOTRF_F77(CHAR_MACRO(UPLO), &n, A, &lda, info);
+    }
+
+    void POTRS(const char& UPLO, const int& n, const int& nrhs, const double* A, const int& lda, double* B, const int& ldb, int* info) const {
+      DPOTRS_F77(CHAR_MACRO(UPLO), &n, &nrhs, A, &lda, B, &ldb, info);
     }
 
-    void GELSS(const int& m, const int& n, const int& nrhs, double* A, const int& lda, double* B, const int& ldb, double* S, const double& rcond, int* rank, double* WORK, const int& lwork, int* info) const { 
-      DGELSS_F77(&m, &n, &nrhs, A, &lda, B, &ldb, S, &rcond, rank, WORK, &lwork, info); 
+    void PORFS(const char& UPLO, const int& n, const int& nrhs, const double* A, const int& lda, const double* AF, const int& ldaf, const double* B, const int& ldb, double* X, const int& ldx, double* FERR, double* BERR, double* WORK, int* IWORK, int* info) const {
+      DPORFS_F77(CHAR_MACRO(UPLO), &n, &nrhs, A, &lda, AF, &ldaf, B, &ldb, X, &ldx, FERR, BERR, WORK, IWORK, info);
     }
 
     void PTTRF(const int& n, double* d, double* e, int* info) const {
-      DPTTRF_F77(&n,d,e,info); 
+      DPTTRF_F77(&n,d,e,info);
     }
   };
 
diff --git a/packages/rol/src/compatibility/simple/mpi/ROL_GlobalMPISession.hpp b/packages/rol/src/compatibility/simple/mpi/ROL_GlobalMPISession.hpp
new file mode 100644
index 000000000000..c9d99ff007c9
--- /dev/null
+++ b/packages/rol/src/compatibility/simple/mpi/ROL_GlobalMPISession.hpp
@@ -0,0 +1,28 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+#ifndef ROL_GlobalMPISession_HPP
+#define ROL_GlobalMPISession_HPP
+
+
+namespace ROL {
+
+struct GlobalMPISession {
+  GlobalMPISession(...) {}
+  static void abort() {}
+  static void barrier() {}
+  static int getNProc() { return 1; }
+  static int getRank() { return 1; }
+  static int sum(int localVal) { return localVal; }
+};  
+
+} 
+
+#endif // ROL_GlobalMPISession_HPP
+
diff --git a/packages/rol/src/compatibility/simple/parameterlist/ROL_ParameterList.hpp b/packages/rol/src/compatibility/simple/parameterlist/ROL_ParameterList.hpp
index b7294ab5a0f0..bfc481329b99 100644
--- a/packages/rol/src/compatibility/simple/parameterlist/ROL_ParameterList.hpp
+++ b/packages/rol/src/compatibility/simple/parameterlist/ROL_ParameterList.hpp
@@ -16,19 +16,21 @@
 #include <string>
 #include <vector>
 #include <type_traits>
+#include <sstream>
+#include <iostream>
 
 #include "ROL_Stream.hpp"
 
 namespace ROL {
 
-#if __cplusplus >= 201703L 
+#if __cplusplus >= 201703L
 template<class...B>
 inline constexpr bool disjunction_v = std::disjunction<B...>::value;
 #else
 template<class...> struct disjunction : std::false_type { };
 template<class B1> struct disjunction<B1> : B1 { };
 template<class B1, class... Bn>
-struct disjunction<B1, Bn...> 
+struct disjunction<B1, Bn...>
     : std::conditional_t<bool(B1::value), B1, disjunction<Bn...>>  {};
 template<class...B>
 constexpr bool disjunction_v = disjunction<B...>::value;
@@ -45,19 +47,19 @@ std::ostream& operator << ( std::ostream& os, const std::vector<value_type>& v )
 }
 
 
-template<typename ValueType> 
+template<typename ValueType>
 class PList {
 public:
- 
+
   using key_type   = std::string;
   using value_type = ValueType;
-  
-  inline void set( key_type   key, 
+
+  inline void set( key_type   key,
                    value_type value ) {
     values_[key] = value;
   }
- 
-  inline value_type get( key_type   key, 
+
+  inline value_type get( key_type   key,
                          value_type value ) {
     if( values_.find(key) == values_.end() ) values_[key] = value;
     return values_.at(key);
@@ -69,18 +71,19 @@ class PList {
 
   virtual int get_level() const = 0;
 
-  friend std::ostream& operator << ( std::ostream& os, 
+  friend std::ostream& operator << ( std::ostream& os,
                                      const PList& plist ) {
-    for( auto it=plist.values_.begin(); 
+    for( auto it=plist.values_.begin();
               it != plist.values_.end(); ++it )
-      os << std::string(2*(plist.get_level()),' ') << 
+      os << std::string(2*(plist.get_level()),' ') <<
             it->first << " : " << it->second << std::endl;
     return os;
   }
 
-private:
   const std::map<key_type,value_type>& get_values() const { return values_; }
-  std::map<key_type,value_type> values_;  
+
+private:
+  std::map<key_type,value_type> values_;
 
 }; // class detail::PList
 
@@ -99,25 +102,25 @@ class ParameterList : public PList<ValueTypes>... {
   ParameterList( int level = 0 ) : level_(level) {}
 
   template<typename value_type>
-  inline void set( key_type   key, 
+  inline void set( key_type   key,
                    value_type value ) {
     static_assert( is_valid_type_v<value_type>, "" );
     static_cast<PList<value_type>&>(*this).set( key, value );
   }
 
-  inline void set( key_type    key, 
+  inline void set( key_type    key,
                    const char* value ) {
     set(key,key_type(value));
   }
 
-  inline std::string get( key_type    key, 
+  inline std::string get( key_type    key,
                           const char* value ) {
     return get(key,std::string(value));
   }
 
   template<typename value_type>
-  inline 
-  std::enable_if_t<is_valid_type_v<value_type>,value_type> 
+  inline
+  std::enable_if_t<is_valid_type_v<value_type>,value_type>
   get( key_type key ) const {
     //static_assert( is_valid_type_v<value_type>, "" );
     return PList<value_type>::get(key);
@@ -136,16 +139,16 @@ class ParameterList : public PList<ValueTypes>... {
 }; // class detail::ParameterList
 
 template<typename...ValueTypes>
-inline void display( std::ostream& os, 
+inline void display( std::ostream& os,
                      const ParameterList<ValueTypes...>& parlist ) {
   using expander = int[];
   (void)expander{0, (void( os << static_cast<const PList<ValueTypes>&>(parlist) ),0) ... };
 }
 
-} // namespace detail 
+} // namespace detail
 
 
-class ParameterList : public detail::ParameterList<bool, 
+class ParameterList : public detail::ParameterList<bool,
                                                    int,
                                                    double,
                                                    std::string,
@@ -156,9 +159,45 @@ class ParameterList : public detail::ParameterList<bool,
 
   using key_type = std::string;
 
+  // Iterator types for ParameterListConverters compatibility
+  class ConstIterator {
+  private:
+    std::vector<std::string> keys_;
+    std::size_t index_;
+
+  public:
+    ConstIterator(const std::vector<std::string>& keys, std::size_t index)
+      : keys_(keys), index_(index) {}
+
+    ConstIterator& operator++() { ++index_; return *this; }
+    bool operator!=(const ConstIterator& other) const { return index_ != other.index_; }
+    bool operator==(const ConstIterator& other) const { return index_ == other.index_; }
+    std::size_t getIndex() const { return index_; }
+  };
+
   ParameterList( int level = 0 ) : level_(level) {}
 
-  ParameterList& sublist( key_type key, 
+  // Constructor with name (for Teuchos compatibility) - name is ignored in simple implementation
+  ParameterList( const std::string& name, int level = 0 ) : level_(level) {}
+
+  // Deep copy constructor
+  ParameterList( const ParameterList& other ) : level_(other.level_) {
+    // Copy all parameter values from each base class
+    copyValues<bool>(other);
+    copyValues<int>(other);
+    copyValues<double>(other);
+    copyValues<std::string>(other);
+    copyValues<std::vector<int>>(other);
+    copyValues<std::vector<double>>(other);
+    copyValues<std::vector<std::string>>(other);
+
+    // Deep copy all sublists
+    for (const auto& sublist_pair : other.sublists_) {
+      sublists_[sublist_pair.first] = std::make_shared<ParameterList>(*sublist_pair.second);
+    }
+  }
+
+  ParameterList& sublist( key_type key,
                           bool     mustAlreadyExist=false );
 
 //  template<typename value_type>
@@ -177,9 +216,42 @@ class ParameterList : public detail::ParameterList<bool,
     return sublists_.find(key) != sublists_.end();
   }
 
+  // Iterator methods for ParameterListConverters compatibility
+  ConstIterator begin() const {
+    return ConstIterator(getAllKeys(), 0);
+  }
+
+  ConstIterator end() const {
+    std::vector<std::string> keys = getAllKeys();
+    return ConstIterator(keys, keys.size());
+  }
+
+  // Get parameter name from iterator
+  std::string name(const ConstIterator& it) const {
+    std::vector<std::string> keys = getAllKeys();
+    return keys[it.getIndex()];
+  }
+
+  // Type checking methods
+  template<typename T>
+  bool isType(const std::string& key) const {
+    return hasKey<T>(key);
+  }
+
+  // Check if parameter exists (any supported type)
+  bool isParameter(const std::string& key) const {
+    return hasKey<bool>(key) ||
+           hasKey<int>(key) ||
+           hasKey<double>(key) ||
+           hasKey<std::string>(key) ||
+           hasKey<std::vector<int>>(key) ||
+           hasKey<std::vector<double>>(key) ||
+           hasKey<std::vector<std::string>>(key);
+  }
+
   int get_level() const override { return level_; }
 
-  friend std::ostream& operator << ( std::ostream&  os, 
+  friend std::ostream& operator << ( std::ostream&  os,
                                      const ParameterList& parlist ) {
     detail::display( os , parlist );
     os << std::endl;
@@ -191,24 +263,134 @@ class ParameterList : public detail::ParameterList<bool,
 
 private:
 
+  // Helper method to get all parameter keys across all base classes
+  std::vector<std::string> getAllKeys() const {
+    std::vector<std::string> keys;
+    collectKeys<bool>(keys);
+    collectKeys<int>(keys);
+    collectKeys<double>(keys);
+    collectKeys<std::string>(keys);
+    collectKeys<std::vector<int>>(keys);
+    collectKeys<std::vector<double>>(keys);
+    collectKeys<std::vector<std::string>>(keys);
+    return keys;
+  }
+
+  // Helper template to collect keys for specific type
+  template<typename T>
+  void collectKeys(std::vector<std::string>& keys) const {
+    const auto& values = static_cast<const detail::PList<T>&>(*this).get_values();
+    for (const auto& pair : values) {
+      keys.push_back(pair.first);
+    }
+  }
+
+  // Helper template to check if key exists for specific type
+  template<typename T>
+  bool hasKey(const std::string& key) const {
+    const auto& values = static_cast<const detail::PList<T>&>(*this).get_values();
+    return values.find(key) != values.end();
+  }
+
+  // Helper template to copy values for specific type from another ParameterList
+  template<typename T>
+  void copyValues(const ParameterList& other) {
+    const auto& other_values = static_cast<const detail::PList<T>&>(other).get_values();
+    for (const auto& pair : other_values) {
+      static_cast<detail::PList<T>&>(*this).set(pair.first, pair.second);
+    }
+  }
+
   std::map<key_type,std::shared_ptr<ParameterList>> sublists_;
-  int level_ = 0;  
+  int level_ = 0;
 
 }; // ParameterList
 
+// Array parsing utilities similar to Teuchos::getArrayFromStringParameter
+template<typename value_type>
+std::vector<value_type> fromStringToArray(const std::string& arrayStr) {
+  std::vector<value_type> result;
+
+  // Remove braces and spaces
+  std::string cleaned = arrayStr;
+  size_t start = cleaned.find('{');
+  size_t end = cleaned.find('}');
+  if (start != std::string::npos && end != std::string::npos && end > start) {
+    cleaned = cleaned.substr(start + 1, end - start - 1);
+  }
+
+  // Parse comma-separated values
+  std::istringstream iss(cleaned);
+  std::string token;
+
+  while (std::getline(iss, token, ',')) {
+    // Trim whitespace
+    size_t first = token.find_first_not_of(" \t");
+    if (first != std::string::npos) {
+      size_t last = token.find_last_not_of(" \t");
+      token = token.substr(first, last - first + 1);
+
+      // Convert to appropriate type
+      if (!token.empty()) {
+        if constexpr (std::is_same_v<value_type, int>) {
+          result.push_back(std::stoi(token));
+        } else if constexpr (std::is_same_v<value_type, double>) {
+          result.push_back(std::stod(token));
+        } else if constexpr (std::is_same_v<value_type, std::string>) {
+          result.push_back(token);
+        }
+      }
+    }
+  }
+
+  return result;
+}
+
 template<typename value_type>
-inline std::vector<value_type>
-getArrayFromStringParameter( const ParameterList& parlist, std::string key ) {
-  return parlist.get<std::vector<value_type>>(key);
+std::vector<value_type> getArrayFromStringParameter(const ParameterList& parlist, const std::string& key) {
+  // First try to get as vector directly
+  try {
+    return parlist.get<std::vector<value_type>>(key);
+  } catch (...) {
+    // If that fails, try to get as string and parse it
+    try {
+      std::string arrayStr = parlist.get<std::string>(key);
+      return fromStringToArray<value_type>(arrayStr);
+    } catch (...) {
+      throw std::runtime_error("Parameter '" + key + "' not found or not convertible to array");
+    }
+  }
 }
 
 
 std::ostream& operator << ( std::ostream& os, const ParameterList& parlist );
 
+// Forward declaration for XML reader
+class XMLParameterListReader;
+
+/// \brief Create ParameterList from XML file using pugixml
+ROL::Ptr<ParameterList> getParametersFromXmlFile(const std::string& filename);
+
+/// \brief Write ParameterList to XML file using pugixml
+void writeParameterListToXmlFile(const ParameterList& params, const std::string& filename);
+
+
+
 } // namespace ROL
 
 #include "ROL_ParameterList.cpp"
+#include "ROL_XMLReader.hpp"
 
-#endif // ROL_PARAMETERLIST_HPP
+namespace ROL{
+// Global function for compatibility
+inline ROL::Ptr<ParameterList> getParametersFromXmlFile(const std::string& filename) {
+    return XMLReader::readParametersFromFile(filename);
+}
 
+// Global function for writing parameters to XML
+inline void writeParameterListToXmlFile(const ParameterList& params, const std::string& filename) {
+    XMLWriter::writeParametersToFile(params, filename);
+}
 
+}
+#endif // ROL_PARAMETERLIST_HPP
diff --git a/packages/rol/src/compatibility/simple/parameterlist/ROL_XMLReader.hpp b/packages/rol/src/compatibility/simple/parameterlist/ROL_XMLReader.hpp
new file mode 100644
index 000000000000..ea68fb65c679
--- /dev/null
+++ b/packages/rol/src/compatibility/simple/parameterlist/ROL_XMLReader.hpp
@@ -0,0 +1,282 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+#ifndef ROL_XMLREADER_HPP
+#define ROL_XMLREADER_HPP
+
+#include <pugixml.hpp>
+#include <string>
+#include <sstream>
+#include <stdexcept>
+#include <vector>
+#include <type_traits>
+#include "ROL_ParameterList.hpp"
+#include "ROL_Ptr.hpp"
+
+namespace ROL {
+
+class XMLReader {
+private:
+    enum class ArrayType { DOUBLE, INT, STRING, UNKNOWN };
+
+    // Helper function to detect if string looks like an array
+    static bool isArrayString(const std::string& value) {
+        size_t start = value.find('{');
+        size_t end = value.find('}');
+        return (start != std::string::npos && end != std::string::npos && end > start);
+    }
+
+    // Helper function to detect array element type
+    static ArrayType detectArrayType(const std::string& value) {
+        std::string cleaned = value;
+        size_t start = cleaned.find('{');
+        size_t end = cleaned.find('}');
+        if (start != std::string::npos && end != std::string::npos && end > start) {
+            cleaned = cleaned.substr(start + 1, end - start - 1);
+        }
+
+        // Check first non-whitespace element to determine type
+        std::istringstream iss(cleaned);
+        std::string token;
+
+        if (std::getline(iss, token, ',')) {
+            // Trim whitespace
+            size_t first = token.find_first_not_of(" \t");
+            if (first != std::string::npos) {
+                size_t last = token.find_last_not_of(" \t");
+                token = token.substr(first, last - first + 1);
+
+                // Try to parse as double, then int
+                try {
+                    std::stod(token);
+                    // Check if it contains decimal point
+                    if (token.find('.') != std::string::npos) {
+                        return ArrayType::DOUBLE;
+                    } else {
+                        return ArrayType::INT;
+                    }
+                } catch (...) {
+                    return ArrayType::STRING;
+                }
+            }
+        }
+
+        return ArrayType::UNKNOWN;
+    }
+
+public:
+    static ROL::Ptr<ParameterList> readParametersFromFile(const std::string& filename) {
+        pugi::xml_document doc;
+        pugi::xml_parse_result result = doc.load_file(filename.c_str());
+
+        if (!result) {
+            throw std::runtime_error("XMLReader: Failed to load XML file: " + filename +
+                                   " Error: " + result.description());
+        }
+
+        ROL::Ptr<ParameterList> params = ROL::makePtr<ParameterList>();
+
+        // Find the root ParameterList element
+        pugi::xml_node root = doc.child("ParameterList");
+        if (!root) {
+            throw std::runtime_error("XMLReader: No ParameterList element found in XML file");
+        }
+
+        parseParameterList(root, *params);
+        return params;
+    }
+
+private:
+    static void parseParameterList(const pugi::xml_node& node, ParameterList& params) {
+        for (pugi::xml_node child : node.children()) {
+            std::string name = child.name();
+
+            if (name == "Parameter") {
+                parseParameter(child, params);
+            } else if (name == "ParameterList") {
+                parseSublist(child, params);
+            }
+        }
+    }
+
+    static void parseParameter(const pugi::xml_node& node, ParameterList& params) {
+        std::string name = node.attribute("name").as_string();
+        std::string type = node.attribute("type").as_string();
+        std::string value = node.attribute("value").as_string();
+
+        if (name.empty()) {
+            throw std::runtime_error("XMLReader: Parameter element missing 'name' attribute");
+        }
+
+        if (type.empty()) {
+            throw std::runtime_error("XMLReader: Parameter element missing 'type' attribute");
+        }
+
+        // Parse value based on type
+        if (type == "double") {
+            params.set(name, std::stod(value));
+        } else if (type == "int") {
+            params.set(name, std::stoi(value));
+        } else if (type == "bool") {
+            bool bval = (value == "true" || value == "1");
+            params.set(name, bval);
+        } else if (type == "string") {
+            // Check if this is an array-like string and try to parse it as an array
+            if (isArrayString(value)) {
+                if (detectArrayType(value) == ArrayType::DOUBLE) {
+                    try {
+                        std::vector<double> arr = fromStringToArray<double>(value);
+                        params.set(name, arr);
+                        return;
+                    } catch (...) {
+                        // Fall back to string if parsing fails
+                    }
+                } else if (detectArrayType(value) == ArrayType::INT) {
+                    try {
+                        std::vector<int> arr = fromStringToArray<int>(value);
+                        params.set(name, arr);
+                        return;
+                    } catch (...) {
+                        // Fall back to string if parsing fails
+                    }
+                }
+            }
+            params.set(name, value);
+        } else {
+            // Default to string for unknown types, but check for arrays
+            if (isArrayString(value)) {
+                if (detectArrayType(value) == ArrayType::DOUBLE) {
+                    try {
+                        std::vector<double> arr = fromStringToArray<double>(value);
+                        params.set(name, arr);
+                        return;
+                    } catch (...) {
+                        // Fall back to string if parsing fails
+                    }
+                } else if (detectArrayType(value) == ArrayType::INT) {
+                    try {
+                        std::vector<int> arr = fromStringToArray<int>(value);
+                        params.set(name, arr);
+                        return;
+                    } catch (...) {
+                        // Fall back to string if parsing fails
+                    }
+                }
+            }
+            params.set(name, value);
+        }
+    }
+
+    static void parseSublist(const pugi::xml_node& node, ParameterList& params) {
+        std::string name = node.attribute("name").as_string();
+        if (name.empty()) {
+            throw std::runtime_error("XMLReader: ParameterList element missing 'name' attribute");
+        }
+
+        ParameterList& sublist = params.sublist(name);
+        parseParameterList(node, sublist);
+    }
+};
+
+class XMLWriter {
+public:
+    static void writeParametersToFile(const ParameterList& params, const std::string& filename) {
+        pugi::xml_document doc;
+
+        // Create root ParameterList element
+        pugi::xml_node root = doc.append_child("ParameterList");
+        root.append_attribute("name") = "Main";
+
+        // Write parameters to XML
+        writeParameterList(params, root);
+
+        // Save to file
+        if (!doc.save_file(filename.c_str(), "  ")) {
+            throw std::runtime_error("XMLWriter: Failed to write XML file: " + filename);
+        }
+    }
+
+private:
+    static void writeParameterList(const ParameterList& params, pugi::xml_node& node) {
+        // Write all parameters using the iterator interface
+        for (auto it = params.begin(); it != params.end(); ++it) {
+            std::string key = params.name(it);
+            writeParameter(params, key, node);
+        }
+
+        // Note: Sublists would need additional implementation if supported
+        // For now, this handles the basic parameter types
+    }
+
+    static void writeParameter(const ParameterList& params, const std::string& key, pugi::xml_node& node) {
+        pugi::xml_node param_node = node.append_child("Parameter");
+        param_node.append_attribute("name") = key.c_str();
+
+        // Check type and write accordingly
+        if (params.isType<bool>(key)) {
+            param_node.append_attribute("type") = "bool";
+            bool value = params.get<bool>(key);
+            param_node.append_attribute("value") = value ? "true" : "false";
+        }
+        else if (params.isType<int>(key)) {
+            param_node.append_attribute("type") = "int";
+            int value = params.get<int>(key);
+            param_node.append_attribute("value") = std::to_string(value).c_str();
+        }
+        else if (params.isType<double>(key)) {
+            param_node.append_attribute("type") = "double";
+            double value = params.get<double>(key);
+            param_node.append_attribute("value") = std::to_string(value).c_str();
+        }
+        else if (params.isType<std::string>(key)) {
+            param_node.append_attribute("type") = "string";
+            std::string value = params.get<std::string>(key);
+            param_node.append_attribute("value") = value.c_str();
+        }
+        else if (params.isType<std::vector<double>>(key)) {
+            param_node.append_attribute("type") = "string";
+            std::vector<double> value = params.get<std::vector<double>>(key);
+            param_node.append_attribute("value") = vectorToString(value).c_str();
+        }
+        else if (params.isType<std::vector<int>>(key)) {
+            param_node.append_attribute("type") = "string";
+            std::vector<int> value = params.get<std::vector<int>>(key);
+            param_node.append_attribute("value") = vectorToString(value).c_str();
+        }
+        else if (params.isType<std::vector<std::string>>(key)) {
+            param_node.append_attribute("type") = "string";
+            std::vector<std::string> value = params.get<std::vector<std::string>>(key);
+            param_node.append_attribute("value") = vectorToString(value).c_str();
+        }
+        else {
+            // Default case - try as string
+            param_node.append_attribute("type") = "string";
+            param_node.append_attribute("value") = "unknown_type";
+        }
+    }
+
+    // Helper function to convert vector to string representation
+    template<typename T>
+    static std::string vectorToString(const std::vector<T>& vec) {
+        if (vec.empty()) return "{ }";
+
+        std::ostringstream oss;
+        oss << "{ ";
+        for (size_t i = 0; i < vec.size(); ++i) {
+            if (i > 0) oss << ", ";
+            oss << vec[i];
+        }
+        oss << " }";
+        return oss.str();
+    }
+};
+
+} // namespace ROL
+
+#endif // ROL_XMLREADER_HPP
diff --git a/packages/rol/src/compatibility/teuchos-lite/TeuchosCore_config.h b/packages/rol/src/compatibility/teuchos-lite/TeuchosCore_config.h
new file mode 100644
index 000000000000..e58b68ea6515
--- /dev/null
+++ b/packages/rol/src/compatibility/teuchos-lite/TeuchosCore_config.h
@@ -0,0 +1,24 @@
+#ifndef TEUCHOSCORE_CONFIG_H_
+#define TEUCHOSCORE_CONFIG_H_
+
+/* #undef HAVE_TEUCHOSCORE_KOKKOS */
+
+#define HAVE_TEUCHOSCORE_CXX11
+
+/* #undef HAVE_TEUCHOSCORE_BOOST */
+
+/* #undef HAVE_TEUCHOSCORE_BOOST_IS_POLYMORPHIC */
+
+/* #undef HAVE_TEUCHOSCORE_QT */
+
+/* Deprecated */
+/* #undef HAVE_TEUCHOSCORE_QD */
+
+/* #undef HAVE_TEUCHOSCORE_QUADMATH */
+
+/* Deprecated */
+/* #undef HAVE_TEUCHOSCORE_ARPREC */
+
+/* #undef HAVE_TEUCHOSCORE_PTHREAD */
+
+#endif // TEUCHOSCORE_CONFIG_H_
diff --git a/packages/rol/src/compatibility/teuchos-lite/Teuchos_Assert.hpp b/packages/rol/src/compatibility/teuchos-lite/Teuchos_Assert.hpp
new file mode 100644
index 000000000000..9070b2250000
--- /dev/null
+++ b/packages/rol/src/compatibility/teuchos-lite/Teuchos_Assert.hpp
@@ -0,0 +1,135 @@
+// @HEADER
+// ***********************************************************************
+//
+//                    Teuchos: Common Tools Package
+//                 Copyright (2004) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
+//
+// ***********************************************************************
+// @HEADER
+
+#ifndef TEUCHOS_ASSERT_HPP
+#define TEUCHOS_ASSERT_HPP
+
+
+#include "Teuchos_TestForException.hpp"
+
+
+/** \brief This macro is throws when an assert fails.
+ *
+ * \note <tt>The std::exception</tt> thrown is <tt>std::logic_error</tt>.
+ *
+ * \ingroup TestForException_grp
+ */
+#define TEUCHOS_ASSERT(assertion_test) TEUCHOS_TEST_FOR_EXCEPT(!(assertion_test))
+
+#ifdef TEUCHOS_DEBUG
+/** \brief Behaves as TEUCHOS_ASSERT only if debugging is enabled.
+ *
+ * \ingroup TestForException_grp
+ */
+#define TEUCHOS_DEBUG_ASSERT(assertion_test) TEUCHOS_ASSERT(assertion_test)
+#else
+#define TEUCHOS_DEBUG_ASSERT(assertion_test)
+#endif
+
+
+/** \brief This macro asserts that an integral number fallis in the range
+ * <tt>[lower_inclusive,upper_exclusive)</tt>
+ *
+ * \note <tt>The std::exception</tt> thrown is <tt>std::out_of_range</tt>.
+ *
+ * WARNING: This assert will evaluate <tt>index</tt>,
+ * <tt>lower_inclusive</tt>, and <tt>upper_inclusive</tt> more than once if
+ * there is a failure which will cause the side-effect of an additional
+ * evaluation.  This is needed because the return types of these values are
+ * unknown.  Therefore, only pass in arguments that are objects or function
+ * calls that have not side-effects!
+ *
+ * \ingroup TestForException_grp
+ */
+#define TEUCHOS_ASSERT_IN_RANGE_UPPER_EXCLUSIVE( index, lower_inclusive, upper_exclusive ) \
+  { \
+    TEUCHOS_TEST_FOR_EXCEPTION( \
+      !( (lower_inclusive) <= (index) && (index) < (upper_exclusive) ), \
+      std::out_of_range, \
+      "Error, the index " #index " = " << (index) << " does not fall in the range" \
+      "["<<(lower_inclusive)<<","<<(upper_exclusive)<<")!" ); \
+  }
+
+
+/** \brief This macro is checks that to numbers are equal and if not then
+ * throws an exception with a good error message.
+ *
+ * \note The <tt>std::exception</tt> thrown is <tt>std::out_of_range</tt>.
+ *
+ * WARNING: This assert will evaluate <tt>val1</tt> and <tt>val2</tt> more
+ * than once if there is a failure which will cause the side-effect of an
+ * additional evaluation.  This is needed because the return types of
+ * <tt>val1</tt> and <tt>val2</tt> are unknown.  Therefore, only pass in
+ * arguments that are objects or function calls that have not side-effects!
+ *
+ * \ingroup TestForException_grp
+ */
+#define TEUCHOS_ASSERT_EQUALITY( val1, val2 ) \
+  { \
+    TEUCHOS_TEST_FOR_EXCEPTION( \
+      (val1) != (val2), std::out_of_range, \
+      "Error, (" #val1 " = " << (val1) << ") != (" #val2 " = " << (val2) << ")!" ); \
+  }
+
+
+/** \brief This macro is checks that an inequality between two numbers is
+ * satisified and if not then throws a good exception message.
+ *
+ * \note The <tt>std::exception</tt> thrown is <tt>std::out_of_range</tt>.
+ *
+ * WARNING: This assert will evaluate <tt>val1</tt> and <tt>val2</tt> more
+ * than once if there is a failure which will cause the side-effect of an
+ * additional evaluation.  This is needed because the return types of
+ * <tt>val1</tt> and <tt>val2</tt> are unknown.  Therefore, only pass in
+ * arguments that are objects or function calls that have not side-effects!
+ *
+ * \ingroup TestForException_grp
+ */
+#define TEUCHOS_ASSERT_INEQUALITY( val1, comp, val2 ) \
+  { \
+    TEUCHOS_TEST_FOR_EXCEPTION( \
+      !( (val1) comp (val2) ), std::out_of_range, \
+      "Error, (" #val1 " = " << (val1) << ") " \
+      #comp " (" #val2 " = " << (val2) << ")! FAILED!" ); \
+  }
+
+
+#endif // TEUCHOS_ASSERT_HPP
diff --git a/packages/rol/src/compatibility/teuchos-lite/Teuchos_BLAS.cpp b/packages/rol/src/compatibility/teuchos-lite/Teuchos_BLAS.cpp
new file mode 100644
index 000000000000..f436afa89954
--- /dev/null
+++ b/packages/rol/src/compatibility/teuchos-lite/Teuchos_BLAS.cpp
@@ -0,0 +1,564 @@
+// @HEADER
+// ***********************************************************************
+//
+//                    Teuchos: Common Tools Package
+//                 Copyright (2004) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
+//
+// ***********************************************************************
+// @HEADER
+
+#include "Teuchos_BLAS.hpp"
+#include "Teuchos_BLAS_wrappers.hpp"
+
+/* for INTEL_CXML, the second arg may need to be changed to 'one'.  If so
+the appropriate declaration of one will need to be added back into
+functions that include the macro:
+*/
+
+namespace {
+#if defined (INTEL_CXML)
+        unsigned int one=1;
+#endif
+} // namespace
+
+#ifdef CHAR_MACRO
+#undef CHAR_MACRO
+#endif
+#if defined (INTEL_CXML)
+#define CHAR_MACRO(char_var) &char_var, one
+#else
+#define CHAR_MACRO(char_var) &char_var
+#endif
+
+
+const char Teuchos::ESideChar[] = {'L' , 'R' };
+const char Teuchos::ETranspChar[] = {'N' , 'T' , 'C' };
+const char Teuchos::EUploChar[] = {'U' , 'L' };
+const char Teuchos::EDiagChar[] = {'U' , 'N' };
+const char Teuchos::ETypeChar[] = {'G' , 'L', 'U', 'H', 'B', 'Q', 'Z' };
+//const char Teuchos::EFactChar[] = {'F', 'N' };
+//const char Teuchos::ENormChar[] = {'O', 'I' };
+//const char Teuchos::ECompQChar[] = {'N', 'I', 'V' };
+//const char Teuchos::EJobChar[] = {'E', 'V', 'B' };
+//const char Teuchos::EJobSChar[] = {'E', 'S' };
+//const char Teuchos::EJobVSChar[] = {'V', 'N' };
+//const char Teuchos::EHowmnyChar[] = {'A', 'S' };
+//const char Teuchos::ECMachChar[] = {'E', 'S', 'B', 'P', 'N', 'R', 'M', 'U', 'L', 'O' };
+//const char Teuchos::ESortChar[] = {'N', 'S'};
+
+
+namespace {
+
+
+template<typename Scalar>
+Scalar generic_dot(const int& n, const Scalar* x, const int& incx,
+  const Scalar* y, const int& incy)
+{
+  typedef Teuchos::ScalarTraits<Scalar> ST;
+  Scalar dot = 0.0;
+  if (incx==1 && incy==1) {
+    for (int i = 0; i < n; ++i)
+      dot += (*x++)*ST::conjugate(*y++);
+  }
+  else {
+    if (incx < 0)
+      x = x - incx*(n-1);
+    if (incy < 0)
+      y = y - incy*(n-1);
+    for (int i = 0; i < n; ++i, x+=incx, y+=incy)
+      dot += (*x)*ST::conjugate(*y);
+  }
+  return dot;
+}
+
+
+} // namespace
+
+
+namespace Teuchos {
+
+//Explicitly instantiating these templates for windows due to an issue with
+//resolving them when linking dlls.
+#ifdef _MSC_VER
+#  ifdef HAVE_TEUCHOS_COMPLEX
+     template class BLAS<long int, std::complex<float> >;
+     template class BLAS<long int, std::complex<double> >;
+#  endif
+     template class BLAS<long int, float>;
+     template class BLAS<long int, double>;
+#endif
+
+  // *************************** BLAS<int,float> DEFINITIONS ******************************
+
+  void BLAS<int, float>::ROTG(float* da, float* db, float* c, float* s) const
+  { SROTG_F77(da, db, c, s ); }
+
+  void BLAS<int, float>::ROT(const int& n, float* dx, const int& incx, float* dy, const int& incy, float* c, float* s) const
+  { SROT_F77(&n, dx, &incx, dy, &incy, c, s); }
+
+
+  float BLAS<int, float>::ASUM(const int& n, const float* x, const int& incx) const
+  {
+#if defined(HAVE_TEUCHOS_BLASFLOAT_APPLE_VECLIB_BUGFIX)
+    return cblas_sasum(n, x, incx);
+#elif defined(HAVE_TEUCHOS_BLASFLOAT)
+    float tmp = SASUM_F77(&n, x, &incx);
+    return tmp;
+#else
+    typedef ScalarTraits<float> ST;
+    float sum = 0.0;
+    if (incx == 1) {
+      for (int i = 0; i < n; ++i)
+        sum += ST::magnitude(*x++);
+    }
+    else {
+      for (int i = 0; i < n; ++i, x+=incx)
+        sum += ST::magnitude(*x);
+    }
+    return sum;
+#endif
+  }
+
+  void BLAS<int, float>::AXPY(const int& n, const float& alpha, const float* x, const int& incx, float* y, const int& incy) const
+  { SAXPY_F77(&n, &alpha, x, &incx, y, &incy); }
+
+  void BLAS<int, float>::COPY(const int& n, const float* x, const int& incx, float* y, const int& incy) const
+  { SCOPY_F77(&n, x, &incx, y, &incy); }
+
+  float BLAS<int, float>::DOT(const int& n, const float* x, const int& incx, const float* y, const int& incy) const
+  {
+#if defined(HAVE_TEUCHOS_BLASFLOAT_APPLE_VECLIB_BUGFIX)
+    return cblas_sdot(n, x, incx, y, incy);
+#elif defined(HAVE_TEUCHOS_BLASFLOAT)
+    return SDOT_F77(&n, x, &incx, y, &incy);
+#else
+    return generic_dot(n, x, incx, y, incy);
+#endif
+  }
+
+  int BLAS<int, float>::IAMAX(const int& n, const float* x, const int& incx) const
+  { return ISAMAX_F77(&n, x, &incx); }
+
+  float BLAS<int, float>::NRM2(const int& n, const float* x, const int& incx) const
+  {
+#if defined(HAVE_TEUCHOS_BLASFLOAT_APPLE_VECLIB_BUGFIX)
+    return cblas_snrm2(n, x, incx);
+#elif defined(HAVE_TEUCHOS_BLASFLOAT)
+    return SNRM2_F77(&n, x, &incx);
+#else
+    return ScalarTraits<float>::squareroot(generic_dot(n, x, incx, x, incx));
+#endif
+  }
+
+  void BLAS<int, float>::SCAL(const int& n, const float& alpha, float* x, const int& incx) const
+  { SSCAL_F77(&n, &alpha, x, &incx); }
+
+  void BLAS<int, float>::GEMV(ETransp trans, const int& m, const int& n, const float& alpha, const float* A, const int& lda, const float* x, const int& incx, const float& beta, float* y, const int& incy) const
+  { SGEMV_F77(CHAR_MACRO(ETranspChar[trans]), &m, &n, &alpha, A, &lda, x, &incx, &beta, y, &incy); }
+
+  void BLAS<int, float>::GER(const int& m, const int& n, const float& alpha, const float* x, const int& incx, const float* y, const int& incy, float* A, const int& lda) const
+  { SGER_F77(&m, &n, &alpha, x, &incx, y, &incy, A, &lda); }
+
+  void BLAS<int, float>::TRMV(EUplo uplo, ETransp trans, EDiag diag, const int& n, const float* A, const int& lda, float* x, const int& incx) const
+  { STRMV_F77(CHAR_MACRO(EUploChar[uplo]), CHAR_MACRO(ETranspChar[trans]), CHAR_MACRO(EDiagChar[diag]), &n, A, &lda, x, &incx); }
+
+  void BLAS<int, float>::GEMM(ETransp transa, ETransp transb, const int& m, const int& n, const int& k, const float& alpha, const float* A, const int& lda, const float* B, const int& ldb, const float& beta, float* C, const int& ldc) const
+  { SGEMM_F77(CHAR_MACRO(ETranspChar[transa]), CHAR_MACRO(ETranspChar[transb]), &m, &n, &k, &alpha, A, &lda, B, &ldb, &beta, C, &ldc); }
+
+  void BLAS<int, float>::SWAP(const int& n, float* const x, const int& incx, float* const y, const int& incy) const
+  {
+    SSWAP_F77 (&n, x, &incx, y, &incy);
+  }
+
+  void BLAS<int, float>::SYMM(ESide side, EUplo uplo, const int& m, const int& n, const float& alpha, const float* A, const int& lda, const float* B, const int& ldb, const float& beta, float* C, const int& ldc) const
+  { SSYMM_F77(CHAR_MACRO(ESideChar[side]), CHAR_MACRO(EUploChar[uplo]), &m, &n, &alpha, A, &lda, B, &ldb, &beta, C, &ldc); }
+
+  void BLAS<int, float>::SYRK(EUplo uplo, ETransp trans, const int& n, const int& k, const float& alpha, const float* A, const int& lda, const float& beta, float* C, const int& ldc) const
+  { SSYRK_F77(CHAR_MACRO(EUploChar[uplo]), CHAR_MACRO(ETranspChar[trans]), &n, &k, &alpha, A, &lda, &beta, C, &ldc); }
+
+  void BLAS<int, float>::HERK(EUplo uplo, ETransp trans, const int& n, const int& k, const float& alpha, const float* A, const int& lda, const float& beta, float* C, const int& ldc) const
+  { SSYRK_F77(CHAR_MACRO(EUploChar[uplo]), CHAR_MACRO(ETranspChar[trans]), &n, &k, &alpha, A, &lda, &beta, C, &ldc); }
+
+  void BLAS<int, float>::TRMM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int& m, const int& n, const float& alpha, const float* A, const int& lda, float* B, const int& ldb) const
+  { STRMM_F77(CHAR_MACRO(ESideChar[side]), CHAR_MACRO(EUploChar[uplo]), CHAR_MACRO(ETranspChar[transa]), CHAR_MACRO(EDiagChar[diag]), &m, &n, &alpha, A, &lda, B, &ldb); }
+
+  void BLAS<int, float>::TRSM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int& m, const int& n, const float& alpha, const float* A, const int& lda, float* B, const int& ldb) const
+  { STRSM_F77(CHAR_MACRO(ESideChar[side]), CHAR_MACRO(EUploChar[uplo]), CHAR_MACRO(ETranspChar[transa]), CHAR_MACRO(EDiagChar[diag]), &m, &n, &alpha, A, &lda, B, &ldb); }
+
+  // *************************** BLAS<int,double> DEFINITIONS ******************************
+
+  void BLAS<int, double>::ROTG(double* da, double* db, double* c, double* s) const
+  { DROTG_F77(da, db, c, s); }
+
+  void BLAS<int, double>::ROT(const int& n, double* dx, const int& incx, double* dy, const int& incy, double* c, double* s) const
+  { DROT_F77(&n, dx, &incx, dy, &incy, c, s); }
+
+  double BLAS<int, double>::ASUM(const int& n, const double* x, const int& incx) const
+  { return DASUM_F77(&n, x, &incx); }
+
+  void BLAS<int, double>::AXPY(const int& n, const double& alpha, const double* x, const int& incx, double* y, const int& incy) const
+  { DAXPY_F77(&n, &alpha, x, &incx, y, &incy); }
+
+  void BLAS<int, double>::COPY(const int& n, const double* x, const int& incx, double* y, const int& incy) const
+  { DCOPY_F77(&n, x, &incx, y, &incy); }
+
+  double BLAS<int, double>::DOT(const int& n, const double* x, const int& incx, const double* y, const int& incy) const
+  {
+    return DDOT_F77(&n, x, &incx, y, &incy);
+  }
+
+  int BLAS<int, double>::IAMAX(const int& n, const double* x, const int& incx) const
+  { return IDAMAX_F77(&n, x, &incx); }
+
+  double BLAS<int, double>::NRM2(const int& n, const double* x, const int& incx) const
+  { return DNRM2_F77(&n, x, &incx); }
+
+  void BLAS<int, double>::SCAL(const int& n, const double& alpha, double* x, const int& incx) const
+  { DSCAL_F77(&n, &alpha, x, &incx); }
+
+  void BLAS<int, double>::GEMV(ETransp trans, const int& m, const int& n, const double& alpha, const double* A, const int& lda, const double* x, const int& incx, const double& beta, double* y, const int& incy) const
+  { DGEMV_F77(CHAR_MACRO(ETranspChar[trans]), &m, &n, &alpha, A, &lda, x, &incx, &beta, y, &incy); }
+
+  void BLAS<int, double>::GER(const int& m, const int& n, const double& alpha, const double* x, const int& incx, const double* y, const int& incy, double* A, const int& lda) const
+  { DGER_F77(&m, &n, &alpha, x, &incx, y, &incy, A, &lda); }
+
+  void BLAS<int, double>::TRMV(EUplo uplo, ETransp trans, EDiag diag, const int& n, const double* A, const int& lda, double* x, const int& incx) const
+  { DTRMV_F77(CHAR_MACRO(EUploChar[uplo]), CHAR_MACRO(ETranspChar[trans]), CHAR_MACRO(EDiagChar[diag]), &n, A, &lda, x, &incx); }
+
+  void BLAS<int, double>::GEMM(ETransp transa, ETransp transb, const int& m, const int& n, const int& k, const double& alpha, const double* A, const int& lda, const double* B, const int& ldb, const double& beta, double* C, const int& ldc) const
+  { DGEMM_F77(CHAR_MACRO(ETranspChar[transa]), CHAR_MACRO(ETranspChar[transb]), &m, &n, &k, &alpha, A, &lda, B, &ldb, &beta, C, &ldc); }
+
+  void BLAS<int, double>::SWAP(const int& n, double* const x, const int& incx, double* const y, const int& incy) const
+  {
+    DSWAP_F77 (&n, x, &incx, y, &incy);
+  }
+
+  void BLAS<int, double>::SYMM(ESide side, EUplo uplo, const int& m, const int& n, const double& alpha, const double* A, const int& lda, const double* B, const int& ldb, const double& beta, double* C, const int& ldc) const
+  { DSYMM_F77(CHAR_MACRO(ESideChar[side]), CHAR_MACRO(EUploChar[uplo]), &m, &n, &alpha, A, &lda, B, &ldb, &beta, C, &ldc); }
+
+  void BLAS<int, double>::SYRK(EUplo uplo, ETransp trans, const int& n, const int& k, const double& alpha, const double* A, const int& lda, const double& beta, double* C, const int& ldc) const
+  { DSYRK_F77(CHAR_MACRO(EUploChar[uplo]), CHAR_MACRO(ETranspChar[trans]), &n, &k, &alpha, A, &lda, &beta, C, &ldc); }
+
+  void BLAS<int, double>::HERK(EUplo uplo, ETransp trans, const int& n, const int& k, const double& alpha, const double* A, const int& lda, const double& beta, double* C, const int& ldc) const
+  { DSYRK_F77(CHAR_MACRO(EUploChar[uplo]), CHAR_MACRO(ETranspChar[trans]), &n, &k, &alpha, A, &lda, &beta, C, &ldc); }
+
+  void BLAS<int, double>::TRMM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int& m, const int& n, const double& alpha, const double* A, const int& lda, double* B, const int& ldb) const
+  { DTRMM_F77(CHAR_MACRO(ESideChar[side]), CHAR_MACRO(EUploChar[uplo]), CHAR_MACRO(ETranspChar[transa]), CHAR_MACRO(EDiagChar[diag]), &m, &n, &alpha, A, &lda, B, &ldb); }
+
+  void BLAS<int, double>::TRSM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int& m, const int& n, const double& alpha, const double* A, const int& lda, double* B, const int& ldb) const
+  { DTRSM_F77(CHAR_MACRO(ESideChar[side]), CHAR_MACRO(EUploChar[uplo]), CHAR_MACRO(ETranspChar[transa]), CHAR_MACRO(EDiagChar[diag]), &m, &n, &alpha, A, &lda, B, &ldb); }
+
+#ifdef HAVE_TEUCHOS_COMPLEX
+
+  // *************************** BLAS<int,std::complex<float> > DEFINITIONS ******************************
+
+  void BLAS<int, std::complex<float> >::ROTG(std::complex<float>* da, std::complex<float>* db, float* c, std::complex<float>* s) const
+  { CROTG_F77(da, db, c, s ); }
+
+  void BLAS<int, std::complex<float> >::ROT(const int& n, std::complex<float>* dx, const int& incx, std::complex<float>* dy, const int& incy, float* c, std::complex<float>* s) const
+  { CROT_F77(&n, dx, &incx, dy, &incy, c, s); }
+
+  float BLAS<int, std::complex<float> >::ASUM(const int& n, const std::complex<float>* x, const int& incx) const
+  {
+#if defined(HAVE_TEUCHOS_BLASFLOAT_APPLE_VECLIB_BUGFIX)
+    return cblas_scasum(n, x, incx);
+#elif defined(HAVE_TEUCHOS_BLASFLOAT_DOUBLE_RETURN)
+    return (float) SCASUM_F77(&n, x, &incx);
+#elif defined(HAVE_TEUCHOS_BLASFLOAT)
+    return SCASUM_F77(&n, x, &incx);
+#else // Wow, you just plain don't have this routine.
+    // mfh 01 Feb 2013: See www.netlib.org/blas/scasum.f.
+    // I've enhanced this by accumulating in double precision.
+    double result = 0;
+    if (incx == 1) {
+      for (int i = 0; i < n; ++i) {
+        result += std::abs (std::real (x[i])) + std::abs (std::imag (x[i]));
+      }
+    } else {
+      const int nincx = n * incx;
+      for (int i = 0; i < nincx; i += incx) {
+        result += std::abs (std::real (x[i])) + std::abs (std::imag (x[i]));
+      }
+    }
+    return static_cast<float> (result);
+#endif
+  }
+
+  void BLAS<int, std::complex<float> >::AXPY(const int& n, const std::complex<float> alpha, const std::complex<float>* x, const int& incx, std::complex<float>* y, const int& incy) const
+  { CAXPY_F77(&n, &alpha, x, &incx, y, &incy); }
+
+  void BLAS<int, std::complex<float> >::COPY(const int& n, const std::complex<float>* x, const int& incx, std::complex<float>* y, const int& incy) const
+  { CCOPY_F77(&n, x, &incx, y, &incy); }
+
+  std::complex<float> BLAS<int, std::complex<float> >::DOT(const int& n, const std::complex<float>* x, const int& incx, const std::complex<float>* y, const int& incy) const
+  {
+#if defined(HAVE_TEUCHOS_BLASFLOAT_APPLE_VECLIB_BUGFIX)
+    std::complex<float> z;
+    cblas_cdotc_sub(n,x,incx,y,incy,&z);
+    return z;
+#elif defined(HAVE_COMPLEX_BLAS_PROBLEM) && defined(HAVE_FIXABLE_COMPLEX_BLAS_PROBLEM)
+    std::complex<float> z;
+    CDOT_F77(&z, &n, x, &incx, y, &incy);
+    return z;
+#elif defined(HAVE_TEUCHOS_BLASFLOAT)
+    Teuchos_Complex_float_type_name z = CDOT_F77(&n, x, &incx, y, &incy);
+    return TEUCHOS_BLAS_CONVERT_COMPLEX_FORTRAN_TO_CXX(float, z);
+#else // Wow, you just plain don't have this routine.
+    // mfh 01 Feb 2013: See www.netlib.org/blas/cdotc.f.
+    // I've enhanced this by accumulating in double precision.
+    std::complex<double> result (0, 0);
+    if (n >= 0) {
+      if (incx == 1 && incy == 1) {
+        for (int i = 0; i < n; ++i) {
+          result += std::conj (x[i]) * y[i];
+        }
+      } else {
+        int ix = 0;
+        int iy = 0;
+        if (incx < 0) {
+          ix = (1-n) * incx;
+        }
+        if (incy < 0) {
+          iy = (1-n) * incy;
+        }
+        for (int i = 0; i < n; ++i) {
+          result += std::conj (x[ix]) * y[iy];
+          ix += incx;
+          iy += incy;
+        }
+      }
+    }
+    return static_cast<std::complex<float> > (result);
+#endif
+  }
+
+  int BLAS<int, std::complex<float> >::IAMAX(const int& n, const std::complex<float>* x, const int& incx) const
+  { return ICAMAX_F77(&n, x, &incx); }
+
+  float BLAS<int, std::complex<float> >::NRM2(const int& n, const std::complex<float>* x, const int& incx) const
+  {
+#if defined(HAVE_TEUCHOS_BLASFLOAT_APPLE_VECLIB_BUGFIX)
+    return cblas_scnrm2(n, x, incx);
+#elif defined(HAVE_TEUCHOS_BLASFLOAT_DOUBLE_RETURN)
+    return (float) SCNRM2_F77(&n, x, &incx);
+#elif defined(HAVE_TEUCHOS_BLASFLOAT)
+    return SCNRM2_F77(&n, x, &incx);
+#else // Wow, you just plain don't have this routine.
+    // mfh 01 Feb 2013: See www.netlib.org/blas/scnrm2.f.
+    // I've enhanced this by accumulating in double precision.
+    if (n < 1 || incx < 1) {
+      return 0;
+    } else {
+      double scale = 0;
+      double ssq = 1;
+
+      const int upper = 1 + (n-1)*incx;
+      for (int ix = 0; ix < upper; ix += incx) {
+        // The reference BLAS implementation cleverly scales the
+        // intermediate result. so that even if the square of the norm
+        // would overflow, computing the norm itself does not.  Hence,
+        // "ssq" for "scaled square root."
+        if (std::real (x[ix]) != 0) {
+          const double temp = std::abs (std::real (x[ix]));
+          if (scale < temp) {
+            const double scale_over_temp = scale / temp;
+            ssq = 1 + ssq * scale_over_temp*scale_over_temp;
+            // New scaling factor: biggest (in magnitude) real or imaginary part seen thus far.
+            scale = temp;
+          } else {
+            const double temp_over_scale = temp / scale;
+            ssq = ssq + temp_over_scale*temp_over_scale;
+          }
+        }
+        if (std::imag (x[ix]) != 0) {
+          const double temp = std::abs (std::imag (x[ix]));
+          if (scale < temp) {
+            const double scale_over_temp = scale / temp;
+            ssq = 1 + ssq * scale_over_temp*scale_over_temp;
+            // New scaling factor: biggest (in magnitude) real or imaginary part seen thus far.
+            scale = temp;
+          } else {
+            const double temp_over_scale = temp / scale;
+            ssq = ssq + temp_over_scale*temp_over_scale;
+          }
+        }
+      }
+      return static_cast<float> (scale * std::sqrt (ssq));
+    }
+#endif
+  }
+
+  void BLAS<int, std::complex<float> >::SCAL(const int& n, const std::complex<float> alpha, std::complex<float>* x, const int& incx) const
+  { CSCAL_F77(&n, &alpha, x, &incx); }
+
+  void BLAS<int, std::complex<float> >::GEMV(ETransp trans, const int& m, const int& n, const std::complex<float> alpha, const std::complex<float>* A, const int& lda, const std::complex<float>* x, const int& incx, const std::complex<float> beta, std::complex<float>* y, const int& incy) const
+  { CGEMV_F77(CHAR_MACRO(ETranspChar[trans]), &m, &n, &alpha, A, &lda, x, &incx, &beta, y, &incy); }
+
+  void BLAS<int, std::complex<float> >::GER(const int& m, const int& n, const std::complex<float> alpha, const std::complex<float>* x, const int& incx, const std::complex<float>* y, const int& incy, std::complex<float>* A, const int& lda) const
+  { CGER_F77(&m, &n, &alpha, x, &incx, y, &incy, A, &lda); }
+
+  void BLAS<int, std::complex<float> >::TRMV(EUplo uplo, ETransp trans, EDiag diag, const int& n, const std::complex<float>* A, const int& lda, std::complex<float>* x, const int& incx) const
+  { CTRMV_F77(CHAR_MACRO(EUploChar[uplo]), CHAR_MACRO(ETranspChar[trans]), CHAR_MACRO(EDiagChar[diag]), &n, A, &lda, x, &incx); }
+
+  void BLAS<int, std::complex<float> >::GEMM(ETransp transa, ETransp transb, const int& m, const int& n, const int& k, const std::complex<float> alpha, const std::complex<float>* A, const int& lda, const std::complex<float>* B, const int& ldb, const std::complex<float> beta, std::complex<float>* C, const int& ldc) const
+  { CGEMM_F77(CHAR_MACRO(ETranspChar[transa]), CHAR_MACRO(ETranspChar[transb]), &m, &n, &k, &alpha, A, &lda, B, &ldb, &beta, C, &ldc); }
+
+  void BLAS<int, std::complex<float> >::SWAP(const int& n, std::complex<float>* const x, const int& incx, std::complex<float>* const y, const int& incy) const
+  {
+    CSWAP_F77 (&n, x, &incx, y, &incy);
+  }
+
+  void BLAS<int, std::complex<float> >::SYMM(ESide side, EUplo uplo, const int& m, const int& n, const std::complex<float> alpha, const std::complex<float>* A, const int& lda, const std::complex<float>* B, const int& ldb, const std::complex<float> beta, std::complex<float>* C, const int& ldc) const
+  { CSYMM_F77(CHAR_MACRO(ESideChar[side]), CHAR_MACRO(EUploChar[uplo]), &m, &n, &alpha, A, &lda, B, &ldb, &beta, C, &ldc); }
+
+  void BLAS<int, std::complex<float> >::SYRK(EUplo uplo, ETransp trans, const int& n, const int& k, const std::complex<float> alpha, const std::complex<float>* A, const int& lda, const std::complex<float> beta, std::complex<float>* C, const int& ldc) const
+  { CSYRK_F77(CHAR_MACRO(EUploChar[uplo]), CHAR_MACRO(ETranspChar[trans]), &n, &k, &alpha, A, &lda, &beta, C, &ldc); }
+
+  void BLAS<int, std::complex<float> >::HERK(EUplo uplo, ETransp trans, const int& n, const int& k, const std::complex<float> alpha, const std::complex<float>* A, const int& lda, const std::complex<float> beta, std::complex<float>* C, const int& ldc) const
+  { CHERK_F77(CHAR_MACRO(EUploChar[uplo]), CHAR_MACRO(ETranspChar[trans]), &n, &k, &alpha, A, &lda, &beta, C, &ldc); }
+
+  void BLAS<int, std::complex<float> >::TRMM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int& m, const int& n, const std::complex<float> alpha, const std::complex<float>* A, const int& lda, std::complex<float>* B, const int& ldb) const
+  { CTRMM_F77(CHAR_MACRO(ESideChar[side]), CHAR_MACRO(EUploChar[uplo]), CHAR_MACRO(ETranspChar[transa]), CHAR_MACRO(EDiagChar[diag]), &m, &n, &alpha, A, &lda, B, &ldb); }
+
+  void BLAS<int, std::complex<float> >::TRSM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int& m, const int& n, const std::complex<float> alpha, const std::complex<float>* A, const int& lda, std::complex<float>* B, const int& ldb) const
+  { CTRSM_F77(CHAR_MACRO(ESideChar[side]), CHAR_MACRO(EUploChar[uplo]), CHAR_MACRO(ETranspChar[transa]), CHAR_MACRO(EDiagChar[diag]), &m, &n, &alpha, A, &lda, B, &ldb); }
+
+  // *************************** BLAS<int,std::complex<double> > DEFINITIONS ******************************
+
+  void BLAS<int, std::complex<double> >::ROTG(std::complex<double>* da, std::complex<double>* db, double* c, std::complex<double>* s) const
+  { ZROTG_F77(da, db, c, s); }
+
+  void BLAS<int, std::complex<double> >::ROT(const int& n, std::complex<double>* dx, const int& incx, std::complex<double>* dy, const int& incy, double* c, std::complex<double>* s) const
+  { ZROT_F77(&n, dx, &incx, dy, &incy, c, s); }
+
+  double BLAS<int, std::complex<double> >::ASUM(const int& n, const std::complex<double>* x, const int& incx) const
+  { return ZASUM_F77(&n, x, &incx); }
+
+  void BLAS<int, std::complex<double> >::AXPY(const int& n, const std::complex<double> alpha, const std::complex<double>* x, const int& incx, std::complex<double>* y, const int& incy) const
+  { ZAXPY_F77(&n, &alpha, x, &incx, y, &incy); }
+
+  void BLAS<int, std::complex<double> >::COPY(const int& n, const std::complex<double>* x, const int& incx, std::complex<double>* y, const int& incy) const
+  { ZCOPY_F77(&n, x, &incx, y, &incy); }
+
+  std::complex<double> BLAS<int, std::complex<double> >::DOT(const int& n, const std::complex<double>* x, const int& incx, const std::complex<double>* y, const int& incy) const
+  {
+#if defined(HAVE_TEUCHOS_BLASFLOAT_APPLE_VECLIB_BUGFIX)
+    std::complex<double> z;
+    cblas_zdotc_sub(n,x,incx,y,incy,&z);
+    return z;
+#elif defined(HAVE_COMPLEX_BLAS_PROBLEM)
+#  if defined(HAVE_FIXABLE_COMPLEX_BLAS_PROBLEM)
+    std::complex<double> z;
+    ZDOT_F77(&z, &n, x, &incx, y, &incy);
+    return z;
+#  else
+    // mfh 01 Feb 2013: Your complex BLAS is broken, but the problem
+    // doesn't have the easy workaround.  I'll just reimplement the
+    // missing routine here.  See www.netlib.org/blas/zdotc.f.
+    std::complex<double> ztemp (0, 0);
+    if (n > 0) {
+      if (incx == 1 && incy == 1) {
+        for (int i = 0; i < n; ++i) {
+          ztemp += std::conj (x[i]) * y[i];
+        }
+      } else {
+        int ix = 0;
+        int iy = 0;
+        if (incx < 0) {
+          ix = (1-n)*incx;
+        }
+        if (incy < 0) {
+          iy = (1-n)*incy;
+        }
+        for (int i = 0; i < n; ++i) {
+          ztemp += std::conj (x[ix]) * y[iy];
+          ix += incx;
+          iy += incy;
+        }
+      }
+    }
+    return ztemp;
+
+#  endif // defined(HAVE_FIXABLE_COMPLEX_BLAS_PROBLEM)
+#else
+    Teuchos_Complex_double_type_name z = ZDOT_F77(&n, x, &incx, y, &incy);
+    return TEUCHOS_BLAS_CONVERT_COMPLEX_FORTRAN_TO_CXX(double, z);
+#endif
+  }
+
+  int BLAS<int, std::complex<double> >::IAMAX(const int& n, const std::complex<double>* x, const int& incx) const
+  { return IZAMAX_F77(&n, x, &incx); }
+
+  double BLAS<int, std::complex<double> >::NRM2(const int& n, const std::complex<double>* x, const int& incx) const
+  { return ZNRM2_F77(&n, x, &incx); }
+
+  void BLAS<int, std::complex<double> >::SCAL(const int& n, const std::complex<double> alpha, std::complex<double>* x, const int& incx) const
+  { ZSCAL_F77(&n, &alpha, x, &incx); }
+
+  void BLAS<int, std::complex<double> >::GEMV(ETransp trans, const int& m, const int& n, const std::complex<double> alpha, const std::complex<double>* A, const int& lda, const std::complex<double>* x, const int& incx, const std::complex<double> beta, std::complex<double>* y, const int& incy) const
+  { ZGEMV_F77(CHAR_MACRO(ETranspChar[trans]), &m, &n, &alpha, A, &lda, x, &incx, &beta, y, &incy); }
+
+  void BLAS<int, std::complex<double> >::GER(const int& m, const int& n, const std::complex<double> alpha, const std::complex<double>* x, const int& incx, const std::complex<double>* y, const int& incy, std::complex<double>* A, const int& lda) const
+  { ZGER_F77(&m, &n, &alpha, x, &incx, y, &incy, A, &lda); }
+
+  void BLAS<int, std::complex<double> >::TRMV(EUplo uplo, ETransp trans, EDiag diag, const int& n, const std::complex<double>* A, const int& lda, std::complex<double>* x, const int& incx) const
+  { ZTRMV_F77(CHAR_MACRO(EUploChar[uplo]), CHAR_MACRO(ETranspChar[trans]), CHAR_MACRO(EDiagChar[diag]), &n, A, &lda, x, &incx); }
+
+  void BLAS<int, std::complex<double> >::GEMM(ETransp transa, ETransp transb, const int& m, const int& n, const int& k, const std::complex<double> alpha, const std::complex<double>* A, const int& lda, const std::complex<double>* B, const int& ldb, const std::complex<double> beta, std::complex<double>* C, const int& ldc) const
+  { ZGEMM_F77(CHAR_MACRO(ETranspChar[transa]), CHAR_MACRO(ETranspChar[transb]), &m, &n, &k, &alpha, A, &lda, B, &ldb, &beta, C, &ldc); }
+
+  void BLAS<int, std::complex<double> >::SWAP(const int& n, std::complex<double>* const x, const int& incx, std::complex<double>* const y, const int& incy) const
+  {
+    ZSWAP_F77 (&n, x, &incx, y, &incy);
+  }
+
+  void BLAS<int, std::complex<double> >::SYMM(ESide side, EUplo uplo, const int& m, const int& n, const std::complex<double> alpha, const std::complex<double>* A, const int& lda, const std::complex<double> *B, const int& ldb, const std::complex<double> beta, std::complex<double> *C, const int& ldc) const
+  { ZSYMM_F77(CHAR_MACRO(ESideChar[side]), CHAR_MACRO(EUploChar[uplo]), &m, &n, &alpha, A, &lda, B, &ldb, &beta, C, &ldc); }
+
+  void BLAS<int, std::complex<double> >::SYRK(EUplo uplo, ETransp trans, const int& n, const int& k, const std::complex<double> alpha, const std::complex<double>* A, const int& lda, const std::complex<double> beta, std::complex<double>* C, const int& ldc) const
+  { ZSYRK_F77(CHAR_MACRO(EUploChar[uplo]), CHAR_MACRO(ETranspChar[trans]), &n, &k, &alpha, A, &lda, &beta, C, &ldc); }
+
+  void BLAS<int, std::complex<double> >::HERK(EUplo uplo, ETransp trans, const int& n, const int& k, const std::complex<double> alpha, const std::complex<double>* A, const int& lda, const std::complex<double> beta, std::complex<double>* C, const int& ldc) const
+  { ZHERK_F77(CHAR_MACRO(EUploChar[uplo]), CHAR_MACRO(ETranspChar[trans]), &n, &k, &alpha, A, &lda, &beta, C, &ldc); }
+
+  void BLAS<int, std::complex<double> >::TRMM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int& m, const int& n, const std::complex<double> alpha, const std::complex<double>* A, const int& lda, std::complex<double>* B, const int& ldb) const
+  { ZTRMM_F77(CHAR_MACRO(ESideChar[side]), CHAR_MACRO(EUploChar[uplo]), CHAR_MACRO(ETranspChar[transa]), CHAR_MACRO(EDiagChar[diag]), &m, &n, &alpha, A, &lda, B, &ldb); }
+
+  void BLAS<int, std::complex<double> >::TRSM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int& m, const int& n, const std::complex<double> alpha, const std::complex<double>* A, const int& lda, std::complex<double>* B, const int& ldb) const
+  { ZTRSM_F77(CHAR_MACRO(ESideChar[side]), CHAR_MACRO(EUploChar[uplo]), CHAR_MACRO(ETranspChar[transa]), CHAR_MACRO(EDiagChar[diag]), &m, &n, &alpha, A, &lda, B, &ldb); }
+
+#endif // HAVE_TEUCHOS_COMPLEX
+
+}
diff --git a/packages/rol/src/compatibility/teuchos-lite/Teuchos_BLAS.hpp b/packages/rol/src/compatibility/teuchos-lite/Teuchos_BLAS.hpp
new file mode 100644
index 000000000000..79bcc9234288
--- /dev/null
+++ b/packages/rol/src/compatibility/teuchos-lite/Teuchos_BLAS.hpp
@@ -0,0 +1,2374 @@
+// @HEADER
+// ***********************************************************************
+//
+//                    Teuchos: Common Tools Package
+//                 Copyright (2004) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
+//
+// ***********************************************************************
+// @HEADER
+
+// Kris
+// 06.16.03 -- Start over from scratch
+// 06.16.03 -- Initial templatization (Tpetra_BLAS.cpp is no longer needed)
+// 06.18.03 -- Changed xxxxx_() function calls to XXXXX_F77()
+//          -- Added warning messages for generic calls
+// 07.08.03 -- Move into Teuchos package/namespace
+// 07.24.03 -- The first iteration of BLAS generics is nearing completion. Caveats:
+//             * TRSM isn't finished yet; it works for one case at the moment (left side, upper tri., no transpose, no unit diag.)
+//             * Many of the generic implementations are quite inefficient, ugly, or both. I wrote these to be easy to debug, not for efficiency or legibility. The next iteration will improve both of these aspects as much as possible.
+//             * Very little verification of input parameters is done, save for the character-type arguments (TRANS, etc.) which is quite robust.
+//             * All of the routines that make use of both an incx and incy parameter (which includes much of the L1 BLAS) are set up to work iff incx == incy && incx > 0. Allowing for differing/negative values of incx/incy should be relatively trivial.
+//             * All of the L2/L3 routines assume that the entire matrix is being used (that is, if A is mxn, lda = m); they don't work on submatrices yet. This *should* be a reasonably trivial thing to fix, as well.
+//          -- Removed warning messages for generic calls
+// 08.08.03 -- TRSM now works for all cases where SIDE == L and DIAG == N. DIAG == U is implemented but does not work correctly; SIDE == R is not yet implemented.
+// 08.14.03 -- TRSM now works for all cases and accepts (and uses) leading-dimension information.
+// 09.26.03 -- character input replaced with enumerated input to cause compiling errors and not run-time errors ( suggested by RAB ).
+
+#ifndef _TEUCHOS_BLAS_HPP_
+#define _TEUCHOS_BLAS_HPP_
+
+/*! \file Teuchos_BLAS.hpp
+    \brief Templated interface class to BLAS routines.
+*/
+/** \example BLAS/cxx_main.cpp
+    This is an example of how to use the Teuchos::BLAS class.
+*/
+
+#include "Teuchos_ConfigDefs.hpp"
+#include "Teuchos_ScalarTraits.hpp"
+#include "Teuchos_OrdinalTraits.hpp"
+#include "Teuchos_BLAS_types.hpp"
+#include "Teuchos_Assert.hpp"
+
+/*! \class Teuchos::BLAS
+    \brief Templated BLAS wrapper.
+
+    The Teuchos::BLAS class provides functionality similar to the BLAS
+    (Basic Linear Algebra Subprograms).  The BLAS provide portable,
+    high- performance implementations of kernels such as dense vector
+    sums, inner products, and norms (the BLAS 1 routines), dense
+    matrix-vector multiplication and triangular solve (the BLAS 2
+    routines), and dense matrix-matrix multiplication and triangular
+    solve with multiple right-hand sides (the BLAS 3 routines).
+
+    The standard BLAS interface is Fortran-specific.  Unfortunately,
+    the interface between C++ and Fortran is not standard across all
+    computer platforms.  The Teuchos::BLAS class provides C++ bindings
+    for the BLAS kernels in order to insulate the rest of Petra from
+    the details of C++ to Fortran translation.
+
+    In addition to giving access to the standard BLAS functionality,
+    Teuchos::BLAS also provides a generic fall-back implementation for
+    any ScalarType class that defines the +, - * and / operators.
+
+    Teuchos::BLAS only operates within a single shared-memory space,
+    just like the BLAS.  It does not attempt to implement
+    distributed-memory parallel matrix operations.
+
+    \note This class has specializations for ScalarType=float and
+      double, which use the BLAS library directly.  If you configure
+      Teuchos to enable complex arithmetic support, via the CMake
+      option -DTeuchos_ENABLE_COMPLEX:BOOL=ON, then this class will
+      also invoke the BLAS library directly for
+      ScalarType=std::complex<float> and std::complex<double>.
+*/
+namespace Teuchos
+{
+  extern TEUCHOSNUMERICS_LIB_DLL_EXPORT const char ESideChar[];
+  extern TEUCHOSNUMERICS_LIB_DLL_EXPORT const char ETranspChar[];
+  extern TEUCHOSNUMERICS_LIB_DLL_EXPORT const char EUploChar[];
+  extern TEUCHOSNUMERICS_LIB_DLL_EXPORT const char EDiagChar[];
+  extern TEUCHOSNUMERICS_LIB_DLL_EXPORT const char ETypeChar[];
+
+  // Forward declaration
+  namespace details {
+    template<typename ScalarType,
+             bool isComplex = Teuchos::ScalarTraits<ScalarType>::isComplex>
+    class GivensRotator;
+  }
+
+  //! Default implementation for BLAS routines
+  /*!
+   * This class provides the default implementation for the BLAS routines.  It
+   * is put in a separate class so that specializations of BLAS for other types
+   * still have this implementation available.
+   */
+  template<typename OrdinalType, typename ScalarType>
+  class DefaultBLASImpl
+  {
+
+    typedef typename Teuchos::ScalarTraits<ScalarType>::magnitudeType MagnitudeType;
+
+  public:
+    //! @name Constructor/Destructor.
+    //@{
+
+    //! Default constructor.
+    inline DefaultBLASImpl(void) {}
+
+    //! Copy constructor.
+    inline DefaultBLASImpl(const DefaultBLASImpl<OrdinalType, ScalarType>& /*BLAS_source*/) {}
+
+    //! Destructor.
+    inline virtual ~DefaultBLASImpl(void) {}
+    //@}
+
+    //! @name Level 1 BLAS Routines.
+    //@{
+
+    //! The type used for c in ROTG
+    /*! This is MagnitudeType if ScalarType is complex and ScalarType otherwise.
+     */
+    typedef typename details::GivensRotator<ScalarType>::c_type rotg_c_type;
+
+    //! Computes a Givens plane rotation.
+    void ROTG(ScalarType* da, ScalarType* db, rotg_c_type* c, ScalarType* s) const;
+
+    //! Applies a Givens plane rotation.
+    void ROT(const OrdinalType& n, ScalarType* dx, const OrdinalType& incx, ScalarType* dy, const OrdinalType& incy, MagnitudeType* c, ScalarType* s) const;
+
+    //! Scale the vector \c x by the constant \c alpha.
+    void SCAL(const OrdinalType& n, const ScalarType& alpha, ScalarType* x, const OrdinalType& incx) const;
+
+    //! Copy the vector \c x to the vector \c y.
+    void COPY(const OrdinalType& n, const ScalarType* x, const OrdinalType& incx, ScalarType* y, const OrdinalType& incy) const;
+
+    //! Perform the operation: \c y \c <- \c y+alpha*x.
+    template <typename alpha_type, typename x_type>
+    void AXPY(const OrdinalType& n, const alpha_type alpha, const x_type* x, const OrdinalType& incx, ScalarType* y, const OrdinalType& incy) const;
+
+    //! Sum the absolute values of the entries of \c x.
+    typename ScalarTraits<ScalarType>::magnitudeType ASUM(const OrdinalType& n, const ScalarType* x, const OrdinalType& incx) const;
+
+    //! Form the dot product of the vectors \c x and \c y.
+    template <typename x_type, typename y_type>
+    ScalarType DOT(const OrdinalType& n, const x_type* x, const OrdinalType& incx, const y_type* y, const OrdinalType& incy) const;
+
+    //! Compute the 2-norm of the vector \c x.
+    typename ScalarTraits<ScalarType>::magnitudeType NRM2(const OrdinalType& n, const ScalarType* x, const OrdinalType& incx) const;
+
+    //! Return the index of the element of \c x with the maximum magnitude.
+    OrdinalType IAMAX(const OrdinalType& n, const ScalarType* x, const OrdinalType& incx) const;
+    //@}
+
+    //! @name Level 2 BLAS Routines.
+    //@{
+
+    //! Performs the matrix-vector operation:  \c y \c <- \c alpha*A*x+beta*y or \c y \c <- \c alpha*A'*x+beta*y where \c A is a general \c m by \c n matrix.
+    template <typename alpha_type, typename A_type, typename x_type, typename beta_type>
+    void GEMV(ETransp trans, const OrdinalType& m, const OrdinalType& n, const alpha_type alpha, const A_type* A,
+              const OrdinalType& lda, const x_type* x, const OrdinalType& incx, const beta_type beta, ScalarType* y, const OrdinalType& incy) const;
+
+    //! Performs the matrix-vector operation:  \c x \c <- \c A*x or \c x \c <- \c A'*x where \c A is a unit/non-unit \c n by \c n upper/lower triangular matrix.
+    template <typename A_type>
+    void TRMV(EUplo uplo, ETransp trans, EDiag diag, const OrdinalType& n, const A_type* A,
+              const OrdinalType& lda, ScalarType* x, const OrdinalType& incx) const;
+
+    //! \brief Performs the rank 1 operation:  \c A \c <- \c alpha*x*y'+A.
+    /// \note  For complex arithmetic, this routine performs [Z/C]GERU.
+    template <typename alpha_type, typename x_type, typename y_type>
+    void GER(const OrdinalType& m, const OrdinalType& n, const alpha_type alpha, const x_type* x, const OrdinalType& incx,
+             const y_type* y, const OrdinalType& incy, ScalarType* A, const OrdinalType& lda) const;
+    //@}
+
+    //! @name Level 3 BLAS Routines.
+    //@{
+
+    /// \brief General matrix-matrix multiply.
+    ///
+    /// This computes C = alpha*op(A)*op(B) + beta*C.  op(X) here may
+    /// be either X, the transpose of X, or the conjugate transpose of
+    /// X.  op(A) has m rows and k columns, op(B) has k rows and n
+    /// columns, and C has m rows and n columns.
+    template <typename alpha_type, typename A_type, typename B_type, typename beta_type>
+    void GEMM(ETransp transa, ETransp transb, const OrdinalType& m, const OrdinalType& n, const OrdinalType& k, const alpha_type alpha, const A_type* A, const OrdinalType& lda, const B_type* B, const OrdinalType& ldb, const beta_type beta, ScalarType* C, const OrdinalType& ldc) const;
+
+    //! Swap the entries of x and y.
+    void
+    SWAP (const OrdinalType& n, ScalarType* const x, const OrdinalType& incx,
+          ScalarType* const y, const OrdinalType& incy) const;
+
+    //! Performs the matrix-matrix operation: \c C \c <- \c alpha*A*B+beta*C or \c C \c <- \c alpha*B*A+beta*C where \c A is an \c m by \c m or \c n by \c n symmetric matrix and \c B is a general matrix.
+    template <typename alpha_type, typename A_type, typename B_type, typename beta_type>
+    void SYMM(ESide side, EUplo uplo, const OrdinalType& m, const OrdinalType& n, const alpha_type alpha, const A_type* A, const OrdinalType& lda, const B_type* B, const OrdinalType& ldb, const beta_type beta, ScalarType* C, const OrdinalType& ldc) const;
+
+    //! Performs the symmetric rank k operation: \c C <- \c alpha*A*A'+beta*C or \c C <- \c alpha*A'*A+beta*C, where \c alpha and \c beta are scalars, \c C is an \c n by \c n symmetric matrix and \c A is an \c n by \c k matrix in the first case or \c k by \c n matrix in the second case.
+    template <typename alpha_type, typename A_type, typename beta_type>
+    void SYRK(EUplo uplo, ETransp trans, const OrdinalType& n, const OrdinalType& k, const alpha_type alpha, const A_type* A, const OrdinalType& lda, const beta_type beta, ScalarType* C, const OrdinalType& ldc) const;
+
+    //! Performs the matrix-matrix operation: \c B \c <- \c alpha*op(A)*B or \c B \c <- \c alpha*B*op(A) where \c op(A) is an unit/non-unit, upper/lower triangular matrix and \c B is a general matrix.
+    template <typename alpha_type, typename A_type>
+    void TRMM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const OrdinalType& m, const OrdinalType& n,
+                const alpha_type alpha, const A_type* A, const OrdinalType& lda, ScalarType* B, const OrdinalType& ldb) const;
+
+    //! Solves the matrix equations:  \c op(A)*X=alpha*B or \c X*op(A)=alpha*B where \c X and \c B are \c m by \c n matrices, \c A is a unit/non-unit, upper/lower triangular matrix and \c op(A) is \c A or \c A'.  The matrix \c X is overwritten on \c B.
+    template <typename alpha_type, typename A_type>
+    void TRSM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const OrdinalType& m, const OrdinalType& n,
+                const alpha_type alpha, const A_type* A, const OrdinalType& lda, ScalarType* B, const OrdinalType& ldb) const;
+    //@}
+  };
+
+  template<typename OrdinalType, typename ScalarType>
+  class TEUCHOSNUMERICS_LIB_DLL_EXPORT BLAS : public DefaultBLASImpl<OrdinalType,ScalarType>
+  {
+
+    typedef typename Teuchos::ScalarTraits<ScalarType>::magnitudeType MagnitudeType;
+
+  public:
+    //! @name Constructor/Destructor.
+    //@{
+
+    //! Default constructor.
+    inline BLAS(void) {}
+
+    //! Copy constructor.
+    inline BLAS(const BLAS<OrdinalType, ScalarType>& /*BLAS_source*/) {}
+
+    //! Destructor.
+    inline virtual ~BLAS(void) {}
+    //@}
+  };
+
+//------------------------------------------------------------------------------------------
+//      LEVEL 1 BLAS ROUTINES
+//------------------------------------------------------------------------------------------
+
+  /// \namespace details
+  /// \brief Teuchos implementation details.
+  ///
+  /// \warning Teuchos users should not use anything in this
+  ///   namespace.  They should not even assume that the namespace
+  ///   will continue to exist between releases.  The namespace's name
+  ///   itself or anything it contains may change at any time.
+  namespace details {
+
+    // Compute magnitude.
+    template<typename ScalarType, bool isComplex>
+    class MagValue {
+    public:
+      void
+      blas_dabs1(const ScalarType* a, typename ScalarTraits<ScalarType>::magnitudeType* ret) const;
+    };
+
+    // Complex-arithmetic specialization.
+    template<typename ScalarType>
+    class MagValue<ScalarType, true> {
+    public:
+      void
+      blas_dabs1(const ScalarType* a, typename ScalarTraits<ScalarType>::magnitudeType* ret) const;
+    };
+
+    // Real-arithmetic specialization.
+    template<typename ScalarType>
+    class MagValue<ScalarType, false> {
+    public:
+      void
+      blas_dabs1(const ScalarType* a, ScalarType* ret) const;
+    };
+
+    template<typename ScalarType, bool isComplex>
+    class GivensRotator {};
+
+    // Complex-arithmetic specialization.
+    template<typename ScalarType>
+    class GivensRotator<ScalarType, true> {
+    public:
+      typedef typename ScalarTraits<ScalarType>::magnitudeType c_type;
+      void
+      ROTG (ScalarType* ca,
+            ScalarType* cb,
+            typename ScalarTraits<ScalarType>::magnitudeType* c,
+            ScalarType* s) const;
+    };
+
+    // Real-arithmetic specialization.
+    template<typename ScalarType>
+    class GivensRotator<ScalarType, false> {
+    public:
+      typedef ScalarType c_type;
+      void
+      ROTG (ScalarType* da,
+            ScalarType* db,
+            ScalarType* c,
+            ScalarType* s) const;
+
+      /// Return ABS(x) if y > 0 or y is +0, else -ABS(x) (if y is -0 or < 0).
+      ///
+      /// Note that SIGN respects IEEE 754 floating-point signed zero.
+      /// This is a hopefully correct implementation of the Fortran
+      /// type-generic SIGN intrinsic.  ROTG for complex arithmetic
+      /// doesn't require this function.  C99 provides a copysign()
+      /// math library function, but we are not able to rely on the
+      /// existence of C99 functions here.
+      ///
+      /// We provide this method on purpose only for the
+      /// real-arithmetic specialization of GivensRotator.  Complex
+      /// numbers don't have a sign; they have an angle.
+      ScalarType SIGN (const ScalarType& x, const ScalarType& y) const {
+        typedef ScalarTraits<ScalarType> STS;
+
+        if (y > STS::zero()) {
+          return STS::magnitude (x);
+        } else if (y < STS::zero()) {
+          return -STS::magnitude (x);
+        } else { // y == STS::zero()
+          // Suppose that ScalarType& implements signed zero, as IEEE
+          // 754 - compliant floating-point numbers should.  You can't
+          // use == to test for signed zero, since +0 == -0.  However,
+          // 1/0 = Inf > 0 and 1/-0 = -Inf < 0.  Let's hope ScalarType
+          // supports Inf... we don't need to test for Inf, just see
+          // if it's greater than or less than zero.
+          //
+          // NOTE: This ONLY works if ScalarType is real.  Complex
+          // infinity doesn't have a sign, so we can't compare it with
+          // zero.  That's OK, because finite complex numbers don't
+          // have a sign either; they have an angle.
+          ScalarType signedInfinity = STS::one() / y;
+          if (signedInfinity > STS::zero()) {
+            return STS::magnitude (x);
+          } else {
+            // Even if ScalarType doesn't implement signed zero,
+            // Fortran's SIGN intrinsic returns -ABS(X) if the second
+            // argument Y is zero.  We imitate this behavior here.
+            return -STS::magnitude (x);
+          }
+        }
+      }
+    };
+
+    // Implementation of complex-arithmetic specialization.
+    template<typename ScalarType>
+    void
+    GivensRotator<ScalarType, true>::
+    ROTG (ScalarType* ca,
+          ScalarType* cb,
+          typename ScalarTraits<ScalarType>::magnitudeType* c,
+          ScalarType* s) const
+    {
+      typedef ScalarTraits<ScalarType> STS;
+      typedef typename STS::magnitudeType MagnitudeType;
+      typedef ScalarTraits<MagnitudeType> STM;
+
+      // This is a straightforward translation into C++ of the
+      // reference BLAS' implementation of ZROTG.  You can get
+      // the Fortran 77 source code of ZROTG here:
+      //
+      // http://www.netlib.org/blas/zrotg.f
+      //
+      // I used the following rules to translate Fortran types and
+      // intrinsic functions into C++:
+      //
+      // DOUBLE PRECISION -> MagnitudeType
+      // DOUBLE COMPLEX -> ScalarType
+      // CDABS -> STS::magnitude
+      // DCMPLX -> ScalarType constructor (assuming that ScalarType
+      //   is std::complex<MagnitudeType>)
+      // DCONJG -> STS::conjugate
+      // DSQRT -> STM::squareroot
+      ScalarType alpha;
+      MagnitudeType norm, scale;
+
+      if (STS::magnitude (*ca) == STM::zero()) {
+        *c = STM::zero();
+        *s = STS::one();
+        *ca = *cb;
+      } else {
+        scale = STS::magnitude (*ca) + STS::magnitude (*cb);
+        { // I introduced temporaries into the translated BLAS code in
+          // order to make the expression easier to read and also save a
+          // few floating-point operations.
+          const MagnitudeType ca_scaled =
+            STS::magnitude (*ca / ScalarType(scale, STM::zero()));
+          const MagnitudeType cb_scaled =
+            STS::magnitude (*cb / ScalarType(scale, STM::zero()));
+          norm = scale *
+            STM::squareroot (ca_scaled*ca_scaled + cb_scaled*cb_scaled);
+        }
+        alpha = *ca / STS::magnitude (*ca);
+        *c = STS::magnitude (*ca) / norm;
+        *s = alpha * STS::conjugate (*cb) / norm;
+        *ca = alpha * norm;
+      }
+    }
+
+    // Implementation of real-arithmetic specialization.
+    template<typename ScalarType>
+    void
+    GivensRotator<ScalarType, false>::
+    ROTG (ScalarType* da,
+          ScalarType* db,
+          ScalarType* c,
+          ScalarType* s) const
+    {
+      typedef ScalarTraits<ScalarType> STS;
+
+      // This is a straightforward translation into C++ of the
+      // reference BLAS' implementation of DROTG.  You can get
+      // the Fortran 77 source code of DROTG here:
+      //
+      // http://www.netlib.org/blas/drotg.f
+      //
+      // I used the following rules to translate Fortran types and
+      // intrinsic functions into C++:
+      //
+      // DOUBLE PRECISION -> ScalarType
+      // DABS -> STS::magnitude
+      // DSQRT -> STM::squareroot
+      // DSIGN -> SIGN (see below)
+      //
+      // DSIGN(x,y) (the old DOUBLE PRECISION type-specific form of
+      // the Fortran type-generic SIGN intrinsic) required special
+      // translation, which we did in a separate utility function in
+      // the specializaton of GivensRotator for real arithmetic.
+      // (ROTG for complex arithmetic doesn't require this function.)
+      // C99 provides a copysign() math library function, but we are
+      // not able to rely on the existence of C99 functions here.
+      ScalarType r, roe, scale, z;
+
+      roe = *db;
+      if (STS::magnitude (*da) > STS::magnitude (*db)) {
+        roe = *da;
+      }
+      scale = STS::magnitude (*da) + STS::magnitude (*db);
+      if (scale == STS::zero()) {
+        *c = STS::one();
+        *s = STS::zero();
+        r = STS::zero();
+        z = STS::zero();
+      } else {
+        // I introduced temporaries into the translated BLAS code in
+        // order to make the expression easier to read and also save
+        // a few floating-point& operations.
+        const ScalarType da_scaled = *da / scale;
+        const ScalarType db_scaled = *db / scale;
+        r = scale * STS::squareroot (da_scaled*da_scaled + db_scaled*db_scaled);
+        r = SIGN (STS::one(), roe) * r;
+        *c = *da / r;
+        *s = *db / r;
+        z = STS::one();
+        if (STS::magnitude (*da) > STS::magnitude (*db)) {
+          z = *s;
+        }
+        if (STS::magnitude (*db) >= STS::magnitude (*da) && *c != STS::zero()) {
+          z = STS::one() / *c;
+        }
+      }
+
+      *da = r;
+      *db = z;
+    }
+
+    // Real-valued implementation of MagValue
+    template<typename ScalarType>
+    void
+    MagValue<ScalarType, false>::
+    blas_dabs1(const ScalarType* a, ScalarType* ret) const
+    {
+      *ret = Teuchos::ScalarTraits<ScalarType>::magnitude( *a );
+    }
+
+    // Complex-valued implementation of MagValue
+    template<typename ScalarType>
+    void
+    MagValue<ScalarType, true>::
+    blas_dabs1(const ScalarType* a, typename ScalarTraits<ScalarType>::magnitudeType* ret) const
+    {
+      *ret = ScalarTraits<typename ScalarTraits<ScalarType>::magnitudeType>::magnitude(a->real());
+      *ret += ScalarTraits<typename ScalarTraits<ScalarType>::magnitudeType>::magnitude(a->imag());
+    }
+
+  } // namespace details
+
+  template<typename OrdinalType, typename ScalarType>
+  void
+  DefaultBLASImpl<OrdinalType, ScalarType>::
+  ROTG (ScalarType* da,
+        ScalarType* db,
+        rotg_c_type* c,
+        ScalarType* s) const
+  {
+    details::GivensRotator<ScalarType> rotator;
+    rotator.ROTG (da, db, c, s);
+  }
+
+  template<typename OrdinalType, typename ScalarType>
+  void DefaultBLASImpl<OrdinalType,ScalarType>::ROT(const OrdinalType& n, ScalarType* dx, const OrdinalType& incx, ScalarType* dy, const OrdinalType& incy, MagnitudeType* c, ScalarType* s) const
+  {
+    OrdinalType izero = OrdinalTraits<OrdinalType>::zero();
+    ScalarType sconj = Teuchos::ScalarTraits<ScalarType>::conjugate(*s);
+    if (n <= 0) return;
+    if (incx==1 && incy==1) {
+      for(OrdinalType i=0; i<n; ++i) {
+        ScalarType temp = *c*dx[i] + *s*dy[i];
+        dy[i] = *c*dy[i] - sconj*dx[i];
+        dx[i] = temp;
+      }
+    }
+    else {
+      OrdinalType ix = 0, iy = 0;
+      if (incx < izero) ix = (-n+1)*incx;
+      if (incy < izero) iy = (-n+1)*incy;
+      for(OrdinalType i=0; i<n; ++i) {
+        ScalarType temp = *c*dx[ix] + *s*dy[iy];
+        dy[iy] = *c*dy[iy] - sconj*dx[ix];
+        dx[ix] = temp;
+        ix += incx;
+        iy += incy;
+      }
+    }
+  }
+
+  template<typename OrdinalType, typename ScalarType>
+  void DefaultBLASImpl<OrdinalType, ScalarType>::SCAL(const OrdinalType& n, const ScalarType& alpha, ScalarType* x, const OrdinalType& incx) const
+  {
+    OrdinalType izero = OrdinalTraits<OrdinalType>::zero();
+    OrdinalType ione = OrdinalTraits<OrdinalType>::one();
+    OrdinalType i, ix = izero;
+
+    if ( n < ione || incx < ione )
+      return;
+
+    // Scale the vector.
+    for(i = izero; i < n; i++)
+    {
+      x[ix] *= alpha;
+      ix += incx;
+    }
+  } /* end SCAL */
+
+  template<typename OrdinalType, typename ScalarType>
+  void DefaultBLASImpl<OrdinalType, ScalarType>::COPY(const OrdinalType& n, const ScalarType* x, const OrdinalType& incx, ScalarType* y, const OrdinalType& incy) const
+  {
+    OrdinalType izero = OrdinalTraits<OrdinalType>::zero();
+    OrdinalType ione = OrdinalTraits<OrdinalType>::one();
+    OrdinalType i, ix = izero, iy = izero;
+    if ( n > izero ) {
+        // Set the initial indices (ix, iy).
+        if (incx < izero) { ix = (-n+ione)*incx; }
+        if (incy < izero) { iy = (-n+ione)*incy; }
+
+        for(i = izero; i < n; i++)
+          {
+            y[iy] = x[ix];
+            ix += incx;
+            iy += incy;
+          }
+      }
+  } /* end COPY */
+
+  template<typename OrdinalType, typename ScalarType>
+  template <typename alpha_type, typename x_type>
+  void DefaultBLASImpl<OrdinalType, ScalarType>::AXPY(const OrdinalType& n, const alpha_type alpha, const x_type* x, const OrdinalType& incx, ScalarType* y, const OrdinalType& incy) const
+  {
+    OrdinalType izero = OrdinalTraits<OrdinalType>::zero();
+    OrdinalType ione = OrdinalTraits<OrdinalType>::one();
+    OrdinalType i, ix = izero, iy = izero;
+    if( n > izero && alpha != ScalarTraits<alpha_type>::zero())
+      {
+        // Set the initial indices (ix, iy).
+        if (incx < izero) { ix = (-n+ione)*incx; }
+        if (incy < izero) { iy = (-n+ione)*incy; }
+
+        for(i = izero; i < n; i++)
+          {
+            y[iy] += alpha * x[ix];
+            ix += incx;
+            iy += incy;
+          }
+      }
+  } /* end AXPY */
+
+  template<typename OrdinalType, typename ScalarType>
+  typename ScalarTraits<ScalarType>::magnitudeType DefaultBLASImpl<OrdinalType, ScalarType>::ASUM(const OrdinalType& n, const ScalarType* x, const OrdinalType& incx) const
+  {
+    OrdinalType izero = OrdinalTraits<OrdinalType>::zero();
+    OrdinalType ione = OrdinalTraits<OrdinalType>::one();
+    typename ScalarTraits<ScalarType>::magnitudeType temp, result =
+      ScalarTraits<typename ScalarTraits<ScalarType>::magnitudeType>::zero();
+    OrdinalType i, ix = izero;
+
+    if ( n < ione || incx < ione )
+      return result;
+
+    details::MagValue<ScalarType, ScalarTraits<ScalarType>::isComplex> mval;
+    for (i = izero; i < n; i++)
+      {
+        mval.blas_dabs1( &x[ix], &temp );
+        result += temp;
+        ix += incx;
+      }
+
+    return result;
+  } /* end ASUM */
+
+  template<typename OrdinalType, typename ScalarType>
+  template <typename x_type, typename y_type>
+  ScalarType DefaultBLASImpl<OrdinalType, ScalarType>::DOT(const OrdinalType& n, const x_type* x, const OrdinalType& incx, const y_type* y, const OrdinalType& incy) const
+  {
+    OrdinalType izero = OrdinalTraits<OrdinalType>::zero();
+    OrdinalType ione = OrdinalTraits<OrdinalType>::one();
+    ScalarType result = ScalarTraits<ScalarType>::zero();
+    OrdinalType i, ix = izero, iy = izero;
+    if( n > izero )
+      {
+        // Set the initial indices (ix,iy).
+        if (incx < izero) { ix = (-n+ione)*incx; }
+        if (incy < izero) { iy = (-n+ione)*incy; }
+
+        for(i = izero; i < n; i++)
+          {
+            result += ScalarTraits<x_type>::conjugate(x[ix]) * y[iy];
+            ix += incx;
+            iy += incy;
+          }
+      }
+    return result;
+  } /* end DOT */
+
+  template<typename OrdinalType, typename ScalarType>
+  typename ScalarTraits<ScalarType>::magnitudeType DefaultBLASImpl<OrdinalType, ScalarType>::NRM2(const OrdinalType& n, const ScalarType* x, const OrdinalType& incx) const
+  {
+    OrdinalType izero = OrdinalTraits<OrdinalType>::zero();
+    OrdinalType ione = OrdinalTraits<OrdinalType>::one();
+    typename ScalarTraits<ScalarType>::magnitudeType result =
+      ScalarTraits<typename ScalarTraits<ScalarType>::magnitudeType>::zero();
+    OrdinalType i, ix = izero;
+
+    if ( n < ione || incx < ione )
+      return result;
+
+    for(i = izero; i < n; i++)
+      {
+        result += ScalarTraits<ScalarType>::magnitude(ScalarTraits<ScalarType>::conjugate(x[ix]) * x[ix]);
+        ix += incx;
+      }
+    result = ScalarTraits<typename ScalarTraits<ScalarType>::magnitudeType>::squareroot(result);
+    return result;
+  } /* end NRM2 */
+
+  template<typename OrdinalType, typename ScalarType>
+  OrdinalType DefaultBLASImpl<OrdinalType, ScalarType>::IAMAX(const OrdinalType& n, const ScalarType* x, const OrdinalType& incx) const
+  {
+    OrdinalType izero = OrdinalTraits<OrdinalType>::zero();
+    OrdinalType ione = OrdinalTraits<OrdinalType>::one();
+    OrdinalType result = izero, ix = izero, i;
+    typename ScalarTraits<ScalarType>::magnitudeType curval =
+      ScalarTraits<typename ScalarTraits<ScalarType>::magnitudeType>::zero();
+    typename ScalarTraits<ScalarType>::magnitudeType maxval =
+      ScalarTraits<typename ScalarTraits<ScalarType>::magnitudeType>::zero();
+
+    if ( n < ione || incx < ione )
+      return result;
+
+    details::MagValue<ScalarType, ScalarTraits<ScalarType>::isComplex> mval;
+
+    mval.blas_dabs1( &x[ix], &maxval );
+    ix += incx;
+    for(i = ione; i < n; i++)
+      {
+        mval.blas_dabs1( &x[ix], &curval );
+        if(curval > maxval)
+          {
+            result = i;
+            maxval = curval;
+          }
+        ix += incx;
+      }
+
+    return result + 1; // the BLAS I?AMAX functions return 1-indexed (Fortran-style) values
+  } /* end IAMAX */
+
+//------------------------------------------------------------------------------------------
+//      LEVEL 2 BLAS ROUTINES
+//------------------------------------------------------------------------------------------
+  template<typename OrdinalType, typename ScalarType>
+  template <typename alpha_type, typename A_type, typename x_type, typename beta_type>
+  void DefaultBLASImpl<OrdinalType, ScalarType>::GEMV(ETransp trans, const OrdinalType& m, const OrdinalType& n, const alpha_type alpha, const A_type* A, const OrdinalType& lda, const x_type* x, const OrdinalType& incx, const beta_type beta, ScalarType* y, const OrdinalType& incy) const
+  {
+    OrdinalType izero = OrdinalTraits<OrdinalType>::zero();
+    OrdinalType ione = OrdinalTraits<OrdinalType>::one();
+    alpha_type alpha_zero = ScalarTraits<alpha_type>::zero();
+    beta_type beta_zero = ScalarTraits<beta_type>::zero();
+    x_type x_zero = ScalarTraits<x_type>::zero();
+    ScalarType y_zero = ScalarTraits<ScalarType>::zero();
+    beta_type beta_one = ScalarTraits<beta_type>::one();
+    bool noConj = true;
+    bool BadArgument = false;
+
+    // Quick return if there is nothing to do!
+    if( m == izero || n == izero || ( alpha == alpha_zero && beta == beta_one ) ){ return; }
+
+    // Otherwise, we need to check the argument list.
+    if( m < izero ) {
+        std::cout << "BLAS::GEMV Error: M == " << m << std::endl;
+        BadArgument = true;
+    }
+    if( n < izero ) {
+        std::cout << "BLAS::GEMV Error: N == " << n << std::endl;
+        BadArgument = true;
+    }
+    if( lda < m ) {
+        std::cout << "BLAS::GEMV Error: LDA < MAX(1,M)"<< std::endl;
+        BadArgument = true;
+    }
+    if( incx == izero ) {
+        std::cout << "BLAS::GEMV Error: INCX == 0"<< std::endl;
+        BadArgument = true;
+    }
+    if( incy == izero ) {
+        std::cout << "BLAS::GEMV Error: INCY == 0"<< std::endl;
+        BadArgument = true;
+    }
+
+    if(!BadArgument) {
+      OrdinalType i, j, lenx, leny, ix, iy, jx, jy;
+      OrdinalType kx = izero, ky = izero;
+      ScalarType temp;
+
+      // Determine the lengths of the vectors x and y.
+      if(ETranspChar[trans] == 'N') {
+        lenx = n;
+        leny = m;
+      } else {
+        lenx = m;
+        leny = n;
+      }
+
+      // Determine if this is a conjugate tranpose
+      noConj = (ETranspChar[trans] == 'T');
+
+      // Set the starting pointers for the vectors x and y if incx/y < 0.
+      if (incx < izero ) { kx =  (ione - lenx)*incx; }
+      if (incy < izero ) { ky =  (ione - leny)*incy; }
+
+      // Form y = beta*y
+      ix = kx; iy = ky;
+      if(beta != beta_one) {
+        if (incy == ione) {
+          if (beta == beta_zero) {
+            for(i = izero; i < leny; i++) { y[i] = y_zero; }
+          } else {
+            for(i = izero; i < leny; i++) { y[i] *= beta; }
+          }
+        } else {
+          if (beta == beta_zero) {
+            for(i = izero; i < leny; i++) {
+              y[iy] = y_zero;
+              iy += incy;
+            }
+          } else {
+            for(i = izero; i < leny; i++) {
+              y[iy] *= beta;
+              iy += incy;
+            }
+          }
+        }
+      }
+
+      // Return if we don't have to do anything more.
+      if(alpha == alpha_zero) { return; }
+
+      if( ETranspChar[trans] == 'N' ) {
+        // Form y = alpha*A*y
+        jx = kx;
+        if (incy == ione) {
+          for(j = izero; j < n; j++) {
+            if (x[jx] != x_zero) {
+              temp = alpha*x[jx];
+              for(i = izero; i < m; i++) {
+                y[i] += temp*A[j*lda + i];
+              }
+            }
+            jx += incx;
+          }
+        } else {
+          for(j = izero; j < n; j++) {
+            if (x[jx] != x_zero) {
+              temp = alpha*x[jx];
+              iy = ky;
+              for(i = izero; i < m; i++) {
+                y[iy] += temp*A[j*lda + i];
+                iy += incy;
+              }
+            }
+            jx += incx;
+          }
+        }
+      } else {
+        jy = ky;
+        if (incx == ione) {
+          for(j = izero; j < n; j++) {
+            temp = y_zero;
+            if ( noConj ) {
+              for(i = izero; i < m; i++) {
+                temp += A[j*lda + i]*x[i];
+              }
+            } else {
+              for(i = izero; i < m; i++) {
+                temp += ScalarTraits<A_type>::conjugate(A[j*lda + i])*x[i];
+              }
+            }
+            y[jy] += alpha*temp;
+            jy += incy;
+          }
+        } else {
+          for(j = izero; j < n; j++) {
+            temp = y_zero;
+            ix = kx;
+            if ( noConj ) {
+              for (i = izero; i < m; i++) {
+                temp += A[j*lda + i]*x[ix];
+                ix += incx;
+              }
+            } else {
+              for (i = izero; i < m; i++) {
+                temp += ScalarTraits<A_type>::conjugate(A[j*lda + i])*x[ix];
+                ix += incx;
+              }
+            }
+            y[jy] += alpha*temp;
+            jy += incy;
+          }
+        }
+      }
+    } /* if (!BadArgument) */
+  } /* end GEMV */
+
+ template<typename OrdinalType, typename ScalarType>
+ template <typename A_type>
+ void DefaultBLASImpl<OrdinalType, ScalarType>::TRMV(EUplo uplo, ETransp trans, EDiag diag, const OrdinalType& n, const A_type* A, const OrdinalType& lda, ScalarType* x, const OrdinalType& incx) const
+  {
+    OrdinalType izero = OrdinalTraits<OrdinalType>::zero();
+    OrdinalType ione = OrdinalTraits<OrdinalType>::one();
+    ScalarType zero = ScalarTraits<ScalarType>::zero();
+    bool BadArgument = false;
+    bool noConj = true;
+
+    // Quick return if there is nothing to do!
+    if( n == izero ){ return; }
+
+    // Otherwise, we need to check the argument list.
+    if( n < izero ) {
+      std::cout << "BLAS::TRMV Error: N == " << n << std::endl;
+      BadArgument = true;
+    }
+    if( lda < n ) {
+      std::cout << "BLAS::TRMV Error: LDA < MAX(1,N)"<< std::endl;
+      BadArgument = true;
+    }
+    if( incx == izero ) {
+      std::cout << "BLAS::TRMV Error: INCX == 0"<< std::endl;
+      BadArgument = true;
+    }
+
+    if(!BadArgument) {
+      OrdinalType i, j, ix, jx, kx = izero;
+      ScalarType temp;
+      bool noUnit = (EDiagChar[diag] == 'N');
+
+      // Determine if this is a conjugate tranpose
+      noConj = (ETranspChar[trans] == 'T');
+
+      // Set the starting pointer for the vector x if incx < 0.
+      if (incx < izero) { kx = (-n+ione)*incx; }
+
+      // Start the operations for a nontransposed triangular matrix
+      if (ETranspChar[trans] == 'N') {
+        /* Compute x = A*x */
+        if (EUploChar[uplo] == 'U') {
+          /* A is an upper triangular matrix */
+          if (incx == ione) {
+            for (j=izero; j<n; j++) {
+              if (x[j] != zero) {
+                temp = x[j];
+                for (i=izero; i<j; i++) {
+                  x[i] += temp*A[j*lda + i];
+                }
+                if ( noUnit )
+                  x[j] *= A[j*lda + j];
+              }
+            }
+          } else {
+            jx = kx;
+            for (j=izero; j<n; j++) {
+              if (x[jx] != zero) {
+                temp = x[jx];
+                ix = kx;
+                for (i=izero; i<j; i++) {
+                  x[ix] += temp*A[j*lda + i];
+                  ix += incx;
+                }
+                if ( noUnit )
+                  x[jx] *= A[j*lda + j];
+              }
+              jx += incx;
+            }
+          } /* if (incx == ione) */
+        } else { /* A is a lower triangular matrix */
+          if (incx == ione) {
+            for (j=n-ione; j>-ione; j--) {
+              if (x[j] != zero) {
+                temp = x[j];
+                for (i=n-ione; i>j; i--) {
+                  x[i] += temp*A[j*lda + i];
+                }
+                if ( noUnit )
+                  x[j] *= A[j*lda + j];
+              }
+            }
+          } else {
+            kx += (n-ione)*incx;
+            jx = kx;
+            for (j=n-ione; j>-ione; j--) {
+              if (x[jx] != zero) {
+                temp = x[jx];
+                ix = kx;
+                for (i=n-ione; i>j; i--) {
+                  x[ix] += temp*A[j*lda + i];
+                  ix -= incx;
+                }
+                if ( noUnit )
+                  x[jx] *= A[j*lda + j];
+              }
+              jx -= incx;
+            }
+          }
+        } /* if (EUploChar[uplo]=='U') */
+      } else { /* A is transposed/conjugated */
+        /* Compute x = A'*x */
+        if (EUploChar[uplo]=='U') {
+          /* A is an upper triangular matrix */
+          if (incx == ione) {
+            for (j=n-ione; j>-ione; j--) {
+              temp = x[j];
+              if ( noConj ) {
+                if ( noUnit )
+                  temp *= A[j*lda + j];
+                for (i=j-ione; i>-ione; i--) {
+                  temp += A[j*lda + i]*x[i];
+                }
+              } else {
+                if ( noUnit )
+                  temp *= ScalarTraits<A_type>::conjugate(A[j*lda + j]);
+                for (i=j-ione; i>-ione; i--) {
+                  temp += ScalarTraits<A_type>::conjugate(A[j*lda + i])*x[i];
+                }
+              }
+              x[j] = temp;
+            }
+          } else {
+            jx = kx + (n-ione)*incx;
+            for (j=n-ione; j>-ione; j--) {
+              temp = x[jx];
+              ix = jx;
+              if ( noConj ) {
+                if ( noUnit )
+                  temp *= A[j*lda + j];
+                for (i=j-ione; i>-ione; i--) {
+                  ix -= incx;
+                  temp += A[j*lda + i]*x[ix];
+                }
+              } else {
+                if ( noUnit )
+                  temp *= ScalarTraits<A_type>::conjugate(A[j*lda + j]);
+                for (i=j-ione; i>-ione; i--) {
+                  ix -= incx;
+                  temp += ScalarTraits<A_type>::conjugate(A[j*lda + i])*x[ix];
+                }
+              }
+              x[jx] = temp;
+              jx -= incx;
+            }
+          }
+        } else {
+          /* A is a lower triangular matrix */
+          if (incx == ione) {
+            for (j=izero; j<n; j++) {
+              temp = x[j];
+              if ( noConj ) {
+                if ( noUnit )
+                  temp *= A[j*lda + j];
+                for (i=j+ione; i<n; i++) {
+                  temp += A[j*lda + i]*x[i];
+                }
+              } else {
+                if ( noUnit )
+                  temp *= ScalarTraits<A_type>::conjugate(A[j*lda + j]);
+                for (i=j+ione; i<n; i++) {
+                  temp += ScalarTraits<A_type>::conjugate(A[j*lda + i])*x[i];
+                }
+              }
+              x[j] = temp;
+            }
+          } else {
+            jx = kx;
+            for (j=izero; j<n; j++) {
+              temp = x[jx];
+              ix = jx;
+              if ( noConj ) {
+                if ( noUnit )
+                  temp *= A[j*lda + j];
+                for (i=j+ione; i<n; i++) {
+                  ix += incx;
+                  temp += A[j*lda + i]*x[ix];
+                }
+              } else {
+                if ( noUnit )
+                  temp *= ScalarTraits<A_type>::conjugate(A[j*lda + j]);
+                for (i=j+ione; i<n; i++) {
+                  ix += incx;
+                  temp += ScalarTraits<A_type>::conjugate(A[j*lda + i])*x[ix];
+                }
+              }
+              x[jx] = temp;
+              jx += incx;
+            }
+          }
+        } /* if (EUploChar[uplo]=='U') */
+      } /* if (ETranspChar[trans]=='N') */
+    } /* if (!BadArgument) */
+  } /* end TRMV */
+
+  template<typename OrdinalType, typename ScalarType>
+  template <typename alpha_type, typename x_type, typename y_type>
+  void DefaultBLASImpl<OrdinalType, ScalarType>::GER(const OrdinalType& m, const OrdinalType& n, const alpha_type alpha, const x_type* x, const OrdinalType& incx, const y_type* y, const OrdinalType& incy, ScalarType* A, const OrdinalType& lda) const
+  {
+    OrdinalType izero = OrdinalTraits<OrdinalType>::zero();
+    OrdinalType ione = OrdinalTraits<OrdinalType>::one();
+    alpha_type alpha_zero = ScalarTraits<alpha_type>::zero();
+    y_type y_zero = ScalarTraits<y_type>::zero();
+    bool BadArgument = false;
+
+    // Quick return if there is nothing to do!
+    if( m == izero || n == izero || alpha == alpha_zero ){ return; }
+
+    // Otherwise, we need to check the argument list.
+    if( m < izero ) {
+        std::cout << "BLAS::GER Error: M == " << m << std::endl;
+        BadArgument = true;
+    }
+    if( n < izero ) {
+        std::cout << "BLAS::GER Error: N == " << n << std::endl;
+        BadArgument = true;
+    }
+    if( lda < m ) {
+        std::cout << "BLAS::GER Error: LDA < MAX(1,M)"<< std::endl;
+        BadArgument = true;
+    }
+    if( incx == 0 ) {
+        std::cout << "BLAS::GER Error: INCX == 0"<< std::endl;
+        BadArgument = true;
+    }
+    if( incy == 0 ) {
+        std::cout << "BLAS::GER Error: INCY == 0"<< std::endl;
+        BadArgument = true;
+    }
+
+    if(!BadArgument) {
+      OrdinalType i, j, ix, jy = izero, kx = izero;
+      ScalarType temp;
+
+      // Set the starting pointers for the vectors x and y if incx/y < 0.
+      if (incx < izero) { kx = (-m+ione)*incx; }
+      if (incy < izero) { jy = (-n+ione)*incy; }
+
+      // Start the operations for incx == 1
+      if( incx == ione ) {
+        for( j=izero; j<n; j++ ) {
+          if ( y[jy] != y_zero ) {
+            temp = alpha*y[jy];
+            for ( i=izero; i<m; i++ ) {
+              A[j*lda + i] += x[i]*temp;
+            }
+          }
+          jy += incy;
+        }
+      }
+      else { // Start the operations for incx != 1
+        for( j=izero; j<n; j++ ) {
+          if ( y[jy] != y_zero ) {
+            temp = alpha*y[jy];
+            ix = kx;
+            for( i=izero; i<m; i++ ) {
+              A[j*lda + i] += x[ix]*temp;
+              ix += incx;
+            }
+          }
+          jy += incy;
+        }
+      }
+    } /* if(!BadArgument) */
+  } /* end GER */
+
+//------------------------------------------------------------------------------------------
+//      LEVEL 3 BLAS ROUTINES
+//------------------------------------------------------------------------------------------
+
+  template<typename OrdinalType, typename ScalarType>
+  template <typename alpha_type, typename A_type, typename B_type, typename beta_type>
+  void DefaultBLASImpl<OrdinalType, ScalarType>::GEMM(ETransp transa, ETransp transb, const OrdinalType& m, const OrdinalType& n, const OrdinalType& k, const alpha_type alpha, const A_type* A, const OrdinalType& lda, const B_type* B, const OrdinalType& ldb, const beta_type beta, ScalarType* C, const OrdinalType& ldc) const
+  {
+
+    OrdinalType izero = OrdinalTraits<OrdinalType>::zero();
+    alpha_type alpha_zero = ScalarTraits<alpha_type>::zero();
+    beta_type beta_zero = ScalarTraits<beta_type>::zero();
+    B_type B_zero = ScalarTraits<B_type>::zero();
+    ScalarType C_zero = ScalarTraits<ScalarType>::zero();
+    beta_type beta_one = ScalarTraits<beta_type>::one();
+    OrdinalType i, j, p;
+    OrdinalType NRowA = m, NRowB = k;
+    ScalarType temp;
+    bool BadArgument = false;
+    bool conjA = false, conjB = false;
+
+    // Change dimensions of matrix if either matrix is transposed
+    if( !(ETranspChar[transa]=='N') ) {
+      NRowA = k;
+    }
+    if( !(ETranspChar[transb]=='N') ) {
+      NRowB = n;
+    }
+
+    // Quick return if there is nothing to do!
+    if( (m==izero) || (n==izero) || (((alpha==alpha_zero)||(k==izero)) && (beta==beta_one)) ){ return; }
+    if( m < izero ) {
+      std::cout << "BLAS::GEMM Error: M == " << m << std::endl;
+      BadArgument = true;
+    }
+    if( n < izero ) {
+      std::cout << "BLAS::GEMM Error: N == " << n << std::endl;
+      BadArgument = true;
+    }
+    if( k < izero ) {
+      std::cout << "BLAS::GEMM Error: K == " << k << std::endl;
+      BadArgument = true;
+    }
+    if( lda < NRowA ) {
+      std::cout << "BLAS::GEMM Error: LDA < "<<NRowA<<std::endl;
+      BadArgument = true;
+    }
+    if( ldb < NRowB ) {
+      std::cout << "BLAS::GEMM Error: LDB < "<<NRowB<<std::endl;
+      BadArgument = true;
+    }
+     if( ldc < m ) {
+      std::cout << "BLAS::GEMM Error: LDC < MAX(1,M)"<< std::endl;
+      BadArgument = true;
+    }
+
+    if(!BadArgument) {
+
+      // Determine if this is a conjugate tranpose
+      conjA = (ETranspChar[transa] == 'C');
+      conjB = (ETranspChar[transb] == 'C');
+
+      // Only need to scale the resulting matrix C.
+      if( alpha == alpha_zero ) {
+        if( beta == beta_zero ) {
+          for (j=izero; j<n; j++) {
+            for (i=izero; i<m; i++) {
+              C[j*ldc + i] = C_zero;
+            }
+          }
+        } else {
+          for (j=izero; j<n; j++) {
+            for (i=izero; i<m; i++) {
+              C[j*ldc + i] *= beta;
+            }
+          }
+        }
+        return;
+      }
+      //
+      // Now start the operations.
+      //
+      if ( ETranspChar[transb]=='N' ) {
+        if ( ETranspChar[transa]=='N' ) {
+          // Compute C = alpha*A*B + beta*C
+          for (j=izero; j<n; j++) {
+            if( beta == beta_zero ) {
+              for (i=izero; i<m; i++){
+                C[j*ldc + i] = C_zero;
+              }
+            } else if( beta != beta_one ) {
+              for (i=izero; i<m; i++){
+                C[j*ldc + i] *= beta;
+              }
+            }
+            for (p=izero; p<k; p++){
+              if (B[j*ldb + p] != B_zero ){
+                temp = alpha*B[j*ldb + p];
+                for (i=izero; i<m; i++) {
+                  C[j*ldc + i] += temp*A[p*lda + i];
+                }
+              }
+            }
+          }
+        } else if ( conjA ) {
+          // Compute C = alpha*conj(A')*B + beta*C
+          for (j=izero; j<n; j++) {
+            for (i=izero; i<m; i++) {
+              temp = C_zero;
+              for (p=izero; p<k; p++) {
+                temp += ScalarTraits<A_type>::conjugate(A[i*lda + p])*B[j*ldb + p];
+              }
+              if (beta == beta_zero) {
+                C[j*ldc + i] = alpha*temp;
+              } else {
+                C[j*ldc + i] = alpha*temp + beta*C[j*ldc + i];
+              }
+            }
+          }
+        } else {
+          // Compute C = alpha*A'*B + beta*C
+          for (j=izero; j<n; j++) {
+            for (i=izero; i<m; i++) {
+              temp = C_zero;
+              for (p=izero; p<k; p++) {
+                temp += A[i*lda + p]*B[j*ldb + p];
+              }
+              if (beta == beta_zero) {
+                C[j*ldc + i] = alpha*temp;
+              } else {
+                C[j*ldc + i] = alpha*temp + beta*C[j*ldc + i];
+              }
+            }
+          }
+        }
+      } else if ( ETranspChar[transa]=='N' ) {
+        if ( conjB ) {
+          // Compute C = alpha*A*conj(B') + beta*C
+          for (j=izero; j<n; j++) {
+            if (beta == beta_zero) {
+              for (i=izero; i<m; i++) {
+                C[j*ldc + i] = C_zero;
+              }
+            } else if ( beta != beta_one ) {
+              for (i=izero; i<m; i++) {
+                C[j*ldc + i] *= beta;
+              }
+            }
+            for (p=izero; p<k; p++) {
+              if (B[p*ldb + j] != B_zero) {
+                temp = alpha*ScalarTraits<B_type>::conjugate(B[p*ldb + j]);
+                for (i=izero; i<m; i++) {
+                  C[j*ldc + i] += temp*A[p*lda + i];
+                }
+              }
+            }
+          }
+        } else {
+          // Compute C = alpha*A*B' + beta*C
+          for (j=izero; j<n; j++) {
+            if (beta == beta_zero) {
+              for (i=izero; i<m; i++) {
+                C[j*ldc + i] = C_zero;
+              }
+            } else if ( beta != beta_one ) {
+              for (i=izero; i<m; i++) {
+                C[j*ldc + i] *= beta;
+              }
+            }
+            for (p=izero; p<k; p++) {
+              if (B[p*ldb + j] != B_zero) {
+                temp = alpha*B[p*ldb + j];
+                for (i=izero; i<m; i++) {
+                  C[j*ldc + i] += temp*A[p*lda + i];
+                }
+              }
+            }
+          }
+        }
+      } else if ( conjA ) {
+        if ( conjB ) {
+          // Compute C = alpha*conj(A')*conj(B') + beta*C
+          for (j=izero; j<n; j++) {
+            for (i=izero; i<m; i++) {
+              temp = C_zero;
+              for (p=izero; p<k; p++) {
+                temp += ScalarTraits<A_type>::conjugate(A[i*lda + p])
+                      * ScalarTraits<B_type>::conjugate(B[p*ldb + j]);
+              }
+              if (beta == beta_zero) {
+                C[j*ldc + i] = alpha*temp;
+              } else {
+                C[j*ldc + i] = alpha*temp + beta*C[j*ldc + i];
+              }
+            }
+          }
+        } else {
+          // Compute C = alpha*conj(A')*B' + beta*C
+          for (j=izero; j<n; j++) {
+            for (i=izero; i<m; i++) {
+              temp = C_zero;
+              for (p=izero; p<k; p++) {
+                temp += ScalarTraits<A_type>::conjugate(A[i*lda + p])
+                      * B[p*ldb + j];
+              }
+              if (beta == beta_zero) {
+                C[j*ldc + i] = alpha*temp;
+              } else {
+                C[j*ldc + i] = alpha*temp + beta*C[j*ldc + i];
+              }
+            }
+          }
+        }
+     } else {
+       if ( conjB ) {
+         // Compute C = alpha*A'*conj(B') + beta*C
+         for (j=izero; j<n; j++) {
+            for (i=izero; i<m; i++) {
+              temp = C_zero;
+              for (p=izero; p<k; p++) {
+                temp += A[i*lda + p]
+                      * ScalarTraits<B_type>::conjugate(B[p*ldb + j]);
+              }
+              if (beta == beta_zero) {
+                C[j*ldc + i] = alpha*temp;
+              } else {
+                C[j*ldc + i] = alpha*temp + beta*C[j*ldc + i];
+              }
+            }
+          }
+        } else {
+          // Compute C = alpha*A'*B' + beta*C
+          for (j=izero; j<n; j++) {
+            for (i=izero; i<m; i++) {
+              temp = C_zero;
+              for (p=izero; p<k; p++) {
+                temp += A[i*lda + p]*B[p*ldb + j];
+              }
+              if (beta == beta_zero) {
+                C[j*ldc + i] = alpha*temp;
+              } else {
+                C[j*ldc + i] = alpha*temp + beta*C[j*ldc + i];
+              }
+            }
+          }
+        } // end if (ETranspChar[transa]=='N') ...
+      } // end if (ETranspChar[transb]=='N') ...
+    } // end if (!BadArgument) ...
+  } // end of GEMM
+
+
+  template<typename OrdinalType, typename ScalarType>
+  void DefaultBLASImpl<OrdinalType, ScalarType>::
+  SWAP (const OrdinalType& n, ScalarType* const x, const OrdinalType& incx,
+        ScalarType* const y, const OrdinalType& incy) const
+  {
+    if (n <= 0) {
+      return;
+    }
+
+    if (incx == 1 && incy == 1) {
+      for (int i = 0; i < n; ++i) {
+        ScalarType tmp = x[i];
+        x[i] = y[i];
+        y[i] = tmp;
+      }
+      return;
+    }
+
+    int ix = 1;
+    int iy = 1;
+    if (incx < 0) {
+      ix = (1-n) * incx + 1;
+    }
+    if (incy < 0) {
+      iy = (1-n) * incy + 1;
+    }
+
+    for (int i = 1; i <= n; ++i) {
+      ScalarType tmp = x[ix - 1];
+      x[ix - 1] = y[iy - 1];
+      y[iy - 1] = tmp;
+      ix += incx;
+      iy += incy;
+    }
+  }
+
+
+  template<typename OrdinalType, typename ScalarType>
+  template <typename alpha_type, typename A_type, typename B_type, typename beta_type>
+  void DefaultBLASImpl<OrdinalType, ScalarType>::SYMM(ESide side, EUplo uplo, const OrdinalType& m, const OrdinalType& n, const alpha_type alpha, const A_type* A, const OrdinalType& lda, const B_type* B, const OrdinalType& ldb, const beta_type beta, ScalarType* C, const OrdinalType& ldc) const
+  {
+    OrdinalType izero = OrdinalTraits<OrdinalType>::zero();
+    OrdinalType ione = OrdinalTraits<OrdinalType>::one();
+    alpha_type alpha_zero = ScalarTraits<alpha_type>::zero();
+    beta_type beta_zero = ScalarTraits<beta_type>::zero();
+    ScalarType C_zero = ScalarTraits<ScalarType>::zero();
+    beta_type beta_one = ScalarTraits<beta_type>::one();
+    OrdinalType i, j, k, NRowA = m;
+    ScalarType temp1, temp2;
+    bool BadArgument = false;
+    bool Upper = (EUploChar[uplo] == 'U');
+    if (ESideChar[side] == 'R') { NRowA = n; }
+
+    // Quick return.
+    if ( (m==izero) || (n==izero) || ( (alpha==alpha_zero)&&(beta==beta_one) ) ) { return; }
+    if( m < izero ) {
+      std::cout << "BLAS::SYMM Error: M == "<< m << std::endl;
+      BadArgument = true; }
+    if( n < izero ) {
+      std::cout << "BLAS::SYMM Error: N == "<< n << std::endl;
+      BadArgument = true; }
+    if( lda < NRowA ) {
+      std::cout << "BLAS::SYMM Error: LDA < "<<NRowA<<std::endl;
+      BadArgument = true; }
+    if( ldb < m ) {
+      std::cout << "BLAS::SYMM Error: LDB < MAX(1,M)"<<std::endl;
+      BadArgument = true; }
+    if( ldc < m ) {
+      std::cout << "BLAS::SYMM Error: LDC < MAX(1,M)"<<std::endl;
+      BadArgument = true; }
+
+    if(!BadArgument) {
+
+      // Only need to scale C and return.
+      if (alpha == alpha_zero) {
+        if (beta == beta_zero ) {
+          for (j=izero; j<n; j++) {
+            for (i=izero; i<m; i++) {
+              C[j*ldc + i] = C_zero;
+            }
+          }
+        } else {
+          for (j=izero; j<n; j++) {
+            for (i=izero; i<m; i++) {
+              C[j*ldc + i] *= beta;
+            }
+          }
+        }
+        return;
+      }
+
+      if ( ESideChar[side] == 'L') {
+        // Compute C = alpha*A*B + beta*C
+
+        if (Upper) {
+          // The symmetric part of A is stored in the upper triangular part of the matrix.
+          for (j=izero; j<n; j++) {
+            for (i=izero; i<m; i++) {
+              temp1 = alpha*B[j*ldb + i];
+              temp2 = C_zero;
+              for (k=izero; k<i; k++) {
+                C[j*ldc + k] += temp1*A[i*lda + k];
+                temp2 += B[j*ldb + k]*A[i*lda + k];
+              }
+              if (beta == beta_zero) {
+                C[j*ldc + i] = temp1*A[i*lda + i] + alpha*temp2;
+              } else {
+                C[j*ldc + i] = beta*C[j*ldc + i] + temp1*A[i*lda + i] + alpha*temp2;
+              }
+            }
+          }
+        } else {
+          // The symmetric part of A is stored in the lower triangular part of the matrix.
+          for (j=izero; j<n; j++) {
+            for (i=m-ione; i>-ione; i--) {
+              temp1 = alpha*B[j*ldb + i];
+              temp2 = C_zero;
+              for (k=i+ione; k<m; k++) {
+                C[j*ldc + k] += temp1*A[i*lda + k];
+                temp2 += B[j*ldb + k]*A[i*lda + k];
+              }
+              if (beta == beta_zero) {
+                C[j*ldc + i] = temp1*A[i*lda + i] + alpha*temp2;
+              } else {
+                C[j*ldc + i] = beta*C[j*ldc + i] + temp1*A[i*lda + i] + alpha*temp2;
+              }
+            }
+          }
+        }
+      } else {
+        // Compute C = alpha*B*A + beta*C.
+        for (j=izero; j<n; j++) {
+          temp1 = alpha*A[j*lda + j];
+          if (beta == beta_zero) {
+            for (i=izero; i<m; i++) {
+              C[j*ldc + i] = temp1*B[j*ldb + i];
+            }
+          } else {
+            for (i=izero; i<m; i++) {
+              C[j*ldc + i] = beta*C[j*ldc + i] + temp1*B[j*ldb + i];
+            }
+          }
+          for (k=izero; k<j; k++) {
+            if (Upper) {
+              temp1 = alpha*A[j*lda + k];
+            } else {
+              temp1 = alpha*A[k*lda + j];
+            }
+            for (i=izero; i<m; i++) {
+              C[j*ldc + i] += temp1*B[k*ldb + i];
+            }
+          }
+          for (k=j+ione; k<n; k++) {
+            if (Upper) {
+              temp1 = alpha*A[k*lda + j];
+            } else {
+              temp1 = alpha*A[j*lda + k];
+            }
+            for (i=izero; i<m; i++) {
+              C[j*ldc + i] += temp1*B[k*ldb + i];
+            }
+          }
+        }
+      } // end if (ESideChar[side]=='L') ...
+    } // end if(!BadArgument) ...
+} // end SYMM
+
+  template<typename OrdinalType, typename ScalarType>
+  template <typename alpha_type, typename A_type, typename beta_type>
+  void DefaultBLASImpl<OrdinalType, ScalarType>::SYRK(EUplo uplo, ETransp trans, const OrdinalType& n, const OrdinalType& k, const alpha_type alpha, const A_type* A, const OrdinalType& lda, const beta_type beta, ScalarType* C, const OrdinalType& ldc) const
+  {
+    typedef TypeNameTraits<OrdinalType> OTNT;
+    typedef TypeNameTraits<ScalarType> STNT;
+
+    OrdinalType izero = OrdinalTraits<OrdinalType>::zero();
+    alpha_type alpha_zero = ScalarTraits<alpha_type>::zero();
+    beta_type beta_zero = ScalarTraits<beta_type>::zero();
+    beta_type beta_one = ScalarTraits<beta_type>::one();
+    A_type temp, A_zero = ScalarTraits<A_type>::zero();
+    ScalarType C_zero = ScalarTraits<ScalarType>::zero();
+    OrdinalType i, j, l, NRowA = n;
+    bool BadArgument = false;
+    bool Upper = (EUploChar[uplo] == 'U');
+
+    TEUCHOS_TEST_FOR_EXCEPTION(
+      Teuchos::ScalarTraits<ScalarType>::isComplex
+      && (trans == CONJ_TRANS),
+      std::logic_error,
+            "Teuchos::BLAS<"<<OTNT::name()<<","<<STNT::name()<<">::SYRK()"
+            " does not support CONJ_TRANS for complex data types."
+      );
+
+    // Change dimensions of A matrix is transposed
+    if( !(ETranspChar[trans]=='N') ) {
+      NRowA = k;
+    }
+
+    // Quick return.
+    if ( n==izero ) { return; }
+    if ( ( (alpha==alpha_zero) || (k==izero) )  && (beta==beta_one) ) { return; }
+    if( n < izero ) {
+      std::cout << "BLAS::SYRK Error: N == "<< n <<std::endl;
+      BadArgument = true; }
+    if( k < izero ) {
+      std::cout << "BLAS::SYRK Error: K == "<< k <<std::endl;
+      BadArgument = true; }
+    if( lda < NRowA ) {
+      std::cout << "BLAS::SYRK Error: LDA < "<<NRowA<<std::endl;
+      BadArgument = true; }
+    if( ldc < n ) {
+      std::cout << "BLAS::SYRK Error: LDC < MAX(1,N)"<<std::endl;
+      BadArgument = true; }
+
+    if(!BadArgument) {
+
+      // Scale C when alpha is zero
+      if (alpha == alpha_zero) {
+        if (Upper) {
+          if (beta==beta_zero) {
+            for (j=izero; j<n; j++) {
+              for (i=izero; i<=j; i++) {
+                C[j*ldc + i] = C_zero;
+              }
+            }
+          }
+          else {
+            for (j=izero; j<n; j++) {
+              for (i=izero; i<=j; i++) {
+                C[j*ldc + i] *= beta;
+              }
+            }
+          }
+        }
+        else {
+          if (beta==beta_zero) {
+            for (j=izero; j<n; j++) {
+              for (i=j; i<n; i++) {
+                C[j*ldc + i] = C_zero;
+              }
+            }
+          }
+          else {
+            for (j=izero; j<n; j++) {
+              for (i=j; i<n; i++) {
+                C[j*ldc + i] *= beta;
+              }
+            }
+          }
+        }
+        return;
+      }
+
+      // Now we can start the computation
+
+      if ( ETranspChar[trans]=='N' ) {
+
+        // Form C <- alpha*A*A' + beta*C
+        if (Upper) {
+          for (j=izero; j<n; j++) {
+            if (beta==beta_zero) {
+              for (i=izero; i<=j; i++) {
+                C[j*ldc + i] = C_zero;
+              }
+            }
+            else if (beta!=beta_one) {
+              for (i=izero; i<=j; i++) {
+                C[j*ldc + i] *= beta;
+              }
+            }
+            for (l=izero; l<k; l++) {
+              if (A[l*lda + j] != A_zero) {
+                temp = alpha*A[l*lda + j];
+                for (i = izero; i <=j; i++) {
+                  C[j*ldc + i] += temp*A[l*lda + i];
+                }
+              }
+            }
+          }
+        }
+        else {
+          for (j=izero; j<n; j++) {
+            if (beta==beta_zero) {
+              for (i=j; i<n; i++) {
+                C[j*ldc + i] = C_zero;
+              }
+            }
+            else if (beta!=beta_one) {
+              for (i=j; i<n; i++) {
+                C[j*ldc + i] *= beta;
+              }
+            }
+            for (l=izero; l<k; l++) {
+              if (A[l*lda + j] != A_zero) {
+                temp = alpha*A[l*lda + j];
+                for (i=j; i<n; i++) {
+                  C[j*ldc + i] += temp*A[l*lda + i];
+                }
+              }
+            }
+          }
+        }
+      }
+      else {
+
+        // Form C <- alpha*A'*A + beta*C
+        if (Upper) {
+          for (j=izero; j<n; j++) {
+            for (i=izero; i<=j; i++) {
+              temp = A_zero;
+              for (l=izero; l<k; l++) {
+                temp += A[i*lda + l]*A[j*lda + l];
+              }
+              if (beta==beta_zero) {
+                C[j*ldc + i] = alpha*temp;
+              }
+              else {
+                C[j*ldc + i] = alpha*temp + beta*C[j*ldc + i];
+              }
+            }
+          }
+        }
+        else {
+          for (j=izero; j<n; j++) {
+            for (i=j; i<n; i++) {
+              temp = A_zero;
+              for (l=izero; l<k; ++l) {
+                temp += A[i*lda + l]*A[j*lda + l];
+              }
+              if (beta==beta_zero) {
+                C[j*ldc + i] = alpha*temp;
+              }
+              else {
+                C[j*ldc + i] = alpha*temp + beta*C[j*ldc + i];
+              }
+            }
+          }
+        }
+      }
+    } /* if (!BadArgument) */
+  } /* END SYRK */
+
+  template<typename OrdinalType, typename ScalarType>
+  template <typename alpha_type, typename A_type>
+  void DefaultBLASImpl<OrdinalType, ScalarType>::TRMM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const OrdinalType& m, const OrdinalType& n, const alpha_type alpha, const A_type* A, const OrdinalType& lda, ScalarType* B, const OrdinalType& ldb) const
+  {
+    OrdinalType izero = OrdinalTraits<OrdinalType>::zero();
+    OrdinalType ione = OrdinalTraits<OrdinalType>::one();
+    alpha_type alpha_zero = ScalarTraits<alpha_type>::zero();
+    A_type A_zero = ScalarTraits<A_type>::zero();
+    ScalarType B_zero = ScalarTraits<ScalarType>::zero();
+    ScalarType one = ScalarTraits<ScalarType>::one();
+    OrdinalType i, j, k, NRowA = m;
+    ScalarType temp;
+    bool BadArgument = false;
+    bool LSide = (ESideChar[side] == 'L');
+    bool noUnit = (EDiagChar[diag] == 'N');
+    bool Upper = (EUploChar[uplo] == 'U');
+    bool noConj = (ETranspChar[transa] == 'T');
+
+    if(!LSide) { NRowA = n; }
+
+    // Quick return.
+    if (n==izero || m==izero) { return; }
+    if( m < izero ) {
+      std::cout << "BLAS::TRMM Error: M == "<< m <<std::endl;
+      BadArgument = true; }
+    if( n < izero ) {
+      std::cout << "BLAS::TRMM Error: N == "<< n <<std::endl;
+      BadArgument = true; }
+    if( lda < NRowA ) {
+      std::cout << "BLAS::TRMM Error: LDA < "<<NRowA<<std::endl;
+      BadArgument = true; }
+    if( ldb < m ) {
+      std::cout << "BLAS::TRMM Error: LDB < MAX(1,M)"<<std::endl;
+      BadArgument = true; }
+
+    if(!BadArgument) {
+
+      // B only needs to be zeroed out.
+      if( alpha == alpha_zero ) {
+        for( j=izero; j<n; j++ ) {
+          for( i=izero; i<m; i++ ) {
+            B[j*ldb + i] = B_zero;
+          }
+        }
+        return;
+      }
+
+      //  Start the computations.
+      if ( LSide ) {
+        // A is on the left side of B.
+
+        if ( ETranspChar[transa]=='N' ) {
+          // Compute B = alpha*A*B
+
+          if ( Upper ) {
+            // A is upper triangular
+            for( j=izero; j<n; j++ ) {
+              for( k=izero; k<m; k++) {
+                if ( B[j*ldb + k] != B_zero ) {
+                  temp = alpha*B[j*ldb + k];
+                  for( i=izero; i<k; i++ ) {
+                    B[j*ldb + i] += temp*A[k*lda + i];
+                  }
+                  if ( noUnit )
+                    temp *=A[k*lda + k];
+                  B[j*ldb + k] = temp;
+                }
+              }
+            }
+          } else {
+            // A is lower triangular
+            for( j=izero; j<n; j++ ) {
+              for( k=m-ione; k>-ione; k-- ) {
+                if( B[j*ldb + k] != B_zero ) {
+                  temp = alpha*B[j*ldb + k];
+                  B[j*ldb + k] = temp;
+                  if ( noUnit )
+                    B[j*ldb + k] *= A[k*lda + k];
+                  for( i=k+ione; i<m; i++ ) {
+                    B[j*ldb + i] += temp*A[k*lda + i];
+                  }
+                }
+              }
+            }
+          }
+        } else {
+          // Compute B = alpha*A'*B or B = alpha*conj(A')*B
+          if( Upper ) {
+            for( j=izero; j<n; j++ ) {
+              for( i=m-ione; i>-ione; i-- ) {
+                temp = B[j*ldb + i];
+                if ( noConj ) {
+                  if( noUnit )
+                    temp *= A[i*lda + i];
+                  for( k=izero; k<i; k++ ) {
+                    temp += A[i*lda + k]*B[j*ldb + k];
+                  }
+                } else {
+                  if( noUnit )
+                    temp *= ScalarTraits<A_type>::conjugate(A[i*lda + i]);
+                  for( k=izero; k<i; k++ ) {
+                    temp += ScalarTraits<A_type>::conjugate(A[i*lda + k])*B[j*ldb + k];
+                  }
+                }
+                B[j*ldb + i] = alpha*temp;
+              }
+            }
+          } else {
+            for( j=izero; j<n; j++ ) {
+              for( i=izero; i<m; i++ ) {
+                temp = B[j*ldb + i];
+                if ( noConj ) {
+                  if( noUnit )
+                    temp *= A[i*lda + i];
+                  for( k=i+ione; k<m; k++ ) {
+                    temp += A[i*lda + k]*B[j*ldb + k];
+                  }
+                } else {
+                  if( noUnit )
+                    temp *= ScalarTraits<A_type>::conjugate(A[i*lda + i]);
+                  for( k=i+ione; k<m; k++ ) {
+                    temp += ScalarTraits<A_type>::conjugate(A[i*lda + k])*B[j*ldb + k];
+                  }
+                }
+                B[j*ldb + i] = alpha*temp;
+              }
+            }
+          }
+        }
+      } else {
+        // A is on the right hand side of B.
+
+        if( ETranspChar[transa] == 'N' ) {
+          // Compute B = alpha*B*A
+
+          if( Upper ) {
+            // A is upper triangular.
+            for( j=n-ione; j>-ione; j-- ) {
+              temp = alpha;
+              if( noUnit )
+                temp *= A[j*lda + j];
+              for( i=izero; i<m; i++ ) {
+                B[j*ldb + i] *= temp;
+              }
+              for( k=izero; k<j; k++ ) {
+                if( A[j*lda + k] != A_zero ) {
+                  temp = alpha*A[j*lda + k];
+                  for( i=izero; i<m; i++ ) {
+                    B[j*ldb + i] += temp*B[k*ldb + i];
+                  }
+                }
+              }
+            }
+          } else {
+            // A is lower triangular.
+            for( j=izero; j<n; j++ ) {
+              temp = alpha;
+              if( noUnit )
+                temp *= A[j*lda + j];
+              for( i=izero; i<m; i++ ) {
+                B[j*ldb + i] *= temp;
+              }
+              for( k=j+ione; k<n; k++ ) {
+                if( A[j*lda + k] != A_zero ) {
+                  temp = alpha*A[j*lda + k];
+                  for( i=izero; i<m; i++ ) {
+                    B[j*ldb + i] += temp*B[k*ldb + i];
+                  }
+                }
+              }
+            }
+          }
+        } else {
+          // Compute B = alpha*B*A' or B = alpha*B*conj(A')
+
+          if( Upper ) {
+            for( k=izero; k<n; k++ ) {
+              for( j=izero; j<k; j++ ) {
+                if( A[k*lda + j] != A_zero ) {
+                  if ( noConj )
+                    temp = alpha*A[k*lda + j];
+                  else
+                    temp = alpha*ScalarTraits<A_type>::conjugate(A[k*lda + j]);
+                  for( i=izero; i<m; i++ ) {
+                    B[j*ldb + i] += temp*B[k*ldb + i];
+                  }
+                }
+              }
+              temp = alpha;
+              if( noUnit ) {
+                if ( noConj )
+                  temp *= A[k*lda + k];
+                else
+                  temp *= ScalarTraits<A_type>::conjugate(A[k*lda + k]);
+              }
+              if( temp != one ) {
+                for( i=izero; i<m; i++ ) {
+                  B[k*ldb + i] *= temp;
+                }
+              }
+            }
+          } else {
+            for( k=n-ione; k>-ione; k-- ) {
+              for( j=k+ione; j<n; j++ ) {
+                if( A[k*lda + j] != A_zero ) {
+                  if ( noConj )
+                    temp = alpha*A[k*lda + j];
+                  else
+                    temp = alpha*ScalarTraits<A_type>::conjugate(A[k*lda + j]);
+                  for( i=izero; i<m; i++ ) {
+                    B[j*ldb + i] += temp*B[k*ldb + i];
+                  }
+                }
+              }
+              temp = alpha;
+              if( noUnit ) {
+                if ( noConj )
+                  temp *= A[k*lda + k];
+                else
+                  temp *= ScalarTraits<A_type>::conjugate(A[k*lda + k]);
+              }
+              if( temp != one ) {
+                for( i=izero; i<m; i++ ) {
+                  B[k*ldb + i] *= temp;
+                }
+              }
+            }
+          }
+        } // end if( ETranspChar[transa] == 'N' ) ...
+      } // end if ( LSide ) ...
+    } // end if (!BadArgument)
+  } // end TRMM
+
+  template<typename OrdinalType, typename ScalarType>
+  template <typename alpha_type, typename A_type>
+  void DefaultBLASImpl<OrdinalType, ScalarType>::TRSM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const OrdinalType& m, const OrdinalType& n, const alpha_type alpha, const A_type* A, const OrdinalType& lda, ScalarType* B, const OrdinalType& ldb) const
+  {
+    OrdinalType izero = OrdinalTraits<OrdinalType>::zero();
+    OrdinalType ione = OrdinalTraits<OrdinalType>::one();
+    alpha_type alpha_zero = ScalarTraits<alpha_type>::zero();
+    A_type A_zero = ScalarTraits<A_type>::zero();
+    ScalarType B_zero = ScalarTraits<ScalarType>::zero();
+    alpha_type alpha_one = ScalarTraits<alpha_type>::one();
+    ScalarType B_one = ScalarTraits<ScalarType>::one();
+    ScalarType temp;
+    OrdinalType NRowA = m;
+    bool BadArgument = false;
+    bool noUnit = (EDiagChar[diag]=='N');
+    bool noConj = (ETranspChar[transa] == 'T');
+
+    if (!(ESideChar[side] == 'L')) { NRowA = n; }
+
+    // Quick return.
+    if (n == izero || m == izero) { return; }
+    if( m < izero ) {
+      std::cout << "BLAS::TRSM Error: M == "<<m<<std::endl;
+      BadArgument = true; }
+    if( n < izero ) {
+      std::cout << "BLAS::TRSM Error: N == "<<n<<std::endl;
+      BadArgument = true; }
+    if( lda < NRowA ) {
+      std::cout << "BLAS::TRSM Error: LDA < "<<NRowA<<std::endl;
+      BadArgument = true; }
+    if( ldb < m ) {
+      std::cout << "BLAS::TRSM Error: LDB < MAX(1,M)"<<std::endl;
+      BadArgument = true; }
+
+    if(!BadArgument)
+      {
+        int i, j, k;
+        // Set the solution to the zero vector.
+        if(alpha == alpha_zero) {
+            for(j = izero; j < n; j++) {
+                for( i = izero; i < m; i++) {
+                    B[j*ldb+i] = B_zero;
+                }
+            }
+        }
+        else
+        { // Start the operations.
+            if(ESideChar[side] == 'L') {
+                //
+                // Perform computations for OP(A)*X = alpha*B
+                //
+                if(ETranspChar[transa] == 'N') {
+                    //
+                    //  Compute B = alpha*inv( A )*B
+                    //
+                    if(EUploChar[uplo] == 'U') {
+                        // A is upper triangular.
+                        for(j = izero; j < n; j++) {
+                            // Perform alpha*B if alpha is not 1.
+                          if(alpha != alpha_one) {
+                                for( i = izero; i < m; i++) {
+                                    B[j*ldb+i] *= alpha;
+                                }
+                            }
+                            // Perform a backsolve for column j of B.
+                            for(k = (m - ione); k > -ione; k--) {
+                                // If this entry is zero, we don't have to do anything.
+                                if (B[j*ldb + k] != B_zero) {
+                                    if ( noUnit ) {
+                                        B[j*ldb + k] /= A[k*lda + k];
+                                    }
+                                    for(i = izero; i < k; i++) {
+                                        B[j*ldb + i] -= B[j*ldb + k] * A[k*lda + i];
+                                    }
+                                }
+                            }
+                        }
+                    }
+                    else
+                    { // A is lower triangular.
+                        for(j = izero; j < n; j++) {
+                            // Perform alpha*B if alpha is not 1.
+                            if(alpha != alpha_one) {
+                                for( i = izero; i < m; i++) {
+                                    B[j*ldb+i] *= alpha;
+                                }
+                            }
+                            // Perform a forward solve for column j of B.
+                            for(k = izero; k < m; k++) {
+                                // If this entry is zero, we don't have to do anything.
+                                if (B[j*ldb + k] != B_zero) {
+                                    if ( noUnit ) {
+                                        B[j*ldb + k] /= A[k*lda + k];
+                                    }
+                                    for(i = k+ione; i < m; i++) {
+                                        B[j*ldb + i] -= B[j*ldb + k] * A[k*lda + i];
+                                    }
+                                }
+                            }
+                        }
+                    } // end if (uplo == 'U')
+                }  // if (transa =='N')
+                else {
+                    //
+                    //  Compute B = alpha*inv( A' )*B
+                    //  or      B = alpha*inv( conj(A') )*B
+                    //
+                    if(EUploChar[uplo] == 'U') {
+                        // A is upper triangular.
+                        for(j = izero; j < n; j++) {
+                            for( i = izero; i < m; i++) {
+                                temp = alpha*B[j*ldb+i];
+                                if ( noConj ) {
+                                  for(k = izero; k < i; k++) {
+                                      temp -= A[i*lda + k] * B[j*ldb + k];
+                                  }
+                                  if ( noUnit ) {
+                                      temp /= A[i*lda + i];
+                                  }
+                                } else {
+                                  for(k = izero; k < i; k++) {
+                                      temp -= ScalarTraits<A_type>::conjugate(A[i*lda + k])
+                                            * B[j*ldb + k];
+                                  }
+                                  if ( noUnit ) {
+                                      temp /= ScalarTraits<A_type>::conjugate(A[i*lda + i]);
+                                  }
+                                }
+                                B[j*ldb + i] = temp;
+                            }
+                        }
+                    }
+                    else
+                    { // A is lower triangular.
+                        for(j = izero; j < n; j++) {
+                            for(i = (m - ione) ; i > -ione; i--) {
+                                temp = alpha*B[j*ldb+i];
+                                if ( noConj ) {
+                                  for(k = i+ione; k < m; k++) {
+                                    temp -= A[i*lda + k] * B[j*ldb + k];
+                                  }
+                                  if ( noUnit ) {
+                                    temp /= A[i*lda + i];
+                                  }
+                                } else {
+                                  for(k = i+ione; k < m; k++) {
+                                    temp -= ScalarTraits<A_type>::conjugate(A[i*lda + k])
+                                          * B[j*ldb + k];
+                                  }
+                                  if ( noUnit ) {
+                                    temp /= ScalarTraits<A_type>::conjugate(A[i*lda + i]);
+                                  }
+                                }
+                                B[j*ldb + i] = temp;
+                            }
+                        }
+                    }
+                }
+            }  // if (side == 'L')
+            else {
+               // side == 'R'
+               //
+               // Perform computations for X*OP(A) = alpha*B
+               //
+              if (ETranspChar[transa] == 'N') {
+                    //
+                    //  Compute B = alpha*B*inv( A )
+                    //
+                    if(EUploChar[uplo] == 'U') {
+                        // A is upper triangular.
+                        // Perform a backsolve for column j of B.
+                        for(j = izero; j < n; j++) {
+                            // Perform alpha*B if alpha is not 1.
+                            if(alpha != alpha_one) {
+                                for( i = izero; i < m; i++) {
+                                    B[j*ldb+i] *= alpha;
+                                }
+                            }
+                            for(k = izero; k < j; k++) {
+                                // If this entry is zero, we don't have to do anything.
+                                if (A[j*lda + k] != A_zero) {
+                                    for(i = izero; i < m; i++) {
+                                        B[j*ldb + i] -= A[j*lda + k] * B[k*ldb + i];
+                                    }
+                                }
+                            }
+                            if ( noUnit ) {
+                                temp = B_one/A[j*lda + j];
+                                for(i = izero; i < m; i++) {
+                                    B[j*ldb + i] *= temp;
+                                }
+                            }
+                        }
+                    }
+                    else
+                    { // A is lower triangular.
+                        for(j = (n - ione); j > -ione; j--) {
+                            // Perform alpha*B if alpha is not 1.
+                            if(alpha != alpha_one) {
+                                for( i = izero; i < m; i++) {
+                                    B[j*ldb+i] *= alpha;
+                                }
+                            }
+                            // Perform a forward solve for column j of B.
+                            for(k = j+ione; k < n; k++) {
+                                // If this entry is zero, we don't have to do anything.
+                                if (A[j*lda + k] != A_zero) {
+                                    for(i = izero; i < m; i++) {
+                                        B[j*ldb + i] -= A[j*lda + k] * B[k*ldb + i];
+                                    }
+                                }
+                            }
+                            if ( noUnit ) {
+                                temp = B_one/A[j*lda + j];
+                                for(i = izero; i < m; i++) {
+                                    B[j*ldb + i] *= temp;
+                                }
+                            }
+                        }
+                    } // end if (uplo == 'U')
+                }  // if (transa =='N')
+                else {
+                    //
+                    //  Compute B = alpha*B*inv( A' )
+                    //  or      B = alpha*B*inv( conj(A') )
+                    //
+                    if(EUploChar[uplo] == 'U') {
+                        // A is upper triangular.
+                        for(k = (n - ione); k > -ione; k--) {
+                            if ( noUnit ) {
+                                if ( noConj )
+                                  temp = B_one/A[k*lda + k];
+                                else
+                                  temp = B_one/ScalarTraits<A_type>::conjugate(A[k*lda + k]);
+                                for(i = izero; i < m; i++) {
+                                    B[k*ldb + i] *= temp;
+                                }
+                            }
+                            for(j = izero; j < k; j++) {
+                                if (A[k*lda + j] != A_zero) {
+                                    if ( noConj )
+                                      temp = A[k*lda + j];
+                                    else
+                                      temp = ScalarTraits<A_type>::conjugate(A[k*lda + j]);
+                                    for(i = izero; i < m; i++) {
+                                        B[j*ldb + i] -= temp*B[k*ldb + i];
+                                    }
+                                }
+                            }
+                            if (alpha != alpha_one) {
+                                for (i = izero; i < m; i++) {
+                                    B[k*ldb + i] *= alpha;
+                                }
+                            }
+                        }
+                    }
+                    else
+                    { // A is lower triangular.
+                        for(k = izero; k < n; k++) {
+                            if ( noUnit ) {
+                                if ( noConj )
+                                  temp = B_one/A[k*lda + k];
+                                else
+                                  temp = B_one/ScalarTraits<A_type>::conjugate(A[k*lda + k]);
+                                for (i = izero; i < m; i++) {
+                                    B[k*ldb + i] *= temp;
+                                }
+                            }
+                            for(j = k+ione; j < n; j++) {
+                                if(A[k*lda + j] != A_zero) {
+                                    if ( noConj )
+                                      temp = A[k*lda + j];
+                                    else
+                                      temp = ScalarTraits<A_type>::conjugate(A[k*lda + j]);
+                                    for(i = izero; i < m; i++) {
+                                        B[j*ldb + i] -= temp*B[k*ldb + i];
+                                    }
+                                }
+                            }
+                            if (alpha != alpha_one) {
+                                for (i = izero; i < m; i++) {
+                                    B[k*ldb + i] *= alpha;
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+  }
+
+  // Explicit instantiation for template<int,float>
+
+  template <>
+  class TEUCHOSNUMERICS_LIB_DLL_EXPORT BLAS<int, float>
+  {
+  public:
+    inline BLAS(void) {}
+    inline BLAS(const BLAS<int, float>& /*BLAS_source*/) {}
+    inline virtual ~BLAS(void) {}
+    void ROTG(float* da, float* db, float* c, float* s) const;
+    void ROT(const int& n, float* dx, const int& incx, float* dy, const int& incy, float* c, float* s) const;
+    float ASUM(const int& n, const float* x, const int& incx) const;
+    void AXPY(const int& n, const float& alpha, const float* x, const int& incx, float* y, const int& incy) const;
+    void COPY(const int& n, const float* x, const int& incx, float* y, const int& incy) const;
+    float DOT(const int& n, const float* x, const int& incx, const float* y, const int& incy) const;
+    float NRM2(const int& n, const float* x, const int& incx) const;
+    void SCAL(const int& n, const float& alpha, float* x, const int& incx) const;
+    int IAMAX(const int& n, const float* x, const int& incx) const;
+    void GEMV(ETransp trans, const int& m, const int& n, const float& alpha, const float* A, const int& lda, const float* x, const int& incx, const float& beta, float* y, const int& incy) const;
+    void TRMV(EUplo uplo, ETransp trans, EDiag diag, const int& n, const float* A, const int& lda, float* x, const int& incx) const;
+    void GER(const int& m, const int& n, const float& alpha, const float* x, const int& incx, const float* y, const int& incy, float* A, const int& lda) const;
+    void GEMM(ETransp transa, ETransp transb, const int& m, const int& n, const int& k, const float& alpha, const float* A, const int& lda, const float* B, const int& ldb, const float& beta, float* C, const int& ldc) const;
+    void SWAP(const int& n, float* const x, const int& incx, float* const y, const int& incy) const;
+    void SYMM(ESide side, EUplo uplo, const int& m, const int& n, const float& alpha, const float* A, const int& lda, const float* B, const int& ldb, const float& beta, float* C, const int& ldc) const;
+    void SYRK(EUplo uplo, ETransp trans, const int& n, const int& k, const float& alpha, const float* A, const int& lda, const float& beta, float* C, const int& ldc) const;
+    void HERK(EUplo uplo, ETransp trans, const int& n, const int& k, const float& alpha, const float* A, const int& lda, const float& beta, float* C, const int& ldc) const;
+    void TRMM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int& m, const int& n, const float& alpha, const float* A, const int& lda, float* B, const int& ldb) const;
+    void TRSM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int& m, const int& n, const float& alpha, const float* A, const int& lda, float* B, const int& ldb) const;
+  };
+
+  // Explicit instantiation for template<int,double>
+
+  template<>
+  class TEUCHOSNUMERICS_LIB_DLL_EXPORT BLAS<int, double>
+  {
+  public:
+    inline BLAS(void) {}
+    inline BLAS(const BLAS<int, double>& /*BLAS_source*/) {}
+    inline virtual ~BLAS(void) {}
+    void ROTG(double* da, double* db, double* c, double* s) const;
+    void ROT(const int& n, double* dx, const int& incx, double* dy, const int& incy, double* c, double* s) const;
+    double ASUM(const int& n, const double* x, const int& incx) const;
+    void AXPY(const int& n, const double& alpha, const double* x, const int& incx, double* y, const int& incy) const;
+    void COPY(const int& n, const double* x, const int& incx, double* y, const int& incy) const;
+    double DOT(const int& n, const double* x, const int& incx, const double* y, const int& incy) const;
+    double NRM2(const int& n, const double* x, const int& incx) const;
+    void SCAL(const int& n, const double& alpha, double* x, const int& incx) const;
+    int IAMAX(const int& n, const double* x, const int& incx) const;
+    void GEMV(ETransp trans, const int& m, const int& n, const double& alpha, const double* A, const int& lda, const double* x, const int& incx, const double& beta, double* y, const int& incy) const;
+    void TRMV(EUplo uplo, ETransp trans, EDiag diag, const int& n, const double* A, const int& lda, double* x, const int& incx) const;
+    void GER(const int& m, const int& n, const double& alpha, const double* x, const int& incx, const double* y, const int& incy, double* A, const int& lda) const;
+    void GEMM(ETransp transa, ETransp transb, const int& m, const int& n, const int& k, const double& alpha, const double* A, const int& lda, const double* B, const int& ldb, const double& beta, double* C, const int& ldc) const;
+    void SWAP(const int& n, double* const x, const int& incx, double* const y, const int& incy) const;
+    void SYMM(ESide side, EUplo uplo, const int& m, const int& n, const double& alpha, const double* A, const int& lda, const double* B, const int& ldb, const double& beta, double* C, const int& ldc) const;
+    void SYRK(EUplo uplo, ETransp trans, const int& n, const int& k, const double& alpha, const double* A, const int& lda, const double& beta, double* C, const int& ldc) const;
+    void HERK(EUplo uplo, ETransp trans, const int& n, const int& k, const double& alpha, const double* A, const int& lda, const double& beta, double* C, const int& ldc) const;
+    void TRMM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int& m, const int& n, const double& alpha, const double* A, const int& lda, double* B, const int& ldb) const;
+    void TRSM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int& m, const int& n, const double& alpha, const double* A, const int& lda, double* B, const int& ldb) const;
+  };
+
+  // Explicit instantiation for template<int,complex<float> >
+
+  template<>
+  class TEUCHOSNUMERICS_LIB_DLL_EXPORT BLAS<int, std::complex<float> >
+  {
+  public:
+    inline BLAS(void) {}
+    inline BLAS(const BLAS<int, std::complex<float> >& /*BLAS_source*/) {}
+    inline virtual ~BLAS(void) {}
+    void ROTG(std::complex<float>* da, std::complex<float>* db, float* c, std::complex<float>* s) const;
+    void ROT(const int& n, std::complex<float>* dx, const int& incx, std::complex<float>* dy, const int& incy, float* c, std::complex<float>* s) const;
+    float ASUM(const int& n, const std::complex<float>* x, const int& incx) const;
+    void AXPY(const int& n, const std::complex<float> alpha, const std::complex<float>* x, const int& incx, std::complex<float>* y, const int& incy) const;
+    void COPY(const int& n, const std::complex<float>* x, const int& incx, std::complex<float>* y, const int& incy) const;
+    std::complex<float> DOT(const int& n, const std::complex<float>* x, const int& incx, const std::complex<float>* y, const int& incy) const;
+    float NRM2(const int& n, const std::complex<float>* x, const int& incx) const;
+    void SCAL(const int& n, const std::complex<float> alpha, std::complex<float>* x, const int& incx) const;
+    int IAMAX(const int& n, const std::complex<float>* x, const int& incx) const;
+    void GEMV(ETransp trans, const int& m, const int& n, const std::complex<float> alpha, const std::complex<float>* A, const int& lda, const std::complex<float>* x, const int& incx, const std::complex<float> beta, std::complex<float>* y, const int& incy) const;
+    void TRMV(EUplo uplo, ETransp trans, EDiag diag, const int& n, const std::complex<float>* A, const int& lda, std::complex<float>* x, const int& incx) const;
+    void GER(const int& m, const int& n, const std::complex<float> alpha, const std::complex<float>* x, const int& incx, const std::complex<float>* y, const int& incy, std::complex<float>* A, const int& lda) const;
+    void GEMM(ETransp transa, ETransp transb, const int& m, const int& n, const int& k, const std::complex<float> alpha, const std::complex<float>* A, const int& lda, const std::complex<float>* B, const int& ldb, const std::complex<float> beta, std::complex<float>* C, const int& ldc) const;
+    void SWAP(const int& n, std::complex<float>* const x, const int& incx, std::complex<float>* const y, const int& incy) const;
+    void SYMM(ESide side, EUplo uplo, const int& m, const int& n, const std::complex<float> alpha, const std::complex<float>* A, const int& lda, const std::complex<float> *B, const int& ldb, const std::complex<float> beta, std::complex<float> *C, const int& ldc) const;
+    void SYRK(EUplo uplo, ETransp trans, const int& n, const int& k, const std::complex<float> alpha, const std::complex<float>* A, const int& lda, const std::complex<float> beta, std::complex<float>* C, const int& ldc) const;
+    void HERK(EUplo uplo, ETransp trans, const int& n, const int& k, const std::complex<float> alpha, const std::complex<float>* A, const int& lda, const std::complex<float> beta, std::complex<float>* C, const int& ldc) const;
+    void TRMM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int& m, const int& n, const std::complex<float> alpha, const std::complex<float>* A, const int& lda, std::complex<float>* B, const int& ldb) const;
+    void TRSM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int& m, const int& n, const std::complex<float> alpha, const std::complex<float>* A, const int& lda, std::complex<float>* B, const int& ldb) const;
+  };
+
+  // Explicit instantiation for template<int,complex<double> >
+
+  template<>
+  class TEUCHOSNUMERICS_LIB_DLL_EXPORT BLAS<int, std::complex<double> >
+  {
+  public:
+    inline BLAS(void) {}
+    inline BLAS(const BLAS<int, std::complex<double> >& /*BLAS_source*/) {}
+    inline virtual ~BLAS(void) {}
+    void ROTG(std::complex<double>* da, std::complex<double>* db, double* c, std::complex<double>* s) const;
+    void ROT(const int& n, std::complex<double>* dx, const int& incx, std::complex<double>* dy, const int& incy, double* c, std::complex<double>* s) const;
+    double ASUM(const int& n, const std::complex<double>* x, const int& incx) const;
+    void AXPY(const int& n, const std::complex<double> alpha, const std::complex<double>* x, const int& incx, std::complex<double>* y, const int& incy) const;
+    void COPY(const int& n, const std::complex<double>* x, const int& incx, std::complex<double>* y, const int& incy) const;
+    std::complex<double> DOT(const int& n, const std::complex<double>* x, const int& incx, const std::complex<double>* y, const int& incy) const;
+    double NRM2(const int& n, const std::complex<double>* x, const int& incx) const;
+    void SCAL(const int& n, const std::complex<double> alpha, std::complex<double>* x, const int& incx) const;
+    int IAMAX(const int& n, const std::complex<double>* x, const int& incx) const;
+    void GEMV(ETransp trans, const int& m, const int& n, const std::complex<double> alpha, const std::complex<double>* A, const int& lda, const std::complex<double>* x, const int& incx, const std::complex<double> beta, std::complex<double>* y, const int& incy) const;
+    void TRMV(EUplo uplo, ETransp trans, EDiag diag, const int& n, const std::complex<double>* A, const int& lda, std::complex<double>* x, const int& incx) const;
+    void GER(const int& m, const int& n, const std::complex<double> alpha, const std::complex<double>* x, const int& incx, const std::complex<double>* y, const int& incy, std::complex<double>* A, const int& lda) const;
+    void GEMM(ETransp transa, ETransp transb, const int& m, const int& n, const int& k, const std::complex<double> alpha, const std::complex<double>* A, const int& lda, const std::complex<double>* B, const int& ldb, const std::complex<double> beta, std::complex<double>* C, const int& ldc) const;
+    void SWAP(const int& n, std::complex<double>* const x, const int& incx, std::complex<double>* const y, const int& incy) const;
+    void SYMM(ESide side, EUplo uplo, const int& m, const int& n, const std::complex<double> alpha, const std::complex<double>* A, const int& lda, const std::complex<double> *B, const int& ldb, const std::complex<double> beta, std::complex<double> *C, const int& ldc) const;
+    void SYRK(EUplo uplo, ETransp trans, const int& n, const int& k, const std::complex<double> alpha, const std::complex<double>* A, const int& lda, const std::complex<double> beta, std::complex<double>* C, const int& ldc) const;
+    void HERK(EUplo uplo, ETransp trans, const int& n, const int& k, const std::complex<double> alpha, const std::complex<double>* A, const int& lda, const std::complex<double> beta, std::complex<double>* C, const int& ldc) const;
+    void TRMM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int& m, const int& n, const std::complex<double> alpha, const std::complex<double>* A, const int& lda, std::complex<double>* B, const int& ldb) const;
+    void TRSM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int& m, const int& n, const std::complex<double> alpha, const std::complex<double>* A, const int& lda, std::complex<double>* B, const int& ldb) const;
+  };
+
+} // namespace Teuchos
+
+#endif // _TEUCHOS_BLAS_HPP_
diff --git a/packages/rol/src/compatibility/teuchos-lite/Teuchos_BLAS_types.hpp b/packages/rol/src/compatibility/teuchos-lite/Teuchos_BLAS_types.hpp
new file mode 100644
index 000000000000..13f0e2f3df7a
--- /dev/null
+++ b/packages/rol/src/compatibility/teuchos-lite/Teuchos_BLAS_types.hpp
@@ -0,0 +1,121 @@
+/*
+// @HEADER
+// ***********************************************************************
+//
+//                    Teuchos: Common Tools Package
+//                 Copyright (2004) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
+//
+// ***********************************************************************
+// @HEADER
+*/
+
+// /////////////////////////////////////////////
+// Teuchos_BLAS_types.hpp
+
+#ifndef TEUCHOS_BLAS_TYPES_HPP
+#define TEUCHOS_BLAS_TYPES_HPP
+
+/*! \file Teuchos_BLAS_types.hpp
+	\brief Enumerated types for BLAS input characters.
+*/
+
+/*! \defgroup BLASEnum_grp Enumerations for character inputs in Teuchos::BLAS methods
+
+  \brief These enumerated lists are used in compile time checking of the input characters
+  for BLAS methods.
+
+	\note Any other input other than those specified here will result
+	in an error at compile time and are not supported by the templated BLAS/LAPACK interface.
+
+	<ul>
+	<li><b>Teuchos::ESide</b> : Enumerated list for BLAS character input "SIDE".
+		<ul>
+		<li>LEFT_SIDE : The matrix/std::vector is on, or applied to, the left side of the equation
+		<li>RIGHT_SIDE : The matrix/std::vector is on, or applied to, the right side of the equation
+		</ul><br>
+	<li><b>Teuchos::ETransp</b> : Enumerated list for BLAS character input "TRANS".
+		<ul>
+		<li>NO_TRANS : The matrix/std::vector is not transposed
+		<li>TRANS : The matrix/std::vector is transposed
+		<li>CONJ_TRANS : The matrix/std::vector is conjugate transposed
+		</ul><br>
+	<li><b>Teuchos::EUplo</b> : Enumerated list for BLAS character input "UPLO".
+		<ul>
+		<li>UPPER_TRI : The matrix is upper triangular
+		<li>LOWER_TRI : The matrix is lower triangular
+		</ul><br>
+	<li><b>Teuchos::EDiag</b> : Enumerated list for BLAS character input "DIAG".
+		<ul>
+		<li>UNIT_DIAG : The matrix has all ones on its diagonal
+		<li>NON_UNIT_DIAG : The matrix does not have all ones on its diagonal
+		</ul><br>
+        </ul>
+*/
+
+namespace Teuchos {
+  enum ESide { 	
+    LEFT_SIDE,	/*!< Left side */
+    RIGHT_SIDE 	/*!< Right side */
+  };
+
+  enum ETransp { 	
+    NO_TRANS,	/*!< Not transposed */
+    TRANS, 		/*!< Transposed */
+    CONJ_TRANS 	/*!< Conjugate transposed */
+  };
+
+  enum EUplo { 	
+    UPPER_TRI,	/*!< Upper triangular */
+    LOWER_TRI,	/*!< Lower triangular */
+    UNDEF_TRI   /*!< Unspeficied/undefined triangular structure */
+  };
+
+  enum EDiag { 	
+    UNIT_DIAG,	/*!< Unit diagaonal */
+    NON_UNIT_DIAG	/*!< Not unit diagonal */
+  };
+
+  enum EType {
+    FULL,	/*!< Full matrix */
+    LOWER,	/*!< Lower triangular */
+    UPPER,	/*!< Upper triangular */
+    HESSENBERG, /*!< Upper Hessenberg */
+    SYM_BAND_L, /*!< Symmetric band, lower half stored */
+    SYM_BAND_U, /*!< Symmetric band, upper half stored */
+    BAND        /*!< General band */
+  };
+}
+
+#endif // TEUCHOS_BLAS_TYPES_HPP
diff --git a/packages/rol/src/compatibility/teuchos-lite/Teuchos_BLAS_wrappers.hpp b/packages/rol/src/compatibility/teuchos-lite/Teuchos_BLAS_wrappers.hpp
new file mode 100644
index 000000000000..c29aacde86bc
--- /dev/null
+++ b/packages/rol/src/compatibility/teuchos-lite/Teuchos_BLAS_wrappers.hpp
@@ -0,0 +1,421 @@
+/*
+// @HEADER
+// ***********************************************************************
+//
+//                    Teuchos: Common Tools Package
+//                 Copyright (2004) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
+//
+// ***********************************************************************
+// @HEADER
+*/
+
+#ifndef _TEUCHOS_BLAS_WRAPPERS_HPP_
+#define _TEUCHOS_BLAS_WRAPPERS_HPP_
+
+#include "Teuchos_ConfigDefs.hpp"
+#ifdef _MSC_VER
+/* disable warning for C-linkage returning complex class */
+#pragma warning ( disable : 4190 )
+#endif
+
+/*! \file Teuchos_BLAS_wrappers.hpp
+    \brief The Templated BLAS wrappers.
+*/
+
+
+/* A) Define PREFIX and Teuchos_fcd based on platform. */
+
+#if defined(INTEL_CXML)
+#  define PREFIX __stdcall
+#  define Teuchos_fcd const char *, unsigned int
+#elif defined(INTEL_MKL)
+#  define PREFIX
+#  define Teuchos_fcd const char *
+#else /* Not CRAY_T3X or INTEL_CXML or INTEL_MKL */
+#  define PREFIX
+#  define Teuchos_fcd const char *
+#endif
+
+// Handle Complex types (we assume C++11 at a minumum
+// Microsoft C++11 apparently does not support float/double _Complex
+
+#if ( defined(_MSC_VER) )
+#define Teuchos_Complex_double_type_name std::complex<double>
+#define Teuchos_Complex_float_type_name std::complex<float>
+#else
+#define Teuchos_Complex_double_type_name double _Complex
+#define Teuchos_Complex_float_type_name float _Complex
+#endif
+
+
+/* B) Take care of of the link name case */
+
+
+#define DROTG_F77   F77_BLAS_MANGLE(drotg,DROTG)
+#define DROT_F77    F77_BLAS_MANGLE(drot,DROT)
+#define DASUM_F77   F77_BLAS_MANGLE(dasum,DASUM)
+#define DAXPY_F77   F77_BLAS_MANGLE(daxpy,DAXPY)
+#define DCOPY_F77   F77_BLAS_MANGLE(dcopy,DCOPY)
+#define DDOT_F77    F77_BLAS_MANGLE(ddot,DDOT)
+#define DNRM2_F77   F77_BLAS_MANGLE(dnrm2,DNRM2)
+#define DSCAL_F77   F77_BLAS_MANGLE(dscal,DSCAL)
+#define IDAMAX_F77  F77_BLAS_MANGLE(idamax,IDAMAX)
+#define DGEMV_F77   F77_BLAS_MANGLE(dgemv,DGEMV)
+#define DGER_F77    F77_BLAS_MANGLE(dger,DGER)
+#define DTRMV_F77   F77_BLAS_MANGLE(dtrmv,DTRMV)
+#define DGEMM_F77   F77_BLAS_MANGLE(dgemm,DGEMM)
+#define DSWAP_F77   F77_BLAS_MANGLE(dswap,DSWAP)
+#define DSYMM_F77   F77_BLAS_MANGLE(dsymm,DSYMM)
+#define DSYRK_F77   F77_BLAS_MANGLE(dsyrk,DSYRK)
+#define DTRMM_F77   F77_BLAS_MANGLE(dtrmm,DTRMM)
+#define DTRSM_F77   F77_BLAS_MANGLE(dtrsm,DTRSM)
+
+#ifdef HAVE_TEUCHOS_COMPLEX
+
+#define ZROTG_F77   F77_BLAS_MANGLE(zrotg,ZROTG)
+#define ZROT_F77    F77_BLAS_MANGLE(zrot,ZROT)
+#define ZASUM_F77   F77_BLAS_MANGLE(dzasum,DZASUM)
+#define ZAXPY_F77   F77_BLAS_MANGLE(zaxpy,ZAXPY)
+#define ZCOPY_F77   F77_BLAS_MANGLE(zcopy,ZCOPY)
+#define ZDOT_F77    F77_BLAS_MANGLE(zdotc,ZDOTC)
+#define ZNRM2_F77   F77_BLAS_MANGLE(dznrm2,DZNRM2)
+#define ZSCAL_F77   F77_BLAS_MANGLE(zscal,ZSCAL)
+#define IZAMAX_F77  F77_BLAS_MANGLE(izamax,IZAMAX)
+#define ZGEMV_F77   F77_BLAS_MANGLE(zgemv,ZGEMV)
+#define ZGER_F77    F77_BLAS_MANGLE(zgeru,ZGERU)
+#define ZTRMV_F77   F77_BLAS_MANGLE(ztrmv,ZTRMV)
+#define ZGEMM_F77   F77_BLAS_MANGLE(zgemm,ZGEMM)
+#define ZSWAP_F77   F77_BLAS_MANGLE(zswap,ZSWAP)
+#define ZSYMM_F77   F77_BLAS_MANGLE(zsymm,ZSYMM)
+#define ZSYRK_F77   F77_BLAS_MANGLE(zsyrk,ZSYRK)
+#define ZHERK_F77   F77_BLAS_MANGLE(zherk,ZHERK)
+#define ZTRMM_F77   F77_BLAS_MANGLE(ztrmm,ZTRMM)
+#define ZTRSM_F77   F77_BLAS_MANGLE(ztrsm,ZTRSM)
+
+#endif /* HAVE_TEUCHOS_COMPLEX */
+
+#define SROTG_F77   F77_BLAS_MANGLE(srotg,SROTG)
+#define SROT_F77    F77_BLAS_MANGLE(srot,SROT)
+#define SSCAL_F77   F77_BLAS_MANGLE(sscal,SSCAL)
+#define SCOPY_F77   F77_BLAS_MANGLE(scopy,SCOPY)
+#define SAXPY_F77   F77_BLAS_MANGLE(saxpy,SAXPY)
+#define SDOT_F77    F77_BLAS_MANGLE(sdot,SDOT)
+#define SNRM2_F77   F77_BLAS_MANGLE(snrm2,SNRM2)
+#define SASUM_F77   F77_BLAS_MANGLE(sasum,SASUM)
+#define ISAMAX_F77  F77_BLAS_MANGLE(isamax,ISAMAX)
+#define SGEMV_F77   F77_BLAS_MANGLE(sgemv,SGEMV)
+#define SGER_F77    F77_BLAS_MANGLE(sger,SGER)
+#define STRMV_F77   F77_BLAS_MANGLE(strmv,STRMV)
+#define SGEMM_F77   F77_BLAS_MANGLE(sgemm,SGEMM)
+#define SSWAP_F77   F77_BLAS_MANGLE(sswap,SSWAP)
+#define SSYMM_F77   F77_BLAS_MANGLE(ssymm,SSYMM)
+#define SSYRK_F77   F77_BLAS_MANGLE(ssyrk,SSYRK)
+#define STRMM_F77   F77_BLAS_MANGLE(strmm,STRMM)
+#define STRSM_F77   F77_BLAS_MANGLE(strsm,STRSM)
+
+#ifdef HAVE_TEUCHOS_COMPLEX
+
+#define CROTG_F77   F77_BLAS_MANGLE(crotg,CROTG)
+#define CROT_F77    F77_BLAS_MANGLE(crot,CROT)
+#define SCASUM_F77  F77_BLAS_MANGLE(scasum,SCASUM)
+#define CAXPY_F77   F77_BLAS_MANGLE(caxpy,CAXPY)
+#define CCOPY_F77   F77_BLAS_MANGLE(ccopy,CCOPY)
+#define CDOT_F77    F77_BLAS_MANGLE(cdotc,CDOTC)
+#define SCNRM2_F77   F77_BLAS_MANGLE(scnrm2,SCNRM2)
+#define CSCAL_F77   F77_BLAS_MANGLE(cscal,CSCAL)
+#define ICAMAX_F77  F77_BLAS_MANGLE(icamax,ICAMAX)
+#define CGEMV_F77   F77_BLAS_MANGLE(cgemv,CGEMV)
+#define CGER_F77    F77_BLAS_MANGLE(cgeru,CGERU)
+#define CTRMV_F77   F77_BLAS_MANGLE(ctrmv,CTRMV)
+#define CGEMM_F77   F77_BLAS_MANGLE(cgemm,CGEMM)
+#define CSWAP_F77   F77_BLAS_MANGLE(cswap,CSWAP)
+#define CSYMM_F77   F77_BLAS_MANGLE(csymm,CSYMM)
+#define CSYRK_F77   F77_BLAS_MANGLE(csyrk,CSYRK)
+#define CHERK_F77   F77_BLAS_MANGLE(cherk,CHERK)
+#define CTRMM_F77   F77_BLAS_MANGLE(ctrmm,CTRMM)
+#define CTRSM_F77   F77_BLAS_MANGLE(ctrsm,CTRSM)
+#define TEUCHOS_BLAS_CONVERT_COMPLEX_FORTRAN_TO_CXX(TYPE, Z) \
+   reinterpret_cast<std::complex<TYPE>&>(Z);
+// NOTE: The above is guaranteed to be okay given the C99 and C++11 standards
+
+#endif /* HAVE_TEUCHOS_COMPLEX */
+
+
+/* C) Define the function prototypes for all platforms! */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* Double precision BLAS 1 */
+void PREFIX DROTG_F77(double* da, double* db, double* c, double* s);
+void PREFIX DROT_F77(const int* n, double* dx, const int* incx, double* dy, const int* incy, double* c, double* s);
+double PREFIX DASUM_F77(const int* n, const double x[], const int* incx);
+void PREFIX DAXPY_F77(const int* n, const double* alpha, const double x[], const int* incx, double y[], const int* incy);
+void PREFIX DCOPY_F77(const int* n, const double *x, const int* incx, double *y, const int* incy);
+double PREFIX DDOT_F77(const int* n, const double x[], const int* incx, const double y[], const int* incy);
+double PREFIX DNRM2_F77(const int* n, const double x[], const int* incx);
+void PREFIX DSCAL_F77(const int* n, const double* alpha, double *x, const int* incx);
+void PREFIX DSWAP_F77(const int* const n, double* const x, const int* const incx,
+                      double* const y, const int* const incy);
+int PREFIX IDAMAX_F77(const int* n, const double *x, const int* incx);
+
+/* Double std::complex precision BLAS 1 */
+#if defined(HAVE_TEUCHOS_COMPLEX) && defined(__cplusplus)
+
+#  if defined(HAVE_COMPLEX_BLAS_PROBLEM)
+#    if defined(HAVE_FIXABLE_COMPLEX_BLAS_PROBLEM)
+void PREFIX ZDOT_F77(std::complex<double> *ret, const int* n, const std::complex<double> x[], const int* incx, const std::complex<double> y[], const int* incy);
+#    elif defined(HAVE_VECLIB_COMPLEX_BLAS)
+// no declarations; they're in cblas.h
+#      include <vecLib/cblas.h>
+#    else
+  // mfh 01 Feb 2013: If the code reaches this point, it means that
+  // some complex BLAS routines are broken, but there is no easy
+  // workaround.  We deal with this in Teuchos_BLAS.cpp by
+  // reimplementing the offending routines.
+#    endif // HAVE_COMPLEX_BLAS_PROBLEM
+#  else // no problem
+Teuchos_Complex_double_type_name PREFIX ZDOT_F77(const int* n, const std::complex<double> x[], const int* incx, const std::complex<double> y[], const int* incy);
+#  endif // defined(HAVE_COMPLEX_BLAS_PROBLEM)
+
+double PREFIX ZNRM2_F77(const int* n, const std::complex<double> x[], const int* incx);
+double PREFIX ZASUM_F77(const int* n, const std::complex<double> x[], const int* incx);
+void PREFIX ZROTG_F77(std::complex<double>* da, std::complex<double>* db, double* c, std::complex<double>* s);
+void PREFIX ZROT_F77(const int* n, std::complex<double>* dx, const int* incx, std::complex<double>* dy, const int* incy, double* c, std::complex<double>* s);
+void PREFIX ZAXPY_F77(const int* n, const std::complex<double>* alpha, const std::complex<double> x[], const int* incx, std::complex<double> y[], const int* incy);
+void PREFIX ZCOPY_F77(const int* n, const std::complex<double> *x, const int* incx, std::complex<double> *y, const int* incy);
+void PREFIX ZSCAL_F77(const int* n, const std::complex<double>* alpha, std::complex<double> *x, const int* incx);
+void PREFIX ZSWAP_F77(const int* const n, std::complex<double>* const x, const int* const incx,
+                      std::complex<double>* const y, const int* const incy);
+int PREFIX IZAMAX_F77(const int* n, const std::complex<double> *x, const int* incx);
+
+#endif /* defined(HAVE_TEUCHOS_COMPLEX) && defined(__cplusplus) */
+
+/* Single precision BLAS 1 */
+#ifdef HAVE_TEUCHOS_BLASFLOAT
+#  ifdef HAVE_TEUCHOS_BLASFLOAT_APPLE_VECLIB_BUGFIX
+#    include <vecLib/cblas.h>
+#  elif defined(HAVE_TEUCHOS_BLASFLOAT_DOUBLE_RETURN)
+double PREFIX SASUM_F77(const int* n, const float x[], const int* incx);
+double PREFIX SDOT_F77(const int* n, const float x[], const int* incx, const float y[], const int* incy);
+double PREFIX SNRM2_F77(const int* n, const float x[], const int* incx);
+#  else
+float PREFIX SASUM_F77(const int* n, const float x[], const int* incx);
+float PREFIX SDOT_F77(const int* n, const float x[], const int* incx, const float y[], const int* incy);
+float PREFIX SNRM2_F77(const int* n, const float x[], const int* incx);
+#  endif // which blasfloat
+#endif // ifdef blasfloat
+void PREFIX SROTG_F77(float* da, float* db, float* c, float* s);
+void PREFIX SROT_F77(const int* n, float* dx, const int* incx, float* dy, const int* incy, float* c, float* s);
+void PREFIX SAXPY_F77(const int* n, const float* alpha, const float x[], const int* incx, float y[], const int* incy);
+void PREFIX SCOPY_F77(const int* n, const float *x, const int* incx, float *y, const int* incy);
+void PREFIX SSCAL_F77(const int* n, const float* alpha, float *x, const int* incx);
+void PREFIX SSWAP_F77(const int* const n, float* const x, const int* const incx,
+                      float* const y, const int* const incy);
+int PREFIX ISAMAX_F77(const int* n, const float *x, const int* incx);
+
+/* Single std::complex precision BLAS 1 */
+#if defined(HAVE_TEUCHOS_COMPLEX) && defined(__cplusplus)
+#  if defined(HAVE_TEUCHOS_BLASFLOAT)
+#    if defined(HAVE_TEUCHOS_BLASFLOAT_APPLE_VECLIB_BUGFIX)
+// no declarations; they're in cblas.h
+#      include <vecLib/cblas.h>
+#    elif defined(HAVE_TEUCHOS_BLASFLOAT_DOUBLE_RETURN)
+double PREFIX SCASUM_F77(const int* n, const std::complex<float> x[], const int* incx);
+double PREFIX SCNRM2_F77(const int* n, const std::complex<float> x[], const int* incx);
+#    else
+float PREFIX SCASUM_F77(const int* n, const std::complex<float> x[], const int* incx);
+float PREFIX SCNRM2_F77(const int* n, const std::complex<float> x[], const int* incx);
+#  endif // Whether or not we have the veclib bugfix
+#endif // defined(HAVE_TEUCHOS_BLASFLOAT)
+
+#if defined(HAVE_TEUCHOS_BLASFLOAT_APPLE_VECLIB_BUGFIX)
+// no declarations; they're in cblas.h
+#include <vecLib/cblas.h>
+#elif defined(HAVE_COMPLEX_BLAS_PROBLEM) && defined(HAVE_FIXABLE_COMPLEX_BLAS_PROBLEM)
+void PREFIX CDOT_F77(std::complex<float> *ret, const int* n, const std::complex<float> x[], const int* incx, const std::complex<float> y[], const int* incy);
+#elif defined(HAVE_TEUCHOS_BLASFLOAT)
+Teuchos_Complex_float_type_name PREFIX CDOT_F77(const int* n, const std::complex<float> x[], const int* incx, const std::complex<float> y[], const int* incy);
+#else
+// the code is literally in Teuchos_BLAS.cpp
+#endif
+
+void PREFIX CROTG_F77(std::complex<float>* da, std::complex<float>* db, float* c, std::complex<float>* s);
+void PREFIX CROT_F77(const int* n, std::complex<float>* dx, const int* incx, std::complex<float>* dy, const int* incy, float* c, std::complex<float>* s);
+void PREFIX CAXPY_F77(const int* n, const std::complex<float>* alpha, const std::complex<float> x[], const int* incx, std::complex<float> y[], const int* incy);
+void PREFIX CCOPY_F77(const int* n, const std::complex<float> *x, const int* incx, std::complex<float> *y, const int* incy);
+void PREFIX CSCAL_F77(const int* n, const std::complex<float>* alpha, std::complex<float> *x, const int* incx);
+void PREFIX CSWAP_F77(const int* const n, std::complex<float>* const x, const int* const incx,
+                      std::complex<float>* const y, const int* const incy);
+int PREFIX ICAMAX_F77(const int* n, const std::complex<float> *x, const int* incx);
+
+#endif /* defined(HAVE_TEUCHOS_COMPLEX) && defined(__cplusplus) */
+
+/* Double precision BLAS 2 */
+void PREFIX DGEMV_F77(Teuchos_fcd, const int* m, const int* n, const double* alpha, const double A[], const int* lda,
+                 const double x[], const int* incx, const double* beta, double y[], const int* incy);
+void PREFIX DTRMV_F77(Teuchos_fcd, Teuchos_fcd, Teuchos_fcd, const int *n,
+                const double *a, const int *lda, double *x, const int *incx);
+void PREFIX DGER_F77(const int *m, const int *n, const double *alpha, const double *x, const int *incx, const double *y,
+               const int *incy, double *a, const int *lda);
+
+/* Double precision BLAS 2 */
+#if defined(HAVE_TEUCHOS_COMPLEX) && defined(__cplusplus)
+
+void PREFIX ZGEMV_F77(Teuchos_fcd, const int* m, const int* n, const std::complex<double>* alpha, const std::complex<double> A[], const int* lda,
+                 const std::complex<double> x[], const int* incx, const std::complex<double>* beta, std::complex<double> y[], const int* incy);
+void PREFIX ZTRMV_F77(Teuchos_fcd, Teuchos_fcd, Teuchos_fcd, const int *n,
+                const std::complex<double> *a, const int *lda, std::complex<double> *x, const int *incx);
+void PREFIX ZGER_F77(const int *m, const int *n, const std::complex<double> *alpha, const std::complex<double> *x, const int *incx, const std::complex<double> *y,
+               const int *incy, std::complex<double> *a, const int *lda);
+
+#endif /* defined(HAVE_TEUCHOS_COMPLEX) && defined(__cplusplus) */
+
+/* Single precision BLAS 2 */
+void PREFIX SGEMV_F77(Teuchos_fcd, const int* m, const int* n, const float* alpha, const float A[], const int* lda,
+                 const float x[], const int* incx, const float* beta, float y[], const int* incy);
+void PREFIX STRMV_F77(Teuchos_fcd, Teuchos_fcd, Teuchos_fcd, const int *n,
+                const float *a, const int *lda, float *x, const int *incx);
+void PREFIX SGER_F77(const int *m, const int *n, const float *alpha, const float *x, const int *incx, const float *y,
+               const int *incy, float *a, const int *lda);
+
+/* Single std::complex precision BLAS 2 */
+#if defined(HAVE_TEUCHOS_COMPLEX) && defined(__cplusplus)
+
+void PREFIX CGEMV_F77(Teuchos_fcd, const int* m, const int* n, const std::complex<float>* alpha, const std::complex<float> A[], const int* lda,
+                 const std::complex<float> x[], const int* incx, const std::complex<float>* beta, std::complex<float> y[], const int* incy);
+void PREFIX CTRMV_F77(Teuchos_fcd, Teuchos_fcd, Teuchos_fcd, const int *n,
+                const std::complex<float> *a, const int *lda, std::complex<float> *x, const int *incx);
+void PREFIX CGER_F77(const int *m, const int *n, const std::complex<float> *alpha, const std::complex<float> *x, const int *incx, const std::complex<float> *y,
+               const int *incy, std::complex<float> *a, const int *lda);
+
+#endif /* defined(HAVE_TEUCHOS_COMPLEX) && defined(__cplusplus) */
+
+/* Double precision BLAS 3 */
+void PREFIX DGEMM_F77(Teuchos_fcd, Teuchos_fcd, const int *m, const int *
+                n, const int *k, const double *alpha, const double *a, const int *lda,
+                const double *b, const int *ldb, const double *beta, double *c, const int *ldc);
+void PREFIX DSYMM_F77(Teuchos_fcd, Teuchos_fcd, const int *m, const int * n,
+                const double *alpha, const double *a, const int *lda,
+                const double *b, const int *ldb, const double *beta, double *c, const int *ldc);
+void PREFIX DSYRK_F77(Teuchos_fcd, Teuchos_fcd, const int *n, const int * k,
+                const double *alpha, const double *a, const int *lda,
+                const double *beta, double *c, const int *ldc);
+void PREFIX DTRMM_F77(Teuchos_fcd, Teuchos_fcd, Teuchos_fcd, Teuchos_fcd,
+                const int *m, const int *n, const double *alpha, const double *a, const int * lda, double *b, const int *ldb);
+void PREFIX DTRSM_F77(Teuchos_fcd, Teuchos_fcd, Teuchos_fcd, Teuchos_fcd,
+                const int *m, const int *n, const double *alpha, const double *a, const int *
+                lda, double *b, const int *ldb);
+
+/* Double std::complex precision BLAS 3 */
+#if defined(HAVE_TEUCHOS_COMPLEX) && defined(__cplusplus)
+
+void PREFIX ZGEMM_F77(Teuchos_fcd, Teuchos_fcd, const int *m, const int *
+                n, const int *k, const std::complex<double> *alpha, const std::complex<double> *a, const int *lda,
+                const std::complex<double> *b, const int *ldb, const std::complex<double> *beta, std::complex<double> *c, const int *ldc);
+void PREFIX ZSYMM_F77(Teuchos_fcd, Teuchos_fcd, const int *m, const int * n,
+                const std::complex<double> *alpha, const std::complex<double> *a, const int *lda,
+                const std::complex<double> *b, const int *ldb, const std::complex<double> *beta, std::complex<double> *c, const int *ldc);
+void PREFIX ZSYRK_F77(Teuchos_fcd, Teuchos_fcd, const int *n, const int * k,
+                const std::complex<double> *alpha, const std::complex<double> *a, const int *lda,
+                const std::complex<double> *beta, std::complex<double> *c, const int *ldc);
+void PREFIX ZHERK_F77(Teuchos_fcd, Teuchos_fcd, const int *n, const int * k,
+                const std::complex<double> *alpha, const std::complex<double> *a, const int *lda,
+                const std::complex<double> *beta, std::complex<double> *c, const int *ldc);
+void PREFIX ZTRMM_F77(Teuchos_fcd, Teuchos_fcd, Teuchos_fcd, Teuchos_fcd,
+                const int *m, const int *n, const std::complex<double> *alpha, const std::complex<double> *a, const int * lda, std::complex<double> *b, const int *ldb);
+void PREFIX ZTRSM_F77(Teuchos_fcd, Teuchos_fcd, Teuchos_fcd, Teuchos_fcd,
+                const int *m, const int *n, const std::complex<double> *alpha, const std::complex<double> *a, const int *
+                lda, std::complex<double> *b, const int *ldb);
+
+#endif /* defined(HAVE_TEUCHOS_COMPLEX) && defined(__cplusplus) */
+
+/* Single precision BLAS 3 */
+void PREFIX SGEMM_F77(Teuchos_fcd, Teuchos_fcd, const int *m, const int *
+                n, const int *k, const float *alpha, const float *a, const int *lda,
+                const float *b, const int *ldb, const float *beta, float *c, const int *ldc);
+void PREFIX SSYMM_F77(Teuchos_fcd, Teuchos_fcd, const int *m, const int * n,
+                const float *alpha, const float *a, const int *lda,
+                const float *b, const int *ldb, const float *beta, float *c, const int *ldc);
+void PREFIX SSYRK_F77(Teuchos_fcd, Teuchos_fcd, const int *n, const int * k,
+                const float *alpha, const float *a, const int *lda,
+                const float *beta, float *c, const int *ldc);
+void PREFIX STRMM_F77(Teuchos_fcd, Teuchos_fcd, Teuchos_fcd, Teuchos_fcd,
+                const int *m, const int *n, const float *alpha, const float *a, const int * lda, float *b, const int *ldb);
+void PREFIX STRSM_F77(Teuchos_fcd, Teuchos_fcd, Teuchos_fcd, Teuchos_fcd,
+                const int *m, const int *n, const float *alpha, const float *a, const int *
+                lda, float *b, const int *ldb);
+
+/* Single std::complex precision BLAS 3 */
+
+#if defined(HAVE_TEUCHOS_COMPLEX) && defined(__cplusplus)
+
+void PREFIX CGEMM_F77(Teuchos_fcd, Teuchos_fcd, const int *m, const int *
+                n, const int *k, const std::complex<float> *alpha, const std::complex<float> *a, const int *lda,
+                const std::complex<float> *b, const int *ldb, const std::complex<float> *beta, std::complex<float> *c, const int *ldc);
+void PREFIX CSYMM_F77(Teuchos_fcd, Teuchos_fcd, const int *m, const int * n,
+                const std::complex<float> *alpha, const std::complex<float> *a, const int *lda,
+                const std::complex<float> *b, const int *ldb, const std::complex<float> *beta, std::complex<float> *c, const int *ldc);
+void PREFIX CTRMM_F77(Teuchos_fcd, Teuchos_fcd, Teuchos_fcd, Teuchos_fcd,
+                const int *m, const int *n, const std::complex<float> *alpha, const std::complex<float> *a, const int * lda, std::complex<float> *b, const int *ldb);
+void PREFIX CSYRK_F77(Teuchos_fcd, Teuchos_fcd, const int *n, const int * k,
+                const std::complex<float> *alpha, const std::complex<float> *a, const int *lda,
+                const std::complex<float> *beta, std::complex<float> *c, const int *ldc);
+void PREFIX CHERK_F77(Teuchos_fcd, Teuchos_fcd, const int *n, const int * k,
+                const std::complex<float> *alpha, const std::complex<float> *a, const int *lda,
+                const std::complex<float> *beta, std::complex<float> *c, const int *ldc);
+void PREFIX CTRSM_F77(Teuchos_fcd, Teuchos_fcd, Teuchos_fcd, Teuchos_fcd,
+                const int *m, const int *n, const std::complex<float> *alpha, const std::complex<float> *a, const int *
+                lda, std::complex<float> *b, const int *ldb);
+
+#endif /* defined(HAVE_TEUCHOS_COMPLEX) && defined(__cplusplus) */
+
+#ifdef __cplusplus
+}
+#endif
+
+/* Don't leave a global macros called PREFIX or Teuchos_fcd laying around */
+
+#ifdef PREFIX
+#undef PREFIX
+#endif
+
+#ifdef Teuchos_fcd
+#undef Teuchos_fcd
+#endif
+
+#endif /* end of TEUCHOS_BLAS_WRAPPERS_HPP_ */
diff --git a/packages/rol/src/compatibility/teuchos-lite/Teuchos_CompObject.cpp b/packages/rol/src/compatibility/teuchos-lite/Teuchos_CompObject.cpp
new file mode 100644
index 000000000000..c8c45062e4b4
--- /dev/null
+++ b/packages/rol/src/compatibility/teuchos-lite/Teuchos_CompObject.cpp
@@ -0,0 +1,69 @@
+// @HEADER
+// ***********************************************************************
+//
+//                    Teuchos: Common Tools Package
+//                 Copyright (2004) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
+//
+// ***********************************************************************
+// @HEADER
+
+// Kris
+// 07.08.03 -- Move into Teuchos package/namespace
+
+// Constructor
+
+#include "Teuchos_CompObject.hpp"
+
+namespace Teuchos
+{
+
+CompObject::CompObject() : flopCounter_(0)
+{
+}
+
+// Copy Constructor
+
+CompObject::CompObject(const CompObject& source) : flopCounter_(source.flopCounter_)
+{
+}
+
+// Destructor
+
+CompObject::~CompObject()
+{
+  flopCounter_ = 0;
+}
+
+} // namespace Teuchos
diff --git a/packages/rol/src/compatibility/teuchos-lite/Teuchos_CompObject.hpp b/packages/rol/src/compatibility/teuchos-lite/Teuchos_CompObject.hpp
new file mode 100644
index 000000000000..81f0e52c5ce2
--- /dev/null
+++ b/packages/rol/src/compatibility/teuchos-lite/Teuchos_CompObject.hpp
@@ -0,0 +1,132 @@
+// @HEADER
+// ***********************************************************************
+//
+//                    Teuchos: Common Tools Package
+//                 Copyright (2004) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
+//
+// ***********************************************************************
+// @HEADER
+
+// Kris
+// 07.08.03 -- Move into Teuchos package/namespace
+
+#ifndef TEUCHOS_COMPOBJECT_HPP
+#define TEUCHOS_COMPOBJECT_HPP
+
+/*! \file Teuchos_CompObject.hpp
+    \brief Object for storing data and providing functionality that is common to all
+	computational classes.
+*/
+
+#include "Teuchos_Object.hpp"
+#include "Teuchos_Flops.hpp"
+
+/*! \class Teuchos::CompObject
+    \brief Functionality and data that is common to all computational classes.
+
+    The Teuchos::CompObject is a base class for all Teuchos computational objects.  It provides the basic
+    mechanisms and interface specifications for floating point operations using Teuchos::Flops objects.
+*/
+
+namespace Teuchos
+{
+class TEUCHOSNUMERICS_LIB_DLL_EXPORT CompObject
+{
+
+  public:
+
+    //! @name Constructors/Destructor.
+  //@{
+
+  //! Default constructor
+  CompObject();
+
+  //! Copy Constructor
+  CompObject(const CompObject &source);
+
+  //! Destructor
+  virtual ~CompObject();
+  //@}
+
+  //! @name Set/Get counter method.
+  //@{
+  //! Set the internal Teuchos::Flops() pointer.
+  void setFlopCounter(const Flops &FlopCounter) {flopCounter_= (Flops *) &FlopCounter; return;}
+
+  //! Set the internal Teuchos::Flops() pointer to the flop counter of another Teuchos::CompObject.
+  void setFlopCounter(const CompObject &compObject) {flopCounter_= (Flops *) (compObject.getFlopCounter()); return;}
+
+  //! Set the internal Teuchos::Flops() pointer to 0 (no flops counted).
+  void unsetFlopCounter() {flopCounter_=0; return;}
+
+  //! Get the pointer to the Teuchos::Flops() object associated with this object, returns 0 if none.
+  Flops * getFlopCounter() const {return(flopCounter_);}
+  //@}
+
+  //! @name Set flop count methods.
+  //@{
+  //! Resets the number of floating point operations to zero for \e this multi-std::vector.
+  void resetFlops() const {if (flopCounter_!=0) flopCounter_->resetFlops(); return;}
+
+  //! Returns the number of floating point operations with \e this multi-std::vector.
+  double getFlops() const {if (flopCounter_!=0) return(flopCounter_->flops()); else return(0.0);}
+  //@}
+
+  //! @name Update flop count methods.
+  //@{
+  //! Increment Flop count for \e this object
+  void updateFlops(int addflops) const { if (flopCounter_!=0) flopCounter_->updateFlops(addflops); return;}
+
+  //! Increment Flop count for \e this object
+  void updateFlops(long int addflops) const { if (flopCounter_!=0) flopCounter_->updateFlops(addflops); return;}
+
+  //! Increment Flop count for \e this object
+  void updateFlops(double addflops) const { if (flopCounter_!=0) flopCounter_->updateFlops(addflops); return;}
+
+  //! Increment Flop count for \e this object
+  void updateFlops(float addflops) const {if (flopCounter_!=0) flopCounter_->updateFlops(addflops); return;}
+  //@}
+
+ protected:
+
+  Flops *flopCounter_;
+
+};
+
+  // #include "Teuchos_CompObject.cpp"
+
+} // namespace Teuchos
+
+#endif // end of TEUCHOS_COMPOBJECT_HPP
diff --git a/packages/rol/src/compatibility/teuchos-lite/Teuchos_ConfigDefs.hpp b/packages/rol/src/compatibility/teuchos-lite/Teuchos_ConfigDefs.hpp
new file mode 100644
index 000000000000..67e859a213a8
--- /dev/null
+++ b/packages/rol/src/compatibility/teuchos-lite/Teuchos_ConfigDefs.hpp
@@ -0,0 +1,173 @@
+/*
+// @HEADER
+// ***********************************************************************
+//
+//                    Teuchos: Common Tools Package
+//                 Copyright (2004) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
+//
+// ***********************************************************************
+// @HEADER
+*/
+
+#ifndef TEUCHOS_CONFIGDEFS_HPP
+#define TEUCHOS_CONFIGDEFS_HPP
+
+/*! \file Teuchos_ConfigDefs.hpp
+    \brief Teuchos header file which uses auto-configuration information
+        to include necessary C++ headers.
+*/
+
+#include "TeuchosCore_config.h"
+#include "Teuchos_config.h"
+
+#ifdef HAVE_TEUCHOS_DEBUG
+#  define TEUCHOS_DEBUG
+#  define HAVE_TEUCHOS_ARRAY_BOUNDSCHECK
+#endif
+
+#ifdef __cplusplus
+
+#if defined(_MSC_VER) || defined(__APPLE__)
+#  define TEUCHOS_NO_ZERO_ITERATOR_CONVERSION
+#endif
+
+#if defined(__IBMC__) || defined(__IBMCPP__)
+#  ifndef TEMPLATE_FRIENDS_NOT_SUPPORTED
+#    define TEMPLATE_FRIENDS_NOT_SUPPORTED
+#  endif
+#  ifndef TEUCHOS_PRIVIATE_DELETE_NOT_SUPPORTED
+#    define TEUCHOS_PRIVIATE_DELETE_NOT_SUPPORTED
+#  endif
+#endif
+
+/* Deprecated */
+#ifndef HAVE_COMPLEX
+#  define HAVE_COMPLEX
+#endif
+
+// #include <cstdio>
+// #include <cstdarg>
+// #include <cerrno>
+// #include <climits>
+#include <cstdlib>
+#include <string>
+#include <cstring>
+#include <cmath>
+#include <iostream>
+#include <iomanip>
+// #include <fstream>
+#include <sstream>
+#include <stdexcept>
+#include <cassert>
+#include <complex>
+// #include <map>
+// #include <vector>
+// #include <deque>
+#include <algorithm>
+// #include <numeric>
+// #include <list>
+// #include <set>
+// #include <typeinfo>
+#include <limits>
+// #include <memory>
+#include <cstddef>
+
+/* Avoid duplicating instantiation provided by IBM XL C++ runtime library. */
+#if defined(__IBMCPP__)
+# pragma do_not_instantiate std::fpos<mbstate_t>
+#endif
+
+const double Teuchos_MinDouble = 1.0E-100;
+const double Teuchos_MaxDouble = 1.0E+100;
+const double Teuchos_Overflow = 1.79E308; // Used to test if equilibration should be done.
+const double Teuchos_Underflow = 2.23E-308;
+
+#else /* __cplusplus */
+
+#include <stddef.h>
+
+#endif /* __cplusplus */
+
+/* Delete any previous definition of TEUCHOS_NO_ERROR_REPORTS */
+
+#ifdef TEUCHOS_CHK_ERR
+#undef TEUCHOS_CHK_ERR
+#endif
+#ifdef TEUCHOS_CHK_PTR
+#undef TEUCHOS_CHK_PTR
+#endif
+#ifdef TEUCHOS_CHK_REF
+#undef TEUCHOS_CHK_REF
+#endif
+
+/* The integral type that is used for the largest ordinal values on this
+ * machine.
+ *
+ * On a 32 bit machine, ptrdiff_t will be an unsighed 32 bit integer and on a
+ * 64 bit machine it will be an unsigned 64 bit integer.  Just what I want!
+*/
+typedef TEUCHOS_ORDINAL_TYPE Teuchos_Ordinal;
+
+#ifdef __cplusplus
+namespace Teuchos { typedef Teuchos_Ordinal Ordinal; }
+#endif /* __cplusplus */
+
+/* Deprecated (use Teuchos_Ordinal instead) */
+TEUCHOS_DEPRECATED typedef Teuchos_Ordinal Teuchos_Index;
+
+/* Make error report silent by defining TEUCHOS_NO_ERROR_REPORTS */
+
+#define TEUCHOS_CHK_ERR(a) { if (a != 0)  return(a);}
+#define TEUCHOS_CHK_PTR(a) { return(a);}
+#define TEUCHOS_CHK_REF(a) { return(a);}
+
+#ifdef __cplusplus
+const int Teuchos_DefaultTracebackMode = 1; /* Default value for traceback behavior */
+#endif /* __cplusplus */
+
+/* Define some macros */
+#define TEUCHOS_MAX(x,y) (( (x) > (y) ) ? (x)  : (y) )     /* max function  */
+#define TEUCHOS_MIN(x,y) (( (x) < (y) ) ? (x)  : (y) )     /* min function  */
+#define TEUCHOS_SGN(x)   (( (x) < 0.0 ) ? -1.0 : 1.0 )     /* sign function */
+
+#ifndef HAVE_FORTRAN_SUPPORT
+#  ifndef FORTRAN_DISABLED
+#    define FORTRAN_DISABLED
+#  endif
+#endif
+
+#include "Teuchos_DLLExportMacro.h"
+
+#endif /* TEUCHOS_CONFIGDEFS_HPP */
diff --git a/packages/rol/src/compatibility/teuchos-lite/Teuchos_DLLExportMacro.h b/packages/rol/src/compatibility/teuchos-lite/Teuchos_DLLExportMacro.h
new file mode 100644
index 000000000000..31d469c2bd0e
--- /dev/null
+++ b/packages/rol/src/compatibility/teuchos-lite/Teuchos_DLLExportMacro.h
@@ -0,0 +1,45 @@
+
+#if defined (_WIN32) && defined (BUILD_SHARED_LIBS)
+#  if defined(TEUCHOSCORE_LIB_EXPORTS_MODE)
+#    define TEUCHOSCORE_LIB_DLL_EXPORT __declspec(dllexport)
+#  else
+#    define TEUCHOSCORE_LIB_DLL_EXPORT __declspec(dllimport)
+#  endif
+#else
+#  define TEUCHOSCORE_LIB_DLL_EXPORT
+#endif
+
+#if defined (_WIN32) && defined (BUILD_SHARED_LIBS)
+#  if defined(TEUCHOSCOMM_LIB_EXPORTS_MODE)
+#    define TEUCHOSCOMM_LIB_DLL_EXPORT __declspec(dllexport)
+#  else
+#    define TEUCHOSCOMM_LIB_DLL_EXPORT __declspec(dllimport)
+#  endif
+#else
+#  define TEUCHOSCOMM_LIB_DLL_EXPORT
+#endif
+
+#if defined (_WIN32) && defined (BUILD_SHARED_LIBS)
+#  if defined(TEUCHOSPARAMETERLIST_LIB_EXPORTS_MODE)
+#    define TEUCHOSPARAMETERLIST_LIB_DLL_EXPORT __declspec(dllexport)
+#  else
+#    define TEUCHOSPARAMETERLIST_LIB_DLL_EXPORT __declspec(dllimport)
+#  endif
+#else
+#  define TEUCHOSPARAMETERLIST_LIB_DLL_EXPORT
+#endif
+
+#if defined (_WIN32) && defined (BUILD_SHARED_LIBS)
+#  if defined(TEUCHOSNUMERICS_LIB_EXPORTS_MODE)
+#    define TEUCHOSNUMERICS_LIB_DLL_EXPORT __declspec(dllexport)
+#  else
+#    define TEUCHOSNUMERICS_LIB_DLL_EXPORT __declspec(dllimport)
+#  endif
+#else
+#  define TEUCHOSNUMERICS_LIB_DLL_EXPORT
+#endif
+
+/* Defining this to allow the backwards compatibility testing to work. This is not needed for library code */
+#define TEUCHOS_LIB_DLL_EXPORT
+
+/* There is not export stuff used in the remainder subpackage yet. */
diff --git a/packages/rol/src/compatibility/teuchos-lite/Teuchos_DataAccess.hpp b/packages/rol/src/compatibility/teuchos-lite/Teuchos_DataAccess.hpp
new file mode 100644
index 000000000000..6f41f93c9f98
--- /dev/null
+++ b/packages/rol/src/compatibility/teuchos-lite/Teuchos_DataAccess.hpp
@@ -0,0 +1,67 @@
+// @HEADER
+// ***********************************************************************
+//
+//                    Teuchos: Common Tools Package
+//                 Copyright (2004) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
+//
+// ***********************************************************************
+// @HEADER
+
+// Kris
+// 07.08.03 -- Move into Teuchos package/namespace
+
+#ifndef _TEUCHOS_DATAACCESS_HPP_
+#define _TEUCHOS_DATAACCESS_HPP_
+
+/*! \file Teuchos_DataAccess.hpp
+    \brief Teuchos::DataAccess Mode enumerable type
+*/
+
+namespace Teuchos {
+	
+	/*! \enum DataAccess
+      If set to Copy, user data will be copied at construction.
+      If set to View, user data will be encapsulated and used throughout
+      the life of the object.
+	*/
+	
+	enum DataAccess {
+		Copy, /*!< User data will be copied at construction. */
+		View /*!< User data will be encapsulated and used throughout the life of the object. */
+	};
+
+} // namespace Teuchos
+
+#endif /* _TEUCHOS_DATAACCESS_HPP_ */
diff --git a/packages/rol/src/compatibility/teuchos-lite/Teuchos_Flops.hpp b/packages/rol/src/compatibility/teuchos-lite/Teuchos_Flops.hpp
new file mode 100644
index 000000000000..e33ee3efe2b5
--- /dev/null
+++ b/packages/rol/src/compatibility/teuchos-lite/Teuchos_Flops.hpp
@@ -0,0 +1,138 @@
+// @HEADER
+// ***********************************************************************
+//
+//                    Teuchos: Common Tools Package
+//                 Copyright (2004) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
+//
+// ***********************************************************************
+// @HEADER
+
+// Kris
+// 07.08.03 -- Move into Teuchos package/namespace
+
+#ifndef TEUCHOS_FLOPS_HPP
+#define TEUCHOS_FLOPS_HPP
+
+/*! \file Teuchos_Flops.hpp
+    \brief Object for providing basic support and consistent interfaces for
+	counting/reporting floating-point operations performed in Teuchos computational
+	classes.
+*/
+
+/*! \class Teuchos::Flops
+    \brief The Teuchos Floating Point Operations Class.
+
+    The Teuchos_Flops class provides basic support and consistent interfaces
+    for counting and reporting floating point operations performed in
+    the Teuchos computational classes.  All classes based on the Teuchos::CompObject
+    can count flops by the user creating an Teuchos::Flops object and calling the SetFlopCounter()
+    method for an Teuchos_CompObject.
+*/
+
+namespace Teuchos
+{
+class Flops
+{
+  public:
+
+    //! @name Constructor/Destructor.
+  //@{
+
+  //! Default Constructor.
+  /*! Creates a Flops instance. This instance can be queried for
+      the number of floating point operations performed for the associated
+      \e this object.
+  */
+  Flops();
+
+  //! Copy Constructor.
+  /*! Makes an exact copy of an existing Flops instance.
+  */
+  Flops(const Flops &flops);
+
+  //! Destructor.
+  /*! Completely deletes a Flops object.
+  */
+  virtual ~Flops();
+
+  //@}
+
+  //! @name Accessor methods.
+  //@{
+
+  //! Returns the number of floating point operations with \e this object and resets the count.
+  double flops() const { return flops_; }
+
+  //@}
+
+  //! @name Reset methods.
+  //@{
+
+  //! Resets the number of floating point operations to zero for \e this multi-std::vector.
+  void resetFlops() {flops_ = 0.0;}
+
+  //@}
+
+  friend class CompObject;
+
+ protected:
+
+  mutable double flops_;
+
+  //! @name Updating methods.
+  //@{
+  //! Increment Flop count for \e this object from an int
+  void updateFlops(int addflops) const {flops_ += (double) addflops; }
+
+  //! Increment Flop count for \e this object from a long int
+  void updateFlops(long int addflops) const {flops_ += (double) addflops; }
+
+  //! Increment Flop count for \e this object from a double
+  void updateFlops(double addflops) const {flops_ += (double) addflops; }
+
+  //! Increment Flop count for \e this object from a float
+  void updateFlops(float addflops) const {flops_ += (double) addflops; }
+
+  //@}
+
+ private:
+
+};
+
+  // #include "Teuchos_Flops.cpp"
+
+} // namespace Teuchos
+
+#endif // end of TEUCHOS_FLOPS_HPP
diff --git a/packages/rol/src/compatibility/teuchos-lite/Teuchos_Object.hpp b/packages/rol/src/compatibility/teuchos-lite/Teuchos_Object.hpp
new file mode 100644
index 000000000000..6c6c1bb31ffa
--- /dev/null
+++ b/packages/rol/src/compatibility/teuchos-lite/Teuchos_Object.hpp
@@ -0,0 +1,159 @@
+// @HEADER
+// ***********************************************************************
+//
+//                    Teuchos: Common Tools Package
+//                 Copyright (2004) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
+//
+// ***********************************************************************
+// @HEADER
+
+// Kris
+// 07.08.03 -- Move into Teuchos package/namespace
+
+#ifndef _TEUCHOS_OBJECT_HPP_
+#define _TEUCHOS_OBJECT_HPP_
+
+/*! \file Teuchos_Object.hpp
+    \brief The base Teuchos object.
+*/
+
+#include "Teuchos_ConfigDefs.hpp"
+#include "Teuchos_DataAccess.hpp"
+
+// 2007/11/26: rabartl: This class has to change from using 'char*' to
+// std::string!
+
+/*! \class Teuchos::Object
+    \brief The base Teuchos class.
+
+    The Object class provides capabilities common to all Teuchos objects,
+    such as a label that identifies an object instance, constant definitions,
+    enum types.
+*/
+
+namespace Teuchos {
+
+class TEUCHOSNUMERICS_LIB_DLL_EXPORT Object {
+public:
+  //! @name Constructors/Destructor.
+  //@{
+  //! Default Constructor.
+  /*! Object is the primary base class in Teuchos.  All Teuchos class [sic]
+      are derived from it, directly or indirectly.  This class is seldom
+      used explictly.
+
+     \warning (mfh 23 Nov 2014) Pretty much just ignore the above
+       description of this class.  Very few classes in Teuchos inherit
+       from Object.  Such inheritance should be treated as deprecated
+       legacy behavior.  It's not 1990 and C++ is not Java 1.0; we
+       don't need a common base class for everything.
+  */
+  Object (int tracebackModeIn = -1);
+
+  //! Labeling Constructor.
+  /*! Creates an Object with the given label.
+   *
+   * LEGACY; DEPRECATE.
+  */
+  Object (const char* label, int tracebackModeIn = -1);
+
+  /// \brief Create an Object with the given label, and optionally,
+  ///   with the given traceback mode.
+  Object (const std::string& label, int tracebackModeIn = -1);
+
+  //! Destructor (virtual, for safety of derived classes).
+  virtual ~Object () {}
+
+  //@}
+  //! @name Set methods.
+  //@{
+
+  // LEGACY; REPLACE "const char*" with std::string.
+  virtual void setLabel (const char* theLabel);
+
+  /// \brief Set the value of the Object error traceback report mode.
+  ///
+  /// TracebackMode controls whether or not traceback information is
+  /// printed when run time integer errors are detected:
+  ///
+  /// <= 0 - No information report
+  ///
+  /// = 1 - Fatal (negative) values are reported
+  ///
+  /// >= 2 - All values (except zero) reported.
+  ///
+  /// \note Default is set to -1 when object is constructed.
+  static void setTracebackMode (int tracebackModeValue);
+
+  //@}
+  //! @name Accessor methods.
+  //@{
+
+  //! Access the object's label (LEGACY; return std::string instead).
+  virtual const char* label () const;
+
+  //! Get the value of the Object error traceback report mode.
+  static int getTracebackMode();
+
+  //@}
+  //! @name I/O method.
+  //@{
+
+  //! Print the object to the given output stream.
+  virtual void print (std::ostream& os) const;
+
+  //@}
+  //! @name Error reporting method.
+  //@{
+
+  //! Report an error with this Object.
+  virtual int reportError (const std::string message, int errorCode) const;
+
+  //@}
+
+  static int tracebackMode;
+
+private:
+  //! The Object's current label.
+  std::string label_;
+};
+
+/// \brief Print the given Object to the given output stream.
+/// \relates Object
+std::ostream& operator<< (std::ostream& os, const Teuchos::Object& obj);
+
+} // namespace Teuchos
+
+#endif /* _TEUCHOS_OBJECT_HPP_ */
diff --git a/packages/rol/src/compatibility/teuchos-lite/Teuchos_OrdinalTraits.hpp b/packages/rol/src/compatibility/teuchos-lite/Teuchos_OrdinalTraits.hpp
new file mode 100644
index 000000000000..4dc0f257d385
--- /dev/null
+++ b/packages/rol/src/compatibility/teuchos-lite/Teuchos_OrdinalTraits.hpp
@@ -0,0 +1,215 @@
+// @HEADER
+// ***********************************************************************
+//
+//                    Teuchos: Common Tools Package
+//                 Copyright (2004) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
+//
+// ***********************************************************************
+// @HEADER
+
+// Kris
+// 07.08.03 -- Move into Teuchos package/namespace
+
+#ifndef _TEUCHOS_ORDINALTRAITS_HPP_
+#define _TEUCHOS_ORDINALTRAITS_HPP_
+
+/*! \file Teuchos_OrdinalTraits.hpp
+  \brief Defines basic traits for the ordinal field type
+*/
+
+#include "Teuchos_ConfigDefs.hpp"
+#include <limits>
+
+/*! \struct Teuchos::OrdinalTraits
+  \brief This structure defines some basic traits for the ordinal field type.
+
+  Ordinal traits are an essential part of templated codes.  This structure offers
+  the basic traits of the templated ordinal type, like defining zero and one.
+
+  For the general type, or default implementation, an aborting function
+  is defined which should restrict implementations from using ordinal traits other than
+  the defined specializations.
+
+  \note The defined specializations for OrdinalTraits are: \c int and \c long \c int.
+*/
+
+/* This is the default structure used by OrdinalTraits<T> to produce a compile time
+   error when the specialization does not exist for type <tt>T</tt>.
+*/
+namespace Teuchos {
+
+template<class T>
+struct UndefinedOrdinalTraits
+{
+  //! This function should not compile if there is an attempt to instantiate!
+  static inline T notDefined() { return T::this_type_is_missing_a_specialization(); }
+};
+
+template<class T>
+struct OrdinalTraits {
+
+  //! Allows testing to see if ordinal traits machine parameters are defined.
+  static const bool hasMachineParameters = false;
+
+  //! Returns representation of zero for this ordinal type.
+  static inline T zero()                     { return UndefinedOrdinalTraits<T>::notDefined(); }
+
+  //! Returns representation of one for this ordinal type.
+  static inline T one()                      { return UndefinedOrdinalTraits<T>::notDefined(); }
+
+  //! \brief Returns a value designating the maximum value accessible by code using OrdinalTraits.
+  /*! For a signed ordinal, this will typically be the maximum positive value. However, for an unsigned ordinal, this will typically be one less than the
+   *  maximum value, because the maximum value will typically be reserved for designating an invalid value.
+   */
+  static inline T max()                  { return UndefinedOrdinalTraits<T>::notDefined(); }
+
+  //! Returns a value designating an invalid number. For signed types, this is typically negative one; for unsigned types, it is typically the largest value.
+  static inline T invalid()                  { return UndefinedOrdinalTraits<T>::notDefined(); }
+
+  //! Returns name of this ordinal type.
+  static inline std::string name()           { return UndefinedOrdinalTraits<T>::notDefined(); }
+};
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS
+
+template<>
+struct OrdinalTraits<char> {
+  static const bool hasMachineParameters = false;
+  static inline char zero()                   {return(0);}
+  static inline char one()                    {return(1);}
+  static inline char invalid()                {return(std::numeric_limits<char>::max());}
+  static inline char max()                    {return(std::numeric_limits<char>::max()-one());}
+  static inline std::string name()            {return("char");}
+};
+
+template<>
+struct OrdinalTraits<short int> {
+  static const bool hasMachineParameters = false;
+  static inline short int zero()              {return(0);}
+  static inline short int one()               {return(1);}
+  static inline short int invalid()           {return(-1);}
+  static inline short int max()               {return(std::numeric_limits<short int>::max());}
+  static inline std::string name()            {return("short int");}
+};
+
+template<>
+struct OrdinalTraits<int> {
+  static const bool hasMachineParameters = false;
+  static inline int zero()                   {return(0);}
+  static inline int one()                    {return(1);}
+  static inline int invalid()                {return(-1);}
+  static inline int max()                    {return(std::numeric_limits<int>::max());}
+  static inline std::string name()           {return("int");}
+};
+
+template<>
+struct OrdinalTraits<unsigned int> {
+  static const bool hasMachineParameters = false;
+  static inline unsigned int zero()                   {return(0);}
+  static inline unsigned int one()                    {return(1);}
+  static inline unsigned int invalid()                {return(std::numeric_limits<unsigned int>::max());}
+  static inline unsigned int max()                    {return(std::numeric_limits<unsigned int>::max()-1);}
+  static inline std::string name()           {return("unsigned int");}
+};
+
+template<>
+struct OrdinalTraits<long int> {
+  static const bool hasMachineParameters = false;
+  static inline long int zero()              {return(static_cast<long int>(0));}
+  static inline long int one()               {return(static_cast<long int>(1));}
+  static inline long int invalid()           {return(static_cast<long int>(-1));}
+  static inline long int max()               {return(std::numeric_limits<long int>::max());}
+  static inline std::string name()           {return("long int");}
+};
+
+template<>
+struct OrdinalTraits<long unsigned int> {
+  static const bool hasMachineParameters = false;
+  static inline long unsigned int zero()              {return(static_cast<long unsigned int>(0));}
+  static inline long unsigned int one()               {return(static_cast<long unsigned int>(1));}
+  static inline long unsigned int invalid()           {return(std::numeric_limits<long unsigned int>::max());}
+  static inline long unsigned int max()               {return(std::numeric_limits<long unsigned int>::max()-1);}
+  static inline std::string name()           {return("long unsigned int");}
+};
+
+template<>
+struct OrdinalTraits<long long int> {
+  static const bool hasMachineParameters = false;
+  static inline long long int zero()          {return(static_cast<long long int>(0));}
+  static inline long long int one()           {return(static_cast<long long int>(1));}
+  static inline long long int invalid()       {return(static_cast<long long int>(-1));}
+  static inline long long int max()           {return(std::numeric_limits<long long int>::max());}
+  static inline std::string name()            {return("long long int");}
+};
+
+template<>
+struct OrdinalTraits<unsigned long long int> {
+  static const bool hasMachineParameters = false;
+  static inline unsigned long long int zero()          {return(static_cast<unsigned long long int>(0));}
+  static inline unsigned long long int one()           {return(static_cast<unsigned long long int>(1));}
+  static inline unsigned long long int invalid()       {return(std::numeric_limits<unsigned long long int>::max());}
+  static inline unsigned long long int max()           {return(std::numeric_limits<unsigned long long int>::max()-1);}
+  static inline std::string name()                     {return("unsigned long long int");}
+};
+
+#ifdef HAVE_TEUCHOS___INT64
+
+template<>
+struct OrdinalTraits<__int64> {
+  static const bool hasMachineParameters = false;
+  static inline __int64 zero() {return(static_cast<__int64>(0));}
+  static inline __int64 one() {return(static_cast<__int64>(1));}
+  static inline __int64 invalid() {return(std::numeric_limits<__int64>::max());}
+  static inline __int64 max() {return(std::numeric_limits<__int64>::max()-1);}
+  static inline std::string name() {return("__int64");}
+};
+
+template<>
+struct OrdinalTraits<unsigned __int64> {
+  static const bool hasMachineParameters = false;
+  static inline unsigned __int64 zero() {return(static_cast<unsigned __int64>(0));}
+  static inline unsigned __int64 one() {return(static_cast<unsigned __int64>(1));}
+  static inline unsigned __int64 invalid() {return(std::numeric_limits<unsigned __int64>::max());}
+  static inline unsigned __int64 max() {return(std::numeric_limits<unsigned __int64>::max()-1);}
+  static inline std::string name() {return("unsigned __int64");}
+};
+
+#endif // HAVE_TEUCHOS___INT64
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+} // namespace Teuchos
+
+#endif // _TEUCHOS_ORDINALTRAITS_HPP_
diff --git a/packages/rol/src/compatibility/teuchos-lite/Teuchos_ScalarTraits.hpp b/packages/rol/src/compatibility/teuchos-lite/Teuchos_ScalarTraits.hpp
new file mode 100644
index 000000000000..f52b3cb83ee9
--- /dev/null
+++ b/packages/rol/src/compatibility/teuchos-lite/Teuchos_ScalarTraits.hpp
@@ -0,0 +1,1374 @@
+// @HEADER
+// ***********************************************************************
+//
+//                    Teuchos: Common Tools Package
+//                 Copyright (2004) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
+//
+// ***********************************************************************
+// @HEADER
+
+// NOTE: Before adding specializations of ScalarTraits, make sure that they do
+// not duplicate specializations already present in PyTrilinos (see
+// packages/PyTrilinos/src/Teuchos_Traits.i)
+
+// NOTE: halfPrecision and doublePrecision are not currently implemented for
+// ARPREC, GMP or the ordinal types (e.g., int, char)
+
+#ifndef _TEUCHOS_SCALARTRAITS_HPP_
+#define _TEUCHOS_SCALARTRAITS_HPP_
+
+/*! \file Teuchos_ScalarTraits.hpp
+    \brief Defines basic traits for the scalar field type.
+*/
+
+#include "Teuchos_ConfigDefs.hpp"
+
+#ifdef HAVE_TEUCHOSCORE_KOKKOS
+#include "Kokkos_Complex.hpp"
+#endif // HAVE_TEUCHOSCORE_KOKKOS
+
+#ifdef HAVE_TEUCHOS_ARPREC
+#include <arprec/mp_real.h>
+#endif
+
+#ifdef HAVE_TEUCHOSCORE_QUADMATH
+#include <quadmath.h>
+
+// Teuchos_ConfigDefs.hpp includes <iostream>, which includes
+// <ostream>.  If this ever changes, include <ostream> here.
+
+namespace std {
+
+/// \brief Overload of operator<<(std::ostream&, const T&) for T =
+///   __float128.
+///
+/// GCC / libquadmath doesn't implement an std::ostream operator<< for
+/// __float128, so we have to write our own.  At least libquadmath
+/// provides a printing function specifically for __float128.
+///
+/// FIXME (mfh 19 Mar 2015) This will break if users have already
+/// defined their own operator<< in this namespace.
+ostream&
+operator<< (std::ostream& out, const __float128& x);
+
+/// \brief Overload of operator>>(std::istream&, const T&) for T =
+///   __float128.
+///
+/// GCC / libquadmath doesn't implement an std::istream operator>> for
+/// __float128, so we have to write our own.  At least libquadmath
+/// provides an input function specifically for __float128.
+///
+/// FIXME (mfh 10 Sep 2015) This will break if users have already
+/// defined their own operator>> in this namespace.
+istream&
+operator>> (std::istream& in, __float128& x);
+
+} // namespace std
+
+#endif // HAVE_TEUCHOSCORE_QUADMATH
+
+#ifdef HAVE_TEUCHOS_QD
+#include <qd/qd_real.h>
+#include <qd/dd_real.h>
+#endif
+
+#ifdef HAVE_TEUCHOS_GNU_MP
+#include <gmp.h>
+#include <gmpxx.h>
+#endif
+
+
+#include "Teuchos_ScalarTraitsDecl.hpp"
+
+
+namespace Teuchos {
+
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS
+
+
+TEUCHOSCORE_LIB_DLL_EXPORT
+void throwScalarTraitsNanInfError( const std::string &errMsg );
+
+
+template<class Scalar>
+bool generic_real_isnaninf(const Scalar &x)
+{
+#ifdef HAVE_TEUCHOSCORE_CXX11
+  if (std::isnan(x)) return true;
+  if (std::isinf(x)) return true;
+  return false;
+#else
+  typedef std::numeric_limits<Scalar> STD_NL;
+  // IEEE says this should fail for NaN (not all compilers do not obey IEEE)
+  const Scalar tol = 1.0; // Any (bounded) number should do!
+  if (!(x <= tol) && !(x > tol)) return true;
+  // Use fact that Inf*0 = NaN (again, all compilers do not obey IEEE)
+  Scalar z = static_cast<Scalar>(0.0) * x;
+  if (!(z <= tol) && !(z > tol)) return true;
+  // As a last result use comparisons
+  if (x == STD_NL::infinity() || x == -STD_NL::infinity()) return true;
+  // We give up and assume the number is finite
+  return false;
+#endif
+}
+
+
+#define TEUCHOS_SCALAR_TRAITS_NAN_INF_ERR( VALUE, MSG ) \
+  if (isnaninf(VALUE)) { \
+    std::ostringstream omsg; \
+    omsg << MSG; \
+    Teuchos::throwScalarTraitsNanInfError(omsg.str());  \
+  }
+
+
+template<>
+struct ScalarTraits<char>
+{
+  typedef char magnitudeType;
+  typedef char halfPrecision;
+  typedef char doublePrecision;
+  typedef char coordinateType;
+  static const bool isComplex = false;
+  static const bool isOrdinal = true;
+  static const bool isComparable = true;
+  static const bool hasMachineParameters = false;
+  // Not defined: eps(), sfmin(), base(), prec(), t(), rnd(), emin(), rmin(), emax(), rmax()
+  static inline magnitudeType magnitude(char a) { return static_cast<char>(std::fabs(static_cast<double>(a))); }
+  static inline char zero()  { return 0; }
+  static inline char one()   { return 1; }
+  static inline char conjugate(char x) { return x; }
+  static inline char real(char x) { return x; }
+  static inline char imag(char) { return 0; }
+  static inline bool isnaninf(char ) { return false; }
+  static inline void seedrandom(unsigned int s) {
+    std::srand(s);
+#ifdef __APPLE__
+    // throw away first random number to address bug 3655
+    // http://software.sandia.gov/bugzilla/show_bug.cgi?id=3655
+    random();
+#endif
+  }
+  //static inline char random() { return (-1 + 2*rand()); } // RAB: This version should be used to be consistent with others
+  static inline char random() { return std::rand(); } // RAB: This version should be used for an unsigned char, not char
+  static inline std::string name() { return "char"; }
+  static inline char squareroot(char x) { return (char) std::sqrt((double) x); }
+  static inline char pow(char x, char y) { return (char) std::pow((double)x,(double)y); }
+  static inline char log(char x) { return static_cast<char> (std::log (static_cast<double> (x))); }
+  static inline char log10(char x) { return static_cast<char> (std::log10 (static_cast<double> (x))); }
+};
+
+
+template<>
+struct ScalarTraits<short int>
+{
+  typedef short int magnitudeType;
+  typedef short int halfPrecision;
+  typedef short int doublePrecision;
+  typedef short int coordinateType;
+  static const bool isComplex = false;
+  static const bool isOrdinal = true;
+  static const bool isComparable = true;
+  static const bool hasMachineParameters = false;
+  // Not defined: eps(), sfmin(), base(), prec(), t(), rnd(), emin(), rmin(), emax(), rmax()
+  static inline magnitudeType magnitude(short int a) { return static_cast<short int>(std::fabs(static_cast<double>(a))); }
+  static inline short int zero()  { return 0; }
+  static inline short int one()   { return 1; }
+  static inline short int conjugate(short int x) { return x; }
+  static inline short int real(short int x) { return x; }
+  static inline short int imag(short int) { return 0; }
+  static inline bool isnaninf(short int) { return false; }
+  static inline void seedrandom(unsigned int s) {
+    std::srand(s);
+#ifdef __APPLE__
+    // throw away first random number to address bug 3655
+    // http://software.sandia.gov/bugzilla/show_bug.cgi?id=3655
+    random();
+#endif
+  }
+  //static inline int random() { return (-1 + 2*rand()); }  // RAB: This version should be used to be consistent with others
+  static inline short int random() { return std::rand(); }             // RAB: This version should be used for an unsigned int, not int
+  static inline std::string name() { return "short int"; }
+  static inline short int squareroot(short int x) { return (short int) std::sqrt((double) x); }
+  static inline short int pow(short int x, short int y) { return (short int) std::pow((double)x,(double)y); }
+  static inline short int log(short int x) { return static_cast<short int> (std::log (static_cast<double> (x))); }
+  static inline short int log10(short int x) { return static_cast<short int> (std::log10 (static_cast<double> (x))); }
+};
+
+template<>
+struct ScalarTraits<unsigned short int>
+{
+  typedef unsigned short int magnitudeType;
+  typedef unsigned short int halfPrecision;
+  typedef unsigned short int doublePrecision;
+  typedef unsigned short int coordinateType;
+  static const bool isComplex = false;
+  static const bool isOrdinal = true;
+  static const bool isComparable = true;
+  static const bool hasMachineParameters = false;
+  // Not defined: eps(), sfmin(), base(), prec(), t(), rnd(), emin(), rmin(), emax(), rmax()
+  static inline magnitudeType magnitude(unsigned short int a) { return static_cast<unsigned short int>(std::fabs(static_cast<double>(a))); }
+  static inline unsigned short int zero()  { return 0; }
+  static inline unsigned short int one()   { return 1; }
+  static inline unsigned short int conjugate(unsigned short int x) { return x; }
+  static inline unsigned short int real(unsigned short int x) { return x; }
+  static inline unsigned short int imag(unsigned short int) { return 0; }
+  static inline bool isnaninf(unsigned short int) { return false; }
+  static inline void seedrandom(unsigned int s) {
+    std::srand(s);
+#ifdef __APPLE__
+    // throw away first random number to address bug 3655
+    // http://software.sandia.gov/bugzilla/show_bug.cgi?id=3655
+    random();
+#endif
+  }
+  //static inline int random() { return (-1 + 2*rand()); }  // RAB: This version should be used to be consistent with others
+  static inline unsigned short int random() { return std::rand(); }             // RAB: This version should be used for an unsigned int, not int
+  static inline std::string name() { return "unsigned short int"; }
+  static inline unsigned short int squareroot(unsigned short int x) { return (unsigned short int) std::sqrt((double) x); }
+  static inline unsigned short int pow(unsigned short int x, unsigned short int y) { return (unsigned short int) std::pow((double)x,(double)y); }
+  static inline unsigned short int log(unsigned short int x) { return static_cast<unsigned short int> (std::log (static_cast<double> (x))); }
+  static inline unsigned short int log10(unsigned short int x) { return static_cast<unsigned short int> (std::log10 (static_cast<double> (x))); }
+};
+
+
+template<>
+struct ScalarTraits<int>
+{
+  typedef int magnitudeType;
+  typedef int halfPrecision;
+  typedef int doublePrecision;
+  typedef int coordinateType;
+  static const bool isComplex = false;
+  static const bool isOrdinal = true;
+  static const bool isComparable = true;
+  static const bool hasMachineParameters = false;
+  // Not defined: eps(), sfmin(), base(), prec(), t(), rnd(), emin(), rmin(), emax(), rmax()
+  static inline magnitudeType magnitude(int a) { return static_cast<int>(std::fabs(static_cast<double>(a))); }
+  static inline int zero()  { return 0; }
+  static inline int one()   { return 1; }
+  static inline int conjugate(int x) { return x; }
+  static inline int real(int x) { return x; }
+  static inline int imag(int) { return 0; }
+  static inline bool isnaninf(int) { return false; }
+  static inline void seedrandom(unsigned int s) {
+    std::srand(s);
+#ifdef __APPLE__
+    // throw away first random number to address bug 3655
+    // http://software.sandia.gov/bugzilla/show_bug.cgi?id=3655
+    random();
+#endif
+  }
+  //static inline int random() { return (-1 + 2*rand()); }  // RAB: This version should be used to be consistent with others
+  static inline int random() { return std::rand(); }             // RAB: This version should be used for an unsigned int, not int
+  static inline std::string name() { return "int"; }
+  static inline int squareroot(int x) { return (int) std::sqrt((double) x); }
+  static inline int pow(int x, int y) { return (int) std::pow((double)x,(double)y); }
+  static inline int log(int x) { return static_cast<int> (std::log (static_cast<double> (x))); }
+  static inline int log10(int x) { return static_cast<int> (std::log10 (static_cast<double> (x))); }
+};
+
+
+template<>
+struct ScalarTraits<unsigned int>
+{
+  typedef unsigned int magnitudeType;
+  typedef unsigned int halfPrecision;
+  typedef unsigned int doublePrecision;
+  typedef unsigned int coordinateType;
+  static const bool isComplex = false;
+  static const bool isOrdinal = true;
+  static const bool isComparable = true;
+  static const bool hasMachineParameters = false;
+  // Not defined: eps(), sfmin(), base(), prec(), t(), rnd(), emin(), rmin(), emax(), rmax()
+  static inline magnitudeType magnitude(unsigned int a) { return static_cast<unsigned int>(std::fabs(static_cast<double>(a))); }
+  static inline unsigned int zero()  { return 0; }
+  static inline unsigned int one()   { return 1; }
+  static inline unsigned int conjugate(unsigned int x) { return x; }
+  static inline unsigned int real(unsigned int x) { return x; }
+  static inline unsigned int imag(unsigned int) { return 0; }
+  static inline bool isnaninf(unsigned int) { return false; }
+  static inline void seedrandom(unsigned int s) {
+    std::srand(s);
+#ifdef __APPLE__
+    // throw away first random number to address bug 3655
+    // http://software.sandia.gov/bugzilla/show_bug.cgi?id=3655
+    random();
+#endif
+  }
+  //static inline int random() { return (-1 + 2*rand()); }  // RAB: This version should be used to be consistent with others
+  static inline unsigned int random() { return std::rand(); }             // RAB: This version should be used for an unsigned int, not int
+  static inline std::string name() { return "unsigned int"; }
+  static inline unsigned int squareroot(unsigned int x) { return (unsigned int) std::sqrt((double) x); }
+  static inline unsigned int pow(unsigned int x, unsigned int y) { return (unsigned int) std::pow((double)x,(double)y); }
+  static inline unsigned int log(unsigned int x) { return static_cast<unsigned int> (std::log (static_cast<double> (x))); }
+  static inline unsigned int log10(unsigned int x) { return static_cast<unsigned int> (std::log10 (static_cast<double> (x))); }
+};
+
+
+template<>
+struct ScalarTraits<long int>
+{
+  typedef long int magnitudeType;
+  typedef long int halfPrecision;
+  typedef long int doublePrecision;
+  typedef long int coordinateType;
+  static const bool isComplex = false;
+  static const bool isOrdinal = true;
+  static const bool isComparable = true;
+  static const bool hasMachineParameters = false;
+  // Not defined: eps(), sfmin(), base(), prec(), t(), rnd(), emin(), rmin(), emax(), rmax()
+  static inline magnitudeType magnitude(long int a) { return static_cast<long int>(std::fabs(static_cast<double>(a))); }
+  static inline long int zero()  { return 0; }
+  static inline long int one()   { return 1; }
+  static inline long int conjugate(long int x) { return x; }
+  static inline long int real(long int x) { return x; }
+  static inline long int imag(long int) { return 0; }
+  static inline bool isnaninf(long int) { return false; }
+  static inline void seedrandom(unsigned int s) {
+    std::srand(s);
+#ifdef __APPLE__
+    // throw away first random number to address bug 3655
+    // http://software.sandia.gov/bugzilla/show_bug.cgi?id=3655
+    random();
+#endif
+  }
+  //static inline int random() { return (-1 + 2*rand()); }  // RAB: This version should be used to be consistent with others
+  static inline long int random() { return std::rand(); }             // RAB: This version should be used for an unsigned int, not int
+  static inline std::string name() { return "long int"; }
+  static inline long int squareroot(long int x) { return (long int) std::sqrt((double) x); }
+  static inline long int pow(long int x, long int y) { return (long int) std::pow((double)x,(double)y); }
+  // Note: Depending on the number of bits in long int, the cast from
+  // long int to double may not be exact.
+  static inline long int log(long int x) { return static_cast<long int> (std::log (static_cast<double> (x))); }
+  static inline long int log10(long int x) { return static_cast<long int> (std::log10 (static_cast<double> (x))); }
+};
+
+
+template<>
+struct ScalarTraits<long unsigned int>
+{
+  typedef long unsigned int magnitudeType;
+  typedef long unsigned int halfPrecision;
+  typedef long unsigned int doublePrecision;
+  typedef long unsigned int coordinateType;
+  static const bool isComplex = false;
+  static const bool isOrdinal = true;
+  static const bool isComparable = true;
+  static const bool hasMachineParameters = false;
+  // Not defined: eps(), sfmin(), base(), prec(), t(), rnd(), emin(), rmin(), emax(), rmax()
+  static inline magnitudeType magnitude(long unsigned int a) { return static_cast<long unsigned int>(std::fabs(static_cast<double>(a))); }
+  static inline long unsigned int zero()  { return 0; }
+  static inline long unsigned int one()   { return 1; }
+  static inline long unsigned int conjugate(long unsigned int x) { return x; }
+  static inline long unsigned int real(long unsigned int x) { return x; }
+  static inline long unsigned int imag(long unsigned int) { return 0; }
+  static inline bool isnaninf(long unsigned int) { return false; }
+  static inline void seedrandom(unsigned int s) {
+    std::srand(s);
+#ifdef __APPLE__
+    // throw away first random number to address bug 3655
+    // http://software.sandia.gov/bugzilla/show_bug.cgi?id=3655
+    random();
+#endif
+  }
+  //static inline int random() { return (-1 + 2*rand()); }  // RAB: This version should be used to be consistent with others
+  static inline long unsigned int random() { return std::rand(); }             // RAB: This version should be used for an unsigned int, not int
+  static inline std::string name() { return "long unsigned int"; }
+  static inline long unsigned int squareroot(long unsigned int x) { return (long unsigned int) std::sqrt((double) x); }
+  static inline long unsigned int pow(long unsigned int x, long unsigned int y) { return (long unsigned int) std::pow((double)x,(double)y); }
+  // Note: Depending on the number of bits in long unsigned int, the
+  // cast from long unsigned int to double may not be exact.
+  static inline long unsigned int log(long unsigned int x) { return static_cast<long unsigned int> (std::log (static_cast<double> (x))); }
+  static inline long unsigned int log10(long unsigned int x) { return static_cast<long unsigned int> (std::log10 (static_cast<double> (x))); }
+};
+
+
+template<>
+struct ScalarTraits<long long int>
+{
+  typedef long long int magnitudeType;
+  typedef long long int halfPrecision;
+  typedef long long int doublePrecision;
+  typedef long long int coordinateType;
+  static const bool isComplex = false;
+  static const bool isOrdinal = true;
+  static const bool isComparable = true;
+  static const bool hasMachineParameters = false;
+  // Not defined: eps(), sfmin(), base(), prec(), t(), rnd(), emin(), rmin(), emax(), rmax()
+  static inline magnitudeType magnitude(long long int a) { return static_cast<long long int>(std::fabs(static_cast<double>(a))); }
+  static inline long long int zero()  { return 0; }
+  static inline long long int one()   { return 1; }
+  static inline long long int conjugate(long long int x) { return x; }
+  static inline long long int real(long long int x) { return x; }
+  static inline long long int imag(long long int) { return 0; }
+  static inline bool isnaninf(long long int) { return false; }
+  static inline void seedrandom(unsigned int s) {
+    std::srand(s);
+#ifdef __APPLE__
+    // throw away first random number to address bug 3655
+    // http://software.sandia.gov/bugzilla/show_bug.cgi?id=3655
+    random();
+#endif
+  }
+  //static inline int random() { return (-1 + 2*rand()); }  // RAB: This version should be used to be consistent with others
+  static inline long long int random() { return std::rand(); }             // RAB: This version should be used for an unsigned int, not int
+  static inline std::string name() { return "long long int"; }
+  static inline long long int squareroot(long long int x) { return (long long int) std::sqrt((double) x); }
+  static inline long long int pow(long long int x, long long int y) { return (long long int) std::pow((double)x,(double)y); }
+  // Note: Depending on the number of bits in long long int, the cast
+  // from long long int to double may not be exact.
+  static inline long long int log(long long int x) { return static_cast<long long int> (std::log (static_cast<double> (x))); }
+  static inline long long int log10(long long int x) { return static_cast<long long int> (std::log10 (static_cast<double> (x))); }
+};
+
+template<>
+struct ScalarTraits<unsigned long long int>
+{
+  typedef unsigned long long int magnitudeType;
+  typedef unsigned long long int halfPrecision;
+  typedef unsigned long long int doublePrecision;
+  typedef unsigned long long int coordinateType;
+  static const bool isComplex = false;
+  static const bool isOrdinal = true;
+  static const bool isComparable = true;
+  static const bool hasMachineParameters = false;
+  // Not defined: eps(), sfmin(), base(), prec(), t(), rnd(), emin(), rmin(), emax(), rmax()
+  static inline magnitudeType magnitude(unsigned long long int a) { return static_cast<unsigned long long int>(std::fabs(static_cast<double>(a))); }
+  static inline unsigned long long int zero()  { return 0; }
+  static inline unsigned long long int one()   { return 1; }
+  static inline unsigned long long int conjugate(unsigned long long int x) { return x; }
+  static inline unsigned long long int real(unsigned long long int x) { return x; }
+  static inline unsigned long long int imag(unsigned long long int) { return 0; }
+  static inline bool isnaninf(unsigned long long int) { return false; }
+  static inline void seedrandom(unsigned int s) {
+    std::srand(s);
+#ifdef __APPLE__
+    // throw away first random number to address bug 3655
+    // http://software.sandia.gov/bugzilla/show_bug.cgi?id=3655
+    random();
+#endif
+  }
+  //static inline int random() { return (-1 + 2*rand()); }  // RAB: This version should be used to be consistent with others
+  static inline unsigned long long int random() { return std::rand(); }             // RAB: This version should be used for an unsigned int, not int
+  static inline std::string name() { return "unsigned long long int"; }
+  static inline unsigned long long int squareroot(unsigned long long int x) { return (unsigned long long int) std::sqrt((double) x); }
+  static inline unsigned long long int pow(unsigned long long int x, unsigned long long int y) { return (unsigned long long int) std::pow((double)x,(double)y); }
+  // Note: Depending on the number of bits in unsigned long long int,
+  // the cast from unsigned long long int to double may not be exact.
+  static inline unsigned long long int log(unsigned long long int x) { return static_cast<unsigned long long int> (std::log (static_cast<double> (x))); }
+  static inline unsigned long long int log10(unsigned long long int x) { return static_cast<unsigned long long int> (std::log10 (static_cast<double> (x))); }
+};
+
+
+#ifdef HAVE_TEUCHOS___INT64
+
+template<>
+struct ScalarTraits<__int64>
+{
+  typedef __int64 magnitudeType;
+  typedef __int64 halfPrecision;
+  typedef __int64 doublePrecision;
+  typedef __int64 coordinateType;
+  static const bool isComplex = false;
+  static const bool isOrdinal = true;
+  static const bool isComparable = true;
+  static const bool hasMachineParameters = false;
+  // Not defined: eps(), sfmin(), base(), prec(), t(), rnd(), emin(), rmin(), emax(), rmax()
+  static inline magnitudeType magnitude(__int64 a) { return static_cast<__int64>(std::fabs(static_cast<double>(a))); }
+  static inline __int64 zero()  { return 0; }
+  static inline __int64 one()   { return 1; }
+  static inline __int64 conjugate(__int64 x) { return x; }
+  static inline __int64 real(__int64 x) { return x; }
+  static inline __int64 imag(__int64) { return 0; }
+  static inline void seedrandom(unsigned int s) {
+    std::srand(s);
+#ifdef __APPLE__
+    // throw away first random number to address bug 3655
+    // http://software.sandia.gov/bugzilla/show_bug.cgi?id=3655
+    random();
+#endif
+  }
+  //static inline int random() { return (-1 + 2*rand()); }  // RAB: This version should be used to be consistent with others
+  static inline __int64 random() { return std::rand(); }             // RAB: This version should be used for an unsigned int, not int
+  static inline std::string name() { return "__int64"; }
+  static inline __int64 squareroot(__int64 x) { return (__int64) std::sqrt((double) x); }
+  static inline __int64 pow(__int64 x, __int64 y) { return (__int64) std::pow((double)x,(double)y); }
+  // Note: Depending on the number of bits in __int64, the cast
+  // from __int64 to double may not be exact.
+  static inline __int64 log(__int64 x) { return static_cast<__int64> (std::log (static_cast<double> (x))); }
+  static inline __int64 log10(__int64 x) { return static_cast<__int64> (std::log10 (static_cast<double> (x))); }
+};
+
+template<>
+struct ScalarTraits<unsigned __int64>
+{
+  typedef unsigned __int64 magnitudeType;
+  typedef unsigned __int64 halfPrecision;
+  typedef unsigned __int64 doublePrecision;
+  typedef unsigned __int64 coordinateType;
+  static const bool isComplex = false;
+  static const bool isOrdinal = true;
+  static const bool isComparable = true;
+  static const bool hasMachineParameters = false;
+  // Not defined: eps(), sfmin(), base(), prec(), t(), rnd(), emin(), rmin(), emax(), rmax()
+  static inline magnitudeType magnitude(unsigned __int64 a) { return static_cast<unsigned __int64>(std::fabs(static_cast<double>(a))); }
+  static inline unsigned __int64 zero()  { return 0; }
+  static inline unsigned __int64 one()   { return 1; }
+  static inline unsigned __int64 conjugate(unsigned __int64 x) { return x; }
+  static inline unsigned __int64 real(unsigned __int64 x) { return x; }
+  static inline unsigned __int64 imag(unsigned __int64) { return 0; }
+  static inline void seedrandom(unsigned int s) {
+    std::srand(s);
+#ifdef __APPLE__
+    // throw away first random number to address bug 3655
+    // http://software.sandia.gov/bugzilla/show_bug.cgi?id=3655
+    random();
+#endif
+  }
+  //static inline int random() { return (-1 + 2*rand()); }  // RAB: This version should be used to be consistent with others
+  static inline unsigned __int64 random() { return std::rand(); }             // RAB: This version should be used for an unsigned int, not int
+  static inline std::string name() { return "unsigned __int64"; }
+  static inline unsigned __int64 squareroot(unsigned __int64 x) { return (unsigned __int64) std::sqrt((double) x); }
+  static inline unsigned __int64 pow(unsigned __int64 x, unsigned __int64 y) { return (unsigned __int64) std::pow((double)x,(double)y); }
+  // Note: Depending on the number of bits in unsigned __int64,
+  // the cast from unsigned __int64 to double may not be exact.
+  static inline unsigned __int64 log(unsigned __int64 x) { return static_cast<unsigned __int64> (std::log (static_cast<double> (x))); }
+  static inline unsigned __int64 log10(unsigned __int64 x) { return static_cast<unsigned __int64> (std::log10 (static_cast<double> (x))); }
+};
+
+#endif // HAVE_TEUCHOS___INT64
+
+
+#ifndef __sun
+extern TEUCHOSCORE_LIB_DLL_EXPORT const float flt_nan;
+#endif
+
+
+template<>
+struct ScalarTraits<float>
+{
+  typedef float magnitudeType;
+  typedef float halfPrecision; // should become IEEE754-2008 binary16 or fp16 later, perhaps specified at configure according to architectural support
+  typedef double doublePrecision;
+  typedef float coordinateType;
+  static const bool isComplex = false;
+  static const bool isOrdinal = false;
+  static const bool isComparable = true;
+  static const bool hasMachineParameters = true;
+  static inline float eps()   {
+    return std::numeric_limits<float>::epsilon();
+  }
+  static inline float sfmin() {
+    return std::numeric_limits<float>::min();
+  }
+  static inline float base()  {
+    return static_cast<float>(std::numeric_limits<float>::radix);
+  }
+  static inline float prec()  {
+    return eps()*base();
+  }
+  static inline float t()     {
+    return static_cast<float>(std::numeric_limits<float>::digits);
+  }
+  static inline float rnd()   {
+    return ( std::numeric_limits<float>::round_style == std::round_to_nearest ? one() : zero() );
+  }
+  static inline float emin()  {
+    return static_cast<float>(std::numeric_limits<float>::min_exponent);
+  }
+  static inline float rmin()  {
+    return std::numeric_limits<float>::min();
+  }
+  static inline float emax()  {
+    return static_cast<float>(std::numeric_limits<float>::max_exponent);
+  }
+  static inline float rmax()  {
+    return std::numeric_limits<float>::max();
+  }
+  static inline magnitudeType magnitude(float a)
+    {
+#ifdef TEUCHOS_DEBUG
+      TEUCHOS_SCALAR_TRAITS_NAN_INF_ERR(
+        a, "Error, the input value to magnitude(...) a = " << a << " can not be NaN!" );
+#endif
+      return std::fabs(a);
+    }
+  static inline float zero()  { return(0.0f); }
+  static inline float one()   { return(1.0f); }
+  static inline float conjugate(float x)   { return(x); }
+  static inline float real(float x) { return x; }
+  static inline float imag(float) { return zero(); }
+  static inline float nan() {
+#ifdef __sun
+    return 0.0f/std::sin(0.0f);
+#else
+    return std::numeric_limits<float>::quiet_NaN();
+#endif
+  }
+  static inline bool isnaninf(float x) {
+    return generic_real_isnaninf<float>(x);
+  }
+  static inline void seedrandom(unsigned int s) {
+    std::srand(s);
+#ifdef __APPLE__
+    // throw away first random number to address bug 3655
+    // http://software.sandia.gov/bugzilla/show_bug.cgi?id=3655
+    random();
+#endif
+  }
+  static inline float random() { float rnd = (float) std::rand() / static_cast<float>(RAND_MAX); return (-1.0f + 2.0f * rnd); }
+  static inline std::string name() { return "float"; }
+  static inline float squareroot(float x)
+    {
+#ifdef TEUCHOS_DEBUG
+      TEUCHOS_SCALAR_TRAITS_NAN_INF_ERR(
+        x, "Error, the input value to squareroot(...) x = " << x << " can not be NaN!" );
+#endif
+      errno = 0;
+      const float rtn = std::sqrt(x);
+      if (errno)
+        return nan();
+      return rtn;
+    }
+  static inline float pow(float x, float y) { return std::pow(x,y); }
+  static inline float pi() { return 3.14159265358979323846f; }
+  static inline float log(float x) { return std::log(x); }
+  static inline float log10(float x) { return std::log10(x); }
+};
+
+
+#ifndef __sun
+extern TEUCHOSCORE_LIB_DLL_EXPORT const double dbl_nan;
+#endif
+
+
+template<>
+struct ScalarTraits<double>
+{
+  typedef double magnitudeType;
+  typedef float halfPrecision;
+  /* there are different options as to how to double "double"
+     - QD's DD (if available)
+     - ARPREC
+     - GNU MP
+     - a true hardware quad
+
+     in the shortterm, this should be specified at configure time. I have inserted a configure-time option (--enable-teuchos-double-to-dd)
+     which uses QD's DD when available. This must be used alongside --enable-teuchos-qd.
+   */
+#if defined(HAVE_TEUCHOS_DOUBLE_TO_QD)
+  typedef dd_real doublePrecision;
+#elif defined(HAVE_TEUCHOS_DOUBLE_TO_ARPREC)
+  typedef mp_real doublePrecision;
+#else
+  typedef double doublePrecision;     // don't double "double" in this case
+#endif
+  typedef double coordinateType;
+  static const bool isComplex = false;
+  static const bool isOrdinal = false;
+  static const bool isComparable = true;
+  static const bool hasMachineParameters = true;
+  static inline double eps()   {
+    return std::numeric_limits<double>::epsilon();
+  }
+  static inline double sfmin() {
+    return std::numeric_limits<double>::min();
+  }
+  static inline double base()  {
+    return std::numeric_limits<double>::radix;
+  }
+  static inline double prec()  {
+    return eps()*base();
+  }
+  static inline double t()     {
+    return std::numeric_limits<double>::digits;
+  }
+  static inline double rnd()   {
+    return ( std::numeric_limits<double>::round_style == std::round_to_nearest ? double(1.0) : double(0.0) );
+  }
+  static inline double emin()  {
+    return std::numeric_limits<double>::min_exponent;
+  }
+  static inline double rmin()  {
+    return std::numeric_limits<double>::min();
+  }
+  static inline double emax()  {
+    return std::numeric_limits<double>::max_exponent;
+  }
+  static inline double rmax()  {
+    return std::numeric_limits<double>::max();
+  }
+  static inline magnitudeType magnitude(double a)
+    {
+#ifdef TEUCHOS_DEBUG
+      TEUCHOS_SCALAR_TRAITS_NAN_INF_ERR(
+        a, "Error, the input value to magnitude(...) a = " << a << " can not be NaN!" );
+#endif
+      return std::fabs(a);
+    }
+  static inline double zero()  { return 0.0; }
+  static inline double one()   { return 1.0; }
+  static inline double conjugate(double x)   { return(x); }
+  static inline double real(double x) { return(x); }
+  static inline double imag(double) { return(0); }
+  static inline double nan() {
+#ifdef __sun
+    return 0.0/std::sin(0.0);
+#else
+    return std::numeric_limits<double>::quiet_NaN();
+#endif
+  }
+  static inline bool isnaninf(double x) {
+    return generic_real_isnaninf<double>(x);
+  }
+  static inline void seedrandom(unsigned int s) {
+    std::srand(s);
+#ifdef __APPLE__
+    // throw away first random number to address bug 3655
+    // http://software.sandia.gov/bugzilla/show_bug.cgi?id=3655
+    random();
+#endif
+  }
+  static inline double random() { double rnd = (double) std::rand() / RAND_MAX; return (double)(-1.0 + 2.0 * rnd); }
+  static inline std::string name() { return "double"; }
+  static inline double squareroot(double x)
+    {
+#ifdef TEUCHOS_DEBUG
+      TEUCHOS_SCALAR_TRAITS_NAN_INF_ERR(
+        x, "Error, the input value to squareroot(...) x = " << x << " can not be NaN!" );
+#endif
+      errno = 0;
+      const double rtn = std::sqrt(x);
+      if (errno)
+        return nan();
+      return rtn;
+    }
+  static inline double pow(double x, double y) { return std::pow(x,y); }
+  static inline double pi() { return 3.14159265358979323846; }
+  static inline double log(double x) { return std::log(x); }
+  static inline double log10(double x) { return std::log10(x); }
+};
+
+
+#ifdef HAVE_TEUCHOS_LONG_DOUBLE
+template<>
+struct ScalarTraits<long double>
+{
+  typedef long double magnitudeType;
+  typedef double halfPrecision;
+  typedef double doublePrecision;    
+  typedef long double coordinateType;
+  static const bool isComplex = false;
+  static const bool isOrdinal = false;
+  static const bool isComparable = true;
+  static const bool hasMachineParameters = true;
+  static inline long double eps()   {
+    return std::numeric_limits<long double>::epsilon();
+  }
+  static inline long double sfmin() {
+    return std::numeric_limits<long double>::min();
+  }
+  static inline long double base()  {
+    return std::numeric_limits<long double>::radix;
+  }
+  static inline long double prec()  {
+    return eps()*base();
+  }
+  static inline long double t()     {
+    return std::numeric_limits<long double>::digits;
+  }
+  static inline long double rnd()   {
+    return ( std::numeric_limits<long double>::round_style == std::round_to_nearest ? (long double)(1.0) : (long double)(0.0) );
+  }
+  static inline long double emin()  {
+    return std::numeric_limits<long double>::min_exponent;
+  }
+  static inline long double rmin()  {
+    return std::numeric_limits<long double>::min();
+  }
+  static inline long double emax()  {
+    return std::numeric_limits<long double>::max_exponent;
+  }
+  static inline long double rmax()  {
+    return std::numeric_limits<long double>::max();
+  }
+  static inline magnitudeType magnitude(long double a)
+    {
+#ifdef TEUCHOS_DEBUG
+      TEUCHOS_SCALAR_TRAITS_NAN_INF_ERR(
+        a, "Error, the input value to magnitude(...) a = " << a << " can not be NaN!" );
+#endif
+      return std::fabs(a);
+    }
+  static inline long double zero()  { return 0.0; }
+  static inline long double one()   { return 1.0; }
+  static inline long double conjugate(long double x)   { return(x); }
+  static inline long double real(long double x) { return(x); }
+  static inline long double imag(long double) { return(0); }
+  static inline long double nan() {
+#ifdef __sun
+    return 0.0/std::sin(0.0);
+#else
+    return std::numeric_limits<long double>::quiet_NaN();
+#endif
+  }
+  static inline bool isnaninf(long double x) {
+    return generic_real_isnaninf<long double>(x);
+  }
+  static inline void seedrandom(unsigned int s) {
+    std::srand(s);
+#ifdef __APPLE__
+    // throw away first random number to address bug 3655
+    // http://software.sandia.gov/bugzilla/show_bug.cgi?id=3655
+    random();
+#endif
+  }
+  static inline long double random() { long double rnd = (long double) std::rand() / RAND_MAX; return (long double)(-1.0 + 2.0 * rnd); }
+  static inline std::string name() { return "long double"; }
+  static inline long double squareroot(long double x)
+    {
+#ifdef TEUCHOS_DEBUG
+      TEUCHOS_SCALAR_TRAITS_NAN_INF_ERR(
+        x, "Error, the input value to squareroot(...) x = " << x << " can not be NaN!" );
+#endif
+      errno = 0;
+      const long double rtn = std::sqrt(x);
+      if (errno)
+        return nan();
+      return rtn;
+    }
+  static inline long double pow(long double x, long double y) { return std::pow(x,y); }
+  static inline long double pi() { return 3.14159265358979323846264338327950288419716939937510; }
+  static inline long double log(double x) { return std::log(x); }
+  static inline long double log10(double x) { return std::log10(x); }
+};
+#endif
+
+#ifdef HAVE_TEUCHOSCORE_QUADMATH
+
+template<>
+struct ScalarTraits<__float128> {
+  typedef __float128 magnitudeType;
+  // Unfortunately, we can't rely on a standard __float256 type.  It
+  // might make sense to use qd_real, but mixing quadmath and QD might
+  // cause unforeseen issues.
+  typedef __float128 doublePrecision;
+  typedef double halfPrecision;
+  typedef __float128 coordinateType;
+
+  static const bool isComplex = false;
+  static const bool isOrdinal = false;
+  static const bool isComparable = true;
+  static const bool hasMachineParameters = true;
+
+  static __float128 eps () {
+    return FLT128_EPSILON;
+  }
+  static __float128 sfmin () {
+    return FLT128_MIN; // ???
+  }
+  static __float128 base () {
+    return 2;
+  }
+  static __float128 prec () {
+    return eps () * base ();
+  }
+  static __float128 t () {
+    return FLT128_MANT_DIG;
+  }
+  static __float128 rnd () {
+    return 1.0;
+  }
+  static __float128 emin () {
+    return FLT128_MIN_EXP;
+  }
+  static __float128 rmin () {
+    return FLT128_MIN; // ??? // should be base^(emin-1)
+  }
+  static __float128 emax () {
+    return FLT128_MAX_EXP;
+  }
+  static __float128 rmax () {
+    return FLT128_MAX; // ??? // should be (base^emax)*(1-eps)
+  }
+  static magnitudeType magnitude (const __float128& x) {
+    return fabsq (x);
+  }
+  static __float128 zero () {
+    return 0.0;
+  }
+  static __float128 one () {
+    return 1.0;
+  }
+  static __float128 conjugate (const __float128& x) {
+    return x;
+  }
+  static __float128 real (const __float128& x) {
+    return x;
+  }
+  static __float128 imag (const __float128& /* x */) {
+    return 0.0;
+  }
+  static __float128 nan () {
+    return strtoflt128 ("NAN()", NULL); // ???
+  }
+  static bool isnaninf (const __float128& x) {
+    return isinfq (x) || isnanq (x);
+  }
+  static inline void seedrandom (unsigned int s) {
+    std::srand (s);
+#ifdef __APPLE__
+    // throw away first random number to address bug 3655
+    // http://software.sandia.gov/bugzilla/show_bug.cgi?id=3655
+    random ();
+#endif
+  }
+  static __float128 random () {
+    // Half the smallest normalized double, is the scaling factor of
+    // the lower-order term in the double-double representation.
+    const __float128 scalingFactor =
+      static_cast<__float128> (std::numeric_limits<double>::min ()) /
+      static_cast<__float128> (2.0);
+    const __float128 higherOrderTerm =
+      static_cast<__float128> (ScalarTraits<double>::random ());
+    const __float128 lowerOrderTerm =
+      static_cast<__float128> (ScalarTraits<double>::random ()) *
+      scalingFactor;
+    return higherOrderTerm + lowerOrderTerm;
+  }
+  static std::string name () {
+    return "__float128";
+  }
+  static __float128 squareroot (const __float128& x) {
+    return sqrtq (x);
+  }
+  static __float128 pow (const __float128& x, const __float128& y) {
+    return powq (x, y);
+  }
+  static __float128 pi() { return 3.14159265358979323846; }
+  static __float128 log (const __float128& x) {
+    return logq (x);
+  }
+  static __float128 log10 (const __float128& x) {
+    return log10q (x);
+  }
+};
+#endif // HAVE_TEUCHOSCORE_QUADMATH
+
+
+
+#ifdef HAVE_TEUCHOS_QD
+
+bool operator&&(const dd_real &a, const dd_real &b);
+bool operator&&(const qd_real &a, const qd_real &b);
+
+template<>
+struct ScalarTraits<dd_real>
+{
+  typedef dd_real magnitudeType;
+  typedef double halfPrecision;
+  typedef qd_real doublePrecision;
+  typedef dd_real coordinateType;
+  static const bool isComplex = false;
+  static const bool isOrdinal = false;
+  static const bool isComparable = true;
+  static const bool hasMachineParameters = true;
+  static inline dd_real eps()   { return std::numeric_limits<dd_real>::epsilon(); }
+  static inline dd_real sfmin() { return std::numeric_limits<dd_real>::min(); }
+  static inline dd_real base()  { return std::numeric_limits<dd_real>::radix; }
+  static inline dd_real prec()  { return eps()*base(); }
+  static inline dd_real t()     { return std::numeric_limits<dd_real>::digits; }
+  static inline dd_real rnd()   { return ( std::numeric_limits<dd_real>::round_style == std::round_to_nearest ? dd_real(1.0) : dd_real(0.0) ); }
+  static inline dd_real emin()  { return std::numeric_limits<dd_real>::min_exponent; }
+  static inline dd_real rmin()  { return std::numeric_limits<dd_real>::min(); }
+  static inline dd_real emax()  { return std::numeric_limits<dd_real>::max_exponent; }
+  static inline dd_real rmax()  { return std::numeric_limits<dd_real>::max(); }
+  static inline magnitudeType magnitude(dd_real a)
+  {
+#ifdef TEUCHOS_DEBUG
+      TEUCHOS_SCALAR_TRAITS_NAN_INF_ERR(
+        a, "Error, the input value to magnitude(...) a = " << a << " can not be NaN!" );
+#endif
+      return ::abs(a);
+  }
+  static inline dd_real zero()  { return dd_real(0.0); }
+  static inline dd_real one()   { return dd_real(1.0); }
+  static inline dd_real conjugate(dd_real x)   { return(x); }
+  static inline dd_real real(dd_real x) { return x ; }
+  static inline dd_real imag(dd_real) { return zero(); }
+  static inline dd_real nan() { return dd_real::_nan; }
+  static inline bool isnaninf(dd_real x) { return isnan(x) || isinf(x); }
+  static inline void seedrandom(unsigned int s) {
+    // ddrand() uses std::rand(), so the std::srand() is our seed
+    std::srand(s);
+#ifdef __APPLE__
+    // throw away first random number to address bug 3655
+    // http://software.sandia.gov/bugzilla/show_bug.cgi?id=3655
+    random();
+#endif
+  }
+  static inline dd_real random() { return ddrand(); }
+  static inline std::string name() { return "dd_real"; }
+  static inline dd_real squareroot(dd_real x)
+  {
+#ifdef TEUCHOS_DEBUG
+      TEUCHOS_SCALAR_TRAITS_NAN_INF_ERR(
+        x, "Error, the input value to squareroot(...) x = " << x << " can not be NaN!" );
+#endif
+      return ::sqrt(x);
+  }
+  static inline dd_real pow(dd_real x, dd_real y) { return ::pow(x,y); }
+  static inline dd_real pi() { return 3.14159265358979323846; }
+  // dd_real puts its transcendental functions in the global namespace.
+  static inline dd_real log(dd_real x) { return ::log(x); }
+  static inline dd_real log10(dd_real x) { return ::log10(x); }
+};
+
+
+template<>
+struct ScalarTraits<qd_real>
+{
+  typedef qd_real magnitudeType;
+  typedef dd_real halfPrecision;
+  typedef qd_real doublePrecision;
+  typedef qd_real coordinateType;
+  static const bool isComplex = false;
+  static const bool isOrdinal = false;
+  static const bool isComparable = true;
+  static const bool hasMachineParameters = true;
+  static inline qd_real eps()   { return std::numeric_limits<qd_real>::epsilon(); }
+  static inline qd_real sfmin() { return std::numeric_limits<qd_real>::min(); }
+  static inline qd_real base()  { return std::numeric_limits<qd_real>::radix; }
+  static inline qd_real prec()  { return eps()*base(); }
+  static inline qd_real t()     { return std::numeric_limits<qd_real>::digits; }
+  static inline qd_real rnd()   { return ( std::numeric_limits<qd_real>::round_style == std::round_to_nearest ? qd_real(1.0) : qd_real(0.0) ); }
+  static inline qd_real emin()  { return std::numeric_limits<qd_real>::min_exponent; }
+  static inline qd_real rmin()  { return std::numeric_limits<qd_real>::min(); }
+  static inline qd_real emax()  { return std::numeric_limits<qd_real>::max_exponent; }
+  static inline qd_real rmax()  { return std::numeric_limits<qd_real>::max(); }
+  static inline magnitudeType magnitude(qd_real a)
+  {
+#ifdef TEUCHOS_DEBUG
+      TEUCHOS_SCALAR_TRAITS_NAN_INF_ERR(
+        a, "Error, the input value to magnitude(...) a = " << a << " can not be NaN!" );
+#endif
+      return ::abs(a);
+  }
+  static inline qd_real zero()  { return qd_real(0.0); }
+  static inline qd_real one()   { return qd_real(1.0); }
+  static inline qd_real conjugate(qd_real x)   { return(x); }
+  static inline qd_real real(qd_real x) { return x ; }
+  static inline qd_real imag(qd_real) { return zero(); }
+  static inline qd_real nan() { return qd_real::_nan; }
+  static inline bool isnaninf(qd_real x) { return isnan(x) || isinf(x); }
+  static inline void seedrandom(unsigned int s) {
+    // qdrand() uses std::rand(), so the std::srand() is our seed
+    std::srand(s);
+#ifdef __APPLE__
+    // throw away first random number to address bug 3655
+    // http://software.sandia.gov/bugzilla/show_bug.cgi?id=3655
+    random();
+#endif
+  }
+  static inline qd_real random() { return qdrand(); }
+  static inline std::string name() { return "qd_real"; }
+  static inline qd_real squareroot(qd_real x)
+  {
+#ifdef TEUCHOS_DEBUG
+      TEUCHOS_SCALAR_TRAITS_NAN_INF_ERR(
+        x, "Error, the input value to squareroot(...) x = " << x << " can not be NaN!" );
+#endif
+      return ::sqrt(x);
+  }
+  static inline qd_real pow(qd_real x, qd_real y) { return ::pow(x,y); }
+  static inline qd_real pi() { return 3.14159265358979323846; }
+  // qd_real puts its transcendental functions in the global namespace.
+  static inline qd_real log(qd_real x) { return ::log(x); }
+  static inline qd_real log10(qd_real x) { return ::log10(x); }
+};
+
+
+#endif  // HAVE_TEUCHOS_QD
+
+
+#ifdef HAVE_TEUCHOS_GNU_MP
+
+
+extern gmp_randclass gmp_rng;
+
+
+/* Regarding halfPrecision, doublePrecision and mpf_class:
+   Because the precision of an mpf_class float is not determined by the data type,
+   there is no way to fill the typedefs for this object.
+
+   Instead, we could create new data classes (e.g., Teuchos::MPF128, Teuchos::MPF256) for
+   commonly used levels of precision, and fill out ScalarTraits for these. This would allow us
+   to typedef the promotions and demotions in the appropriate way. These classes would serve to
+   wrap an mpf_class object, calling the constructor for the appropriate precision, exposing the
+   arithmetic routines but hiding the precision-altering routines.
+
+   Alternatively (perhaps, preferably), would could create a single class templated on the precision (e.g., Teuchos::MPF<N>).
+   Then we have a single (partially-specialized) implementation of ScalarTraits. This class, as above, must expose all of the
+   operations expected of a scalar type; however, most of these can be trivially stolen from the gmpcxx.h header file
+
+   CGB/RAB, 01/05/2009
+*/
+template<>
+struct ScalarTraits<mpf_class>
+{
+  typedef mpf_class magnitudeType;
+  typedef mpf_class halfPrecision;
+  typedef mpf_class doublePrecision;
+  typedef mpf_class coordinateType;
+  static const bool isComplex = false;
+  static const bool hasMachineParameters = false;
+  // Not defined: eps(), sfmin(), base(), prec(), t(), rnd(), emin(), rmin(), emax(), rmax()
+  static magnitudeType magnitude(mpf_class a) { return std::abs(a); }
+  static inline mpf_class zero() { mpf_class zero = 0.0; return zero; }
+  static inline mpf_class one() { mpf_class one = 1.0; return one; }
+  static inline mpf_class conjugate(mpf_class x) { return x; }
+  static inline mpf_class real(mpf_class x) { return(x); }
+  static inline mpf_class imag(mpf_class x) { return(0); }
+  static inline bool isnaninf(mpf_class x) { return false; } // mpf_class currently can't handle nan or inf!
+  static inline void seedrandom(unsigned int s) {
+    unsigned long int seedVal = static_cast<unsigned long int>(s);
+    gmp_rng.seed( seedVal );
+  }
+  static inline mpf_class random() {
+    return gmp_rng.get_f();
+  }
+  static inline std::string name() { return "mpf_class"; }
+  static inline mpf_class squareroot(mpf_class x) { return std::sqrt(x); }
+  static inline mpf_class pow(mpf_class x, mpf_class y) { return pow(x,y); }
+  // Todo: RAB: 2004/05/28: Add nan() and isnaninf() functions when needed!
+};
+
+#endif  // HAVE_TEUCHOS_GNU_MP
+
+#ifdef HAVE_TEUCHOS_ARPREC
+
+/* See discussion above for mpf_class, regarding halfPrecision and doublePrecision. Something similar will need to be done
+   for ARPREC. */
+template<>
+struct ScalarTraits<mp_real>
+{
+  typedef mp_real magnitudeType;
+  typedef double halfPrecision;
+  typedef mp_real doublePrecision;
+  typedef mp_real coordinateType;
+  static const bool isComplex = false;
+  static const bool isComparable = true;
+  static const bool isOrdinal = false;
+  static const bool hasMachineParameters = false;
+  // Not defined: eps(), sfmin(), base(), prec(), t(), rnd(), emin(), rmin(), emax(), rmax()
+  static magnitudeType magnitude(mp_real a) { return abs(a); }
+  static inline mp_real zero() { mp_real zero = 0.0; return zero; }
+  static inline mp_real one() { mp_real one = 1.0; return one; }
+  static inline mp_real conjugate(mp_real x) { return x; }
+  static inline mp_real real(mp_real x) { return(x); }
+  static inline mp_real imag(mp_real x) { return zero(); }
+  static inline bool isnaninf(mp_real x) { return false; } // ToDo: Change this?
+  static inline void seedrandom(unsigned int s) {
+    long int seedVal = static_cast<long int>(s);
+    srand48(seedVal);
+  }
+  static inline mp_real random() { return mp_rand(); }
+  static inline std::string name() { return "mp_real"; }
+  static inline mp_real squareroot(mp_real x) { return sqrt(x); }
+  static inline mp_real pow(mp_real x, mp_real y) { return pow(x,y); }
+  static inline mp_real pi() { return 3.14159265358979323846; }
+  // Todo: RAB: 2004/05/28: Add nan() and isnaninf() functions when needed!
+};
+
+
+#endif // HAVE_TEUCHOS_ARPREC
+
+
+#ifdef HAVE_TEUCHOS_COMPLEX
+
+
+// Partial specialization for std::complex numbers templated on real type T
+template<class T>
+struct ScalarTraits<
+  std::complex<T>
+>
+{
+  typedef std::complex<T>  ComplexT;
+  typedef std::complex<typename ScalarTraits<T>::halfPrecision> halfPrecision;
+  typedef std::complex<typename ScalarTraits<T>::doublePrecision> doublePrecision;
+  typedef typename ScalarTraits<T>::magnitudeType magnitudeType;
+  typedef typename ScalarTraits<T>::coordinateType coordinateType;
+  static const bool isComplex = true;
+  static const bool isOrdinal = ScalarTraits<T>::isOrdinal;
+  static const bool isComparable = false;
+  static const bool hasMachineParameters = true;
+  static inline magnitudeType eps()          { return ScalarTraits<magnitudeType>::eps(); }
+  static inline magnitudeType sfmin()        { return ScalarTraits<magnitudeType>::sfmin(); }
+  static inline magnitudeType base()         { return ScalarTraits<magnitudeType>::base(); }
+  static inline magnitudeType prec()         { return ScalarTraits<magnitudeType>::prec(); }
+  static inline magnitudeType t()            { return ScalarTraits<magnitudeType>::t(); }
+  static inline magnitudeType rnd()          { return ScalarTraits<magnitudeType>::rnd(); }
+  static inline magnitudeType emin()         { return ScalarTraits<magnitudeType>::emin(); }
+  static inline magnitudeType rmin()         { return ScalarTraits<magnitudeType>::rmin(); }
+  static inline magnitudeType emax()         { return ScalarTraits<magnitudeType>::emax(); }
+  static inline magnitudeType rmax()         { return ScalarTraits<magnitudeType>::rmax(); }
+  static magnitudeType magnitude(ComplexT a)
+    {
+#ifdef TEUCHOS_DEBUG
+      TEUCHOS_SCALAR_TRAITS_NAN_INF_ERR(
+        a, "Error, the input value to magnitude(...) a = " << a << " can not be NaN!" );
+#endif
+      return std::abs(a);
+    }
+  static inline ComplexT zero()              { return ComplexT(ScalarTraits<magnitudeType>::zero(),ScalarTraits<magnitudeType>::zero()); }
+  static inline ComplexT one()               { return ComplexT(ScalarTraits<magnitudeType>::one(),ScalarTraits<magnitudeType>::zero()); }
+  static inline ComplexT conjugate(ComplexT a){ return ComplexT(a.real(),-a.imag()); }
+  static inline magnitudeType real(ComplexT a) { return a.real(); }
+  static inline magnitudeType imag(ComplexT a) { return a.imag(); }
+  static inline ComplexT nan()               { return ComplexT(ScalarTraits<magnitudeType>::nan(),ScalarTraits<magnitudeType>::nan()); }
+  static inline bool isnaninf(ComplexT x)    { return ScalarTraits<magnitudeType>::isnaninf(x.real()) || ScalarTraits<magnitudeType>::isnaninf(x.imag()); }
+  static inline void seedrandom(unsigned int s) { ScalarTraits<magnitudeType>::seedrandom(s); }
+  static inline ComplexT random()
+    {
+      const T rnd1 = ScalarTraits<magnitudeType>::random();
+      const T rnd2 = ScalarTraits<magnitudeType>::random();
+      return ComplexT(rnd1,rnd2);
+    }
+  static inline std::string name() { return std::string("std::complex<")+std::string(ScalarTraits<magnitudeType>::name())+std::string(">"); }
+  // This will only return one of the square roots of x, the other can be obtained by taking its conjugate
+  static inline ComplexT squareroot(ComplexT x)
+    {
+#ifdef TEUCHOS_DEBUG
+      TEUCHOS_SCALAR_TRAITS_NAN_INF_ERR(
+        x, "Error, the input value to squareroot(...) x = " << x << " can not be NaN!" );
+#endif
+      typedef ScalarTraits<magnitudeType>  STMT;
+      const T r  = x.real(), i = x.imag(), zero = STMT::zero(), two = 2.0;
+      const T a  = STMT::squareroot((r*r)+(i*i));
+      const T nr = STMT::squareroot((a+r)/two);
+      const T ni = ( i == zero ? zero : STMT::squareroot((a-r)/two) );
+      return ComplexT(nr,ni);
+    }
+    // 2010/03/19: rabartl: Above, I had to add the check for i == zero
+    // to avoid a returned NaN on the Intel 10.1 compiler.  For some
+    // reason, having these two squareroot calls in a row produce a NaN
+    // in an optimized build with this compiler.  Amazingly, when I put
+    // in print statements (i.e. std::cout << ...) the NaN went away and
+    // the second squareroot((a-r)/two) returned zero correctly.  I
+    // guess that calling the output routine flushed the registers or
+    // something and restarted the squareroot rountine for this compiler
+    // or something similar.  Actually, due to roundoff, it is possible that a-r
+    // might be slightly less than zero (i.e. -1e-16) and that would cause
+    // a possbile NaN return.  THe above if test is the right thing to do
+    // I think and is very cheap.
+  static inline ComplexT pow(ComplexT x, ComplexT y) { return pow(x,y); }
+  static inline ComplexT pi() { return ScalarTraits<T>::pi(); }
+};
+#endif //  HAVE_TEUCHOS_COMPLEX
+
+#ifdef HAVE_TEUCHOSCORE_KOKKOS
+// Partial specialization for Kokkos::complex<T>
+template<class T>
+struct ScalarTraits<
+  Kokkos::complex<T>
+>
+{
+  typedef Kokkos::complex<T>  ComplexT;
+  typedef Kokkos::complex<typename ScalarTraits<T>::halfPrecision> halfPrecision;
+  typedef Kokkos::complex<typename ScalarTraits<T>::doublePrecision> doublePrecision;
+  typedef typename ScalarTraits<T>::magnitudeType magnitudeType;
+  typedef typename ScalarTraits<T>::coordinateType coordinateType;
+  static const bool isComplex = true;
+  static const bool isOrdinal = ScalarTraits<T>::isOrdinal;
+  static const bool isComparable = false;
+  static const bool hasMachineParameters = true;
+  static inline magnitudeType eps()          { return ScalarTraits<magnitudeType>::eps(); }
+  static inline magnitudeType sfmin()        { return ScalarTraits<magnitudeType>::sfmin(); }
+  static inline magnitudeType base()         { return ScalarTraits<magnitudeType>::base(); }
+  static inline magnitudeType prec()         { return ScalarTraits<magnitudeType>::prec(); }
+  static inline magnitudeType t()            { return ScalarTraits<magnitudeType>::t(); }
+  static inline magnitudeType rnd()          { return ScalarTraits<magnitudeType>::rnd(); }
+  static inline magnitudeType emin()         { return ScalarTraits<magnitudeType>::emin(); }
+  static inline magnitudeType rmin()         { return ScalarTraits<magnitudeType>::rmin(); }
+  static inline magnitudeType emax()         { return ScalarTraits<magnitudeType>::emax(); }
+  static inline magnitudeType rmax()         { return ScalarTraits<magnitudeType>::rmax(); }
+  static magnitudeType magnitude(ComplexT a)
+    {
+#ifdef TEUCHOS_DEBUG
+      TEUCHOS_SCALAR_TRAITS_NAN_INF_ERR(
+        a, "Error, the input value to magnitude(...) a = " << a << " can not be NaN!" );
+#endif
+      return std::abs(std::complex<T>(a));
+    }
+  static inline ComplexT zero()              { return ComplexT(ScalarTraits<magnitudeType>::zero(),ScalarTraits<magnitudeType>::zero()); }
+  static inline ComplexT one()               { return ComplexT(ScalarTraits<magnitudeType>::one(),ScalarTraits<magnitudeType>::zero()); }
+  static inline ComplexT conjugate(ComplexT a){ return ComplexT(a.real(),-a.imag()); }
+  static inline magnitudeType real(ComplexT a) { return a.real(); }
+  static inline magnitudeType imag(ComplexT a) { return a.imag(); }
+  static inline ComplexT nan()               { return ComplexT(ScalarTraits<magnitudeType>::nan(),ScalarTraits<magnitudeType>::nan()); }
+  static inline bool isnaninf(ComplexT x)    { return ScalarTraits<magnitudeType>::isnaninf(x.real()) || ScalarTraits<magnitudeType>::isnaninf(x.imag()); }
+  static inline void seedrandom(unsigned int s) { ScalarTraits<magnitudeType>::seedrandom(s); }
+  static inline ComplexT random()
+    {
+      const T rnd1 = ScalarTraits<magnitudeType>::random();
+      const T rnd2 = ScalarTraits<magnitudeType>::random();
+      return ComplexT(rnd1,rnd2);
+    }
+  static inline std::string name() { return std::string("Kokkos::complex<")+std::string(ScalarTraits<magnitudeType>::name())+std::string(">"); }
+  // This will only return one of the square roots of x, the other can be obtained by taking its conjugate
+  static inline ComplexT squareroot(ComplexT x)
+    {
+#ifdef TEUCHOS_DEBUG
+      TEUCHOS_SCALAR_TRAITS_NAN_INF_ERR(
+        x, "Error, the input value to squareroot(...) x = " << x << " can not be NaN!" );
+#endif
+      typedef ScalarTraits<magnitudeType>  STMT;
+      const T r  = x.real(), i = x.imag(), zero = STMT::zero(), two = 2.0;
+      const T a  = STMT::squareroot((r*r)+(i*i));
+      const T nr = STMT::squareroot((a+r)/two);
+      const T ni = ( i == zero ? zero : STMT::squareroot((a-r)/two) );
+      return ComplexT(nr,ni);
+    }
+  static inline ComplexT pow(ComplexT x, ComplexT y) { return pow(std::complex<T>(x), std::complex<T>(y)); }
+  static inline ComplexT pi() { return ScalarTraits<T>::pi(); }
+};
+#endif // HAVE_TEUCHOSCORE_KOKKOS
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+} // Teuchos namespace
+
+#endif // _TEUCHOS_SCALARTRAITS_HPP_
diff --git a/packages/rol/src/compatibility/teuchos-lite/Teuchos_ScalarTraitsDecl.hpp b/packages/rol/src/compatibility/teuchos-lite/Teuchos_ScalarTraitsDecl.hpp
new file mode 100644
index 000000000000..3e52aef0dbdb
--- /dev/null
+++ b/packages/rol/src/compatibility/teuchos-lite/Teuchos_ScalarTraitsDecl.hpp
@@ -0,0 +1,165 @@
+// @HEADER
+// ***********************************************************************
+//
+//                    Teuchos: Common Tools Package
+//                 Copyright (2004) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
+//
+// ***********************************************************************
+// @HEADER
+
+#ifndef _TEUCHOS_SCALARTRAITS_DECL_HPP_
+#define _TEUCHOS_SCALARTRAITS_DECL_HPP_
+
+/*! \file Teuchos_ScalarTraitsDecl.hpp
+    \brief Declaration and default implementation for basic traits for the scalar field type.
+*/
+
+#include "Teuchos_ConfigDefs.hpp"
+
+
+namespace Teuchos {
+
+
+template <typename T>
+struct UndefinedScalarTraits
+{
+  //! This function should not compile if there is an attempt to instantiate!
+  static inline T notDefined() { return T::this_type_is_missing_a_specialization(); }
+};
+
+
+/* This is the default structure used by ScalarTraits<T> to produce a compile time
+	error when the specialization does not exist for type <tt>T</tt>.
+*/
+
+
+/*! \brief This structure defines some basic traits for a scalar field type.
+ *
+ * Scalar traits are an essential part of templated codes.  This structure offers
+ * the basic traits of the templated scalar type, like defining zero and one,
+ * and basic functions on the templated scalar type, like performing a square root.
+ *
+ * The functions in the templated base unspecialized struct are designed not to
+ * compile (giving a nice compile-time error message) and therefore specializations
+ * must be written for Scalar types actually used.
+ *
+ * \note <ol>
+ *
+ * <li> The default defined specializations are provided for \c int, \c float, and \c double.
+ *
+ * <li> If Teuchos is configured with </tt>Teuchos_ENABLE_COMPLEX=ON</tt> then
+ * ScalarTraits also has a partial specialization for all
+ * <tt>std::complex</tt> numbers of the form <tt>std::complex<T></tt>.
+ *
+ * </ol>
+*/
+
+template <typename T>
+struct ScalarTraits
+{
+  //! Mandatory typedef for result of magnitude
+  typedef T magnitudeType;
+  //! Typedef for half precision
+  typedef T halfPrecision;
+  //! Typedef for double precision
+  typedef T doublePrecision;
+  //! Typedef for coordinates
+  typedef T coordinateType;
+  //! Determines if scalar type is std::complex
+  static const bool isComplex = false;
+  //! Determines if scalar type is an ordinal type
+  static const bool isOrdinal = false;
+  //! Determines if scalar type supports relational operators such as <, >, <=, >=.
+  static const bool isComparable = false;
+  /** \brief Determines if scalar type have machine-specific parameters
+   * (i.e. eps(), sfmin(), base(), prec(), t(), rnd(), emin(), rmin(), emax(),
+   * rmax() are supported).
+   */
+  static const bool hasMachineParameters = false;
+  //! Returns relative machine precision.
+  static inline magnitudeType eps()   { return UndefinedScalarTraits<T>::notDefined(); }
+  //! Returns safe minimum (sfmin), such that 1/sfmin does not overflow.
+  static inline magnitudeType sfmin() { return UndefinedScalarTraits<T>::notDefined(); }
+  //! Returns the base of the machine.
+  static inline magnitudeType base()  { return UndefinedScalarTraits<T>::notDefined(); }
+  //! Returns \c eps*base.
+  static inline magnitudeType prec()  { return UndefinedScalarTraits<T>::notDefined(); }
+  //! Returns the number of (base) digits in the mantissa.
+  static inline magnitudeType t()     { return UndefinedScalarTraits<T>::notDefined(); }
+  //! Returns 1.0 when rounding occurs in addition, 0.0 otherwise
+  static inline magnitudeType rnd()   { return UndefinedScalarTraits<T>::notDefined(); }
+  //! Returns the minimum exponent before (gradual) underflow.
+  static inline magnitudeType emin()  { return UndefinedScalarTraits<T>::notDefined(); }
+  //! Returns the underflow threshold - \c base^(emin-1)
+  static inline magnitudeType rmin()  { return UndefinedScalarTraits<T>::notDefined(); }
+  //! Returns the largest exponent before overflow.
+  static inline magnitudeType emax()  { return UndefinedScalarTraits<T>::notDefined(); }
+  //! Overflow theshold - \c (base^emax)*(1-eps)
+  static inline magnitudeType rmax()  { return UndefinedScalarTraits<T>::notDefined(); }
+  //! Returns the magnitudeType of the scalar type \c a.
+  static inline magnitudeType magnitude(T a) { return UndefinedScalarTraits<T>::notDefined(); }
+  //! Returns representation of zero for this scalar type.
+  static inline T zero()                     { return UndefinedScalarTraits<T>::notDefined(); }
+  //! Returns representation of one for this scalar type.
+  static inline T one()                      { return UndefinedScalarTraits<T>::notDefined(); }
+  //! Returns the real part of the scalar type \c a.
+  static inline magnitudeType real(T a) { return UndefinedScalarTraits<T>::notDefined(); }
+  //! Returns the imaginary part of the scalar type \c a.
+  static inline magnitudeType imag(T a) { return UndefinedScalarTraits<T>::notDefined(); }
+  //! Returns the conjugate of the scalar type \c a.
+  static inline T conjugate(T a) { return UndefinedScalarTraits<T>::notDefined(); }
+  //! Returns a number that represents NaN.
+  static inline T nan()                      { return UndefinedScalarTraits<T>::notDefined(); }
+  //! Returns <tt>true</tt> if <tt>x</tt> is NaN or Inf.
+  static inline bool isnaninf(const T& x)     { return UndefinedScalarTraits<T>::notDefined(); }
+  //! Seed the random number generator returned by <tt>random()</tt>.
+  static inline void seedrandom(unsigned int s) { int i; T t = &i; }
+  //! Returns a random number (between -one() and +one()) of this scalar type.
+  static inline T random()                   { return UndefinedScalarTraits<T>::notDefined(); }
+  //! Returns the name of this scalar type.
+  static inline std::string name()           { (void)UndefinedScalarTraits<T>::notDefined(); return 0; }
+  //! Returns a number of magnitudeType that is the square root of this scalar type \c x.
+  static inline T squareroot(T x) { return UndefinedScalarTraits<T>::notDefined(); }
+  //! Returns the result of raising one scalar \c x to the power \c y.
+  static inline T pow(T x, T y) { return UndefinedScalarTraits<T>::notDefined(); }
+  //! Returns the value of PI.
+  static inline T pi() { return UndefinedScalarTraits<T>::notDefined();}
+};
+
+
+} // Teuchos namespace
+
+
+#endif // _TEUCHOS_SCALARTRAITS_DECL_HPP_
diff --git a/packages/rol/src/compatibility/teuchos-lite/Teuchos_SerialDenseMatrix.hpp b/packages/rol/src/compatibility/teuchos-lite/Teuchos_SerialDenseMatrix.hpp
new file mode 100644
index 000000000000..17029d3d6232
--- /dev/null
+++ b/packages/rol/src/compatibility/teuchos-lite/Teuchos_SerialDenseMatrix.hpp
@@ -0,0 +1,1114 @@
+// @HEADER
+// ***********************************************************************
+//
+//                    Teuchos: Common Tools Package
+//                 Copyright (2004) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// ***********************************************************************
+// @HEADER
+
+#ifndef _TEUCHOS_SERIALDENSEMATRIX_HPP_
+#define _TEUCHOS_SERIALDENSEMATRIX_HPP_
+/*! \file Teuchos_SerialDenseMatrix.hpp
+  \brief Templated serial dense matrix class
+*/
+
+#include "Teuchos_CompObject.hpp"
+#include "Teuchos_BLAS.hpp"
+#include "Teuchos_ScalarTraits.hpp"
+#include "Teuchos_DataAccess.hpp"
+#include "Teuchos_ConfigDefs.hpp"
+#include "Teuchos_Assert.hpp"
+#include "Teuchos_SerialSymDenseMatrix.hpp"
+#include <cstddef>
+#include <utility>
+
+/*!     \class Teuchos::SerialDenseMatrix
+        \brief This class creates and provides basic support for dense rectangular matrix of templated type.
+*/
+/** \example DenseMatrix/cxx_main.cpp
+    This is an example of how to use the Teuchos::SerialDenseMatrix class.
+*/
+
+
+namespace Teuchos {
+
+template<typename OrdinalType, typename ScalarType>
+class SerialDenseMatrix : public CompObject, public BLAS<OrdinalType, ScalarType>
+{
+public:
+
+  //! Typedef for ordinal type
+  typedef OrdinalType ordinalType;
+  //! Typedef for scalar type
+  typedef ScalarType scalarType;
+
+  //! @name Constructor/Destructor methods.
+  //@{
+
+  //! Default Constructor
+  /*! Creates a empty matrix of no dimension.  The Shaping methods should be used to size this matrix.
+    Values of this matrix should be set using the [], (), or = operators.
+  */
+  SerialDenseMatrix() = default;
+
+  //! Shaped Constructor
+  /*!
+    \param numRows - Number of rows in matrix.
+    \param numCols - Number of columns in matrix.
+    \param zeroOut - Initializes values to 0 if true (default)
+
+    Creates a shaped matrix with \c numRows rows and \c numCols cols.  All values are initialized to 0 when \c zeroOut is true.
+    Values of this matrix should be set using the [] or the () operators.
+  */
+  SerialDenseMatrix(OrdinalType numRows, OrdinalType numCols, bool zeroOut = true);
+
+  //! Shaped Constructor with Values
+  /*!
+    \param CV - Enumerated type set to Teuchos::Copy or Teuchos::View.
+    \param values - Pointer to an array of ScalarType.  The first column starts at \c values,
+                the second at \c values+stride, etc.
+    \param stride - The stride between the columns of the matrix in memory.
+    \param numRows - Number of rows in matrix.
+    \param numCols - Number of columns in matrix.
+  */
+  SerialDenseMatrix(DataAccess CV, ScalarType* values, OrdinalType stride, OrdinalType numRows, OrdinalType numCols);
+
+  //! Copy Constructor
+  /*! \note A deep copy of the \c Source transposed can be obtained if \c trans=Teuchos::TRANS, \c else
+    a non-transposed copy of \c Source is made.  There is no storage of the transpose state of the matrix
+    within the SerialDenseMatrix class, so this information will not propogate to any operation performed
+    on a matrix that has been copy constructed in transpose.
+  */
+  SerialDenseMatrix(const SerialDenseMatrix<OrdinalType, ScalarType> &Source, ETransp trans = Teuchos::NO_TRANS);
+
+  //! Copy Constructor
+  /*! \note Only a non-transposed deep copy or view of \c Source is made with this copy constructor.
+  */
+  SerialDenseMatrix(DataAccess CV, const SerialDenseMatrix<OrdinalType, ScalarType> &Source);
+
+  //! Submatrix Copy Constructor
+  /*!
+    \param CV - Enumerated type set to Teuchos::Copy or Teuchos::View.
+    \param Source - Reference to another dense matrix from which values are to be copied.
+    \param numRows - The number of rows in this matrix.
+    \param numCols - The number of columns in this matrix.
+    \param startRow - The row of \c Source from which the submatrix copy should start.
+    \param startCol - The column of \c Source from which the submatrix copy should start.
+
+    Creates a shaped matrix with \c numRows rows and \c numCols columns, which is a submatrix of \c Source.
+    If \c startRow and \c startCol are not given, then the submatrix is the leading submatrix of \c Source.
+    Otherwise, the (1,1) entry in the copied matrix is the (\c startRow, \c startCol) entry of \c Source.
+  */
+  SerialDenseMatrix(DataAccess CV, const SerialDenseMatrix<OrdinalType, ScalarType> &Source, OrdinalType numRows, OrdinalType numCols, OrdinalType startRow=0, OrdinalType startCol=0);
+
+  //! Destructor
+  virtual ~SerialDenseMatrix();
+  //@}
+
+  //! @name Shaping methods.
+  //@{
+  //! Shape method for changing the size of a SerialDenseMatrix, initializing entries to zero.
+  /*!
+    \param numRows - The number of rows in this matrix.
+    \param numCols - The number of columns in this matrix.
+
+    This method allows the user to define the dimensions of a SerialDenseMatrix at any point.  This method
+    can be called at any point after construction.  Any values previously in this object will be destroyed
+    and the resized matrix starts of with all zero values.
+
+    \return Integer error code, set to 0 if successful.
+  */
+  int shape(OrdinalType numRows, OrdinalType numCols);
+
+  //! Same as <tt>shape()</tt> except leaves uninitialized.
+  int shapeUninitialized(OrdinalType numRows, OrdinalType numCols);
+
+  //! Reshaping method for changing the size of a SerialDenseMatrix, keeping the entries.
+  /*!
+    \param numRows - The number of rows in this matrix.
+    \param numCols - The number of columns in this matrix.
+
+    This method allows the user to redefine the dimensions of a SerialDenseMatrix at any point.  This method
+    can be called at any point after construction.  Any values previously in this object will be copied into
+    the reshaped matrix.
+
+    \return Integer error code, set 0 if successful.
+  */
+  int reshape(OrdinalType numRows, OrdinalType numCols);
+
+
+  //@}
+
+  //! @name Set methods.
+  //@{
+
+  //! Copies values from one matrix to another.
+  /*!
+    The operator= copies the values from one existing SerialDenseMatrix to another.
+    If \c Source is a view (i.e. CV = Teuchos::View), then this method will
+    return a view.  Otherwise, it will return a copy of \c Source.  \e this object
+    will be resized if it is not large enough to copy \c Source into.
+  */
+  SerialDenseMatrix<OrdinalType, ScalarType>& operator= (const SerialDenseMatrix<OrdinalType, ScalarType>& Source);
+
+  //! Copies values from one matrix to another.
+  /*!
+    Copies the values from one existing SerialDenseMatrix to another
+    if the dimension of both matrices are the same.  If not, \e this matrix
+    will be returned unchanged.
+  */
+  SerialDenseMatrix<OrdinalType, ScalarType>& assign (const SerialDenseMatrix<OrdinalType, ScalarType>& Source);
+
+  //! Set all values in the matrix to a constant value.
+  /*!
+    \param value - Value to use;
+  */
+  SerialDenseMatrix<OrdinalType, ScalarType>& operator= (const ScalarType value) { putScalar(value); return(*this); }
+
+  //! Set all values in the matrix to a constant value.
+  /*!
+    \param value - Value to use; zero if none specified.
+    \return Integer error code, set to 0 if successful.
+  */
+  int putScalar( const ScalarType value = Teuchos::ScalarTraits<ScalarType>::zero() );
+
+  //! Swap values between this matrix and incoming matrix.
+  /*!
+    Swaps pointers and associated state without copying the matrix data.
+  */
+  void swap (SerialDenseMatrix<OrdinalType, ScalarType> &B);
+
+  //! Set all values in the matrix to be random numbers.
+  int random();
+
+  //@}
+
+  //! @name Accessor methods.
+  //@{
+
+  //! Element access method (non-const).
+  /*! Returns the element in the ith row and jth column if A(i,j) is specified, the
+    expression A[j][i] will return the same element.
+
+        \return Element from the specified \c rowIndex row and \c colIndex column.
+    \warning The validity of \c rowIndex and \c colIndex will only be checked if Teuchos is
+    configured with --enable-teuchos-abc.
+  */
+  ScalarType& operator () (OrdinalType rowIndex, OrdinalType colIndex);
+
+  //! Element access method (const).
+  /*! Returns the element in the ith row and jth column if A(i,j) is specified, the expression
+    A[j][i] will return the same element.
+
+        \return Element from the specified \c rowIndex row and \c colIndex column.
+    \warning The validity of \c rowIndex and \c colIndex will only be checked if Teuchos is
+    configured with --enable-teuchos-abc.
+  */
+  const ScalarType& operator () (OrdinalType rowIndex, OrdinalType colIndex) const;
+
+  //! Column access method (non-const).
+  /*! Returns the pointer to the ScalarType array at the jth column if A[j] is specified, the expression
+    A[j][i] will return the same element as A(i,j).
+
+    \return Pointer to the ScalarType array at the \c colIndex column ( \c values_+colIndex*stride_ ).
+  */
+  ScalarType* operator [] (OrdinalType colIndex);
+
+  //! Column access method (const).
+  /*! Returns the pointer to the ScalarType array at the jth column if A[j] is specified, the expression
+    A[j][i] will return the same element as A(i,j).
+
+    \return Pointer to the ScalarType array at the \c colIndex column ( \c values_+colIndex*stride_ ).
+  */
+  const ScalarType* operator [] (OrdinalType colIndex) const;
+
+  //! Data array access method.
+  /*! \return Pointer to the ScalarType data array contained in the object. */
+  ScalarType* values() const { return values_; }
+
+  //@}
+
+  //! @name Mathematical methods.
+  //@{
+
+  //! Add another matrix to \e this matrix.
+  /*! Add \c Source to \e this if the dimension of both matrices are the same.  If not, \e this matrix
+    will be returned unchanged.
+  */
+  SerialDenseMatrix<OrdinalType, ScalarType>& operator+= (const SerialDenseMatrix<OrdinalType, ScalarType>& Source);
+
+  //! Subtract another matrix from \e this matrix.
+  /*! Subtract \c Source from \e this if the dimension of both matrices are the same.  If not, \e this matrix
+    will be returned unchanged.
+  */
+  SerialDenseMatrix<OrdinalType, ScalarType>& operator-= (const SerialDenseMatrix<OrdinalType, ScalarType>& Source);
+
+  //! Scale \c this matrix by \c alpha; \c *this = \c alpha*\c *this.
+  /*!
+    \param alpha Scalar to multiply \e this by.
+  */
+  SerialDenseMatrix<OrdinalType, ScalarType>& operator*= (const ScalarType alpha);
+
+  //! Scale \c this matrix by \c alpha; \c *this = \c alpha*\c *this.
+  /*!
+    \param alpha Scalar to multiply \e this by.
+    \return Integer error code, set to 0 if successful.
+  */
+  int scale ( const ScalarType alpha );
+
+  //! Point-wise scale \c this matrix by \c A; i.e. *this(i,j) *= A(i,j)
+  /*! The values of \c *this matrix will be point-wise scaled by the values in A.
+    If A and \c this matrix are not the same dimension \c this will be returned unchanged.
+
+    \param B Teuchos::SerialDenseMatrix used to perform element-wise scaling of \e this.
+    \return Integer error code, set to 0 if successful.
+  */
+  int scale ( const SerialDenseMatrix<OrdinalType, ScalarType>& A );
+
+  //! Multiply \c A * \c B and add them to \e this; \e this = \c beta * \e this + \c alpha*A*B.
+  /*!
+    \param transa - Use the transpose of \c A if transa = Teuchos::TRANS, else don't use the
+    transpose if transa = Teuchos::NO_TRANS.
+    \param transb - Use the transpose of \c B if transb = Teuchos::TRANS, else don't use the
+    transpose if transb = Teuchos::NO_TRANS.
+    \param alpha - The scaling factor for \c A * \c B.
+    \param A - SerialDenseMatrix
+    \param B - SerialDenseMatrix
+    \param beta - The scaling factor for \e this.
+
+    If the matrices \c A and \c B are not of the right dimension, consistent with \e this, then \e this
+    matrix will not be altered and -1 will be returned.
+    \return Integer error code, set to 0 if successful.
+  */
+  int multiply (ETransp transa, ETransp transb, ScalarType alpha, const SerialDenseMatrix<OrdinalType, ScalarType> &A, const SerialDenseMatrix<OrdinalType, ScalarType> &B, ScalarType beta);
+
+  //! Multiply \c A and \c B and add them to \e this; \e this = \c beta * \e this + \c alpha*A*B or \e this = \c beta * \e this + \c alpha*B*A.
+  /*!
+    \param sideA - Which side is A on for the multiplication to B, A*B (Teuchos::LEFT_SIDE) or B*A (Teuchos::RIGHT_SIDE).
+    \param alpha - The scaling factor for \c A * \c B, or \c B * \c A.
+    \param A - SerialSymDenseMatrix (a serial SPD dense matrix)
+    \param B - SerialDenseMatrix (a serial dense matrix)
+    \param beta - The scaling factor for \e this.
+
+    If the matrices \c A and \c B are not of the right dimension, consistent with \e this, then \e this
+    matrix will not be altered and -1 will be returned.
+    \return Integer error code, set to 0 if successful.
+  */
+  int multiply (ESide sideA, ScalarType alpha, const SerialSymDenseMatrix<OrdinalType, ScalarType> &A, const SerialDenseMatrix<OrdinalType, ScalarType> &B, ScalarType beta);
+
+  //@}
+
+  //! @name Comparison methods.
+  //@{
+
+  //! Equality of two matrices.
+  /*! \return True if \e this matrix and \c Operand are of the same shape (rows and columns) and have
+    the same entries, else False will be returned.
+  */
+  bool operator== (const SerialDenseMatrix<OrdinalType, ScalarType> &Operand) const;
+
+  //! Inequality of two matrices.
+  /*! \return True if \e this matrix and \c Operand of not of the same shape (rows and columns) or don't
+    have the same entries, else False will be returned.
+  */
+  bool operator!= (const SerialDenseMatrix<OrdinalType, ScalarType> &Operand) const;
+
+  //@}
+
+  //! @name Attribute methods.
+  //@{
+
+  //! Returns the row dimension of this matrix.
+  OrdinalType numRows() const { return(numRows_); }
+
+  //! Returns the column dimension of this matrix.
+  OrdinalType numCols() const { return(numCols_); }
+
+  //! Returns the stride between the columns of this matrix in memory.
+  OrdinalType stride() const { return(stride_); }
+
+  //! Returns whether this matrix is empty.
+  bool empty() const { return(numRows_ == 0 || numCols_ == 0); }
+  //@}
+
+  //! @name Norm methods.
+  //@{
+
+  //! Returns the 1-norm of the matrix.
+  typename ScalarTraits<ScalarType>::magnitudeType normOne() const;
+
+  //! Returns the Infinity-norm of the matrix.
+  typename ScalarTraits<ScalarType>::magnitudeType normInf() const;
+
+  //! Returns the Frobenius-norm of the matrix.
+  typename ScalarTraits<ScalarType>::magnitudeType normFrobenius() const;
+  //@}
+
+  //! @name I/O methods.
+  //@{
+  //! Print method.  Defines the behavior of the std::ostream << operator
+  virtual std::ostream& print(std::ostream& os) const;
+
+  //@}
+protected:
+  void copyMat(ScalarType* inputMatrix, OrdinalType strideInput,
+    OrdinalType numRows, OrdinalType numCols, ScalarType* outputMatrix,
+    OrdinalType strideOutput, OrdinalType startRow, OrdinalType startCol,
+    ScalarType alpha = ScalarTraits<ScalarType>::zero() );
+  void deleteArrays();
+  void checkIndex( OrdinalType rowIndex, OrdinalType colIndex = 0 ) const;
+
+  static ScalarType*
+  allocateValues(const OrdinalType numRows,
+                 const OrdinalType numCols)
+  {
+    const size_t size = size_t(numRows) * size_t(numCols);
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+    return new ScalarType[size];
+#pragma GCC diagnostic pop
+  }
+
+  OrdinalType numRows_ = 0;
+  OrdinalType numCols_ = 0;
+  OrdinalType stride_ = 0;
+  bool valuesCopied_ = false;
+  ScalarType* values_ = nullptr;
+}; // class Teuchos_SerialDenseMatrix
+
+//----------------------------------------------------------------------------------------------------
+//  Constructors and Destructor
+//----------------------------------------------------------------------------------------------------
+
+template<typename OrdinalType, typename ScalarType>
+SerialDenseMatrix<OrdinalType, ScalarType>::SerialDenseMatrix(
+  OrdinalType numRows_in, OrdinalType numCols_in, bool zeroOut
+  )
+  : numRows_(numRows_in),
+    numCols_(numCols_in),
+    stride_(numRows_in),
+    valuesCopied_(true),
+    values_(allocateValues(numRows_in, numCols_in))
+{
+  if (zeroOut) {
+    putScalar();
+  }
+}
+
+template<typename OrdinalType, typename ScalarType>
+SerialDenseMatrix<OrdinalType, ScalarType>::SerialDenseMatrix(
+  DataAccess CV, ScalarType* values_in, OrdinalType stride_in, OrdinalType numRows_in,
+  OrdinalType numCols_in
+  )
+  : numRows_(numRows_in),
+    numCols_(numCols_in),
+    stride_(stride_in),
+    valuesCopied_(false),
+    values_(values_in)
+{
+  if(CV == Copy)
+  {
+    stride_ = numRows_;
+    values_ = allocateValues(stride_, numCols_);
+    copyMat(values_in, stride_in, numRows_, numCols_, values_, stride_, 0, 0);
+    valuesCopied_ = true;
+  }
+}
+
+template<typename OrdinalType, typename ScalarType>
+SerialDenseMatrix<OrdinalType, ScalarType>::SerialDenseMatrix(const SerialDenseMatrix<OrdinalType, ScalarType> &Source, ETransp trans)
+  : valuesCopied_(true)
+{
+  if ( trans == Teuchos::NO_TRANS )
+  {
+    numRows_ = Source.numRows_;
+    numCols_ = Source.numCols_;
+
+    if (!Source.valuesCopied_)
+    {
+      stride_ = Source.stride_;
+      values_ = Source.values_;
+      valuesCopied_ = false;
+    }
+    else
+    {
+      stride_ = numRows_;
+      if(stride_ > 0 && numCols_ > 0) {
+        values_ = allocateValues(stride_, numCols_);
+        copyMat(Source.values_, Source.stride_, numRows_, numCols_, values_, stride_, 0, 0);
+      }
+      else {
+        numRows_ = 0; numCols_ = 0; stride_ = 0;
+        valuesCopied_ = false;
+      }
+    }
+  }
+  else if ( trans == Teuchos::CONJ_TRANS && ScalarTraits<ScalarType>::isComplex )
+  {
+    numRows_ = Source.numCols_;
+    numCols_ = Source.numRows_;
+    stride_ = numRows_;
+    values_ = allocateValues(stride_, numCols_);
+    for (OrdinalType j=0; j<numCols_; j++) {
+      for (OrdinalType i=0; i<numRows_; i++) {
+        values_[j*stride_ + i] = Teuchos::ScalarTraits<ScalarType>::conjugate(Source.values_[i*Source.stride_ + j]);
+      }
+    }
+  }
+  else
+  {
+    numRows_ = Source.numCols_;
+    numCols_ = Source.numRows_;
+    stride_ = numRows_;
+    values_ = allocateValues(stride_, numCols_);
+    for (OrdinalType j=0; j<numCols_; j++) {
+      for (OrdinalType i=0; i<numRows_; i++) {
+        values_[j*stride_ + i] = Source.values_[i*Source.stride_ + j];
+      }
+    }
+  }
+}
+
+
+template<typename OrdinalType, typename ScalarType>
+SerialDenseMatrix<OrdinalType, ScalarType>::SerialDenseMatrix(
+  DataAccess CV, const SerialDenseMatrix<OrdinalType, ScalarType> &Source
+  )
+  : numRows_(Source.numRows_), numCols_(Source.numCols_), stride_(Source.stride_),
+    valuesCopied_(false), values_(Source.values_)
+{
+  if(CV == Copy)
+  {
+    stride_ = numRows_;
+    values_ = allocateValues(stride_, numCols_);
+    copyMat(Source.values_, Source.stride_, numRows_, numCols_, values_, stride_, 0, 0);
+    valuesCopied_ = true;
+  }
+}
+
+
+template<typename OrdinalType, typename ScalarType>
+SerialDenseMatrix<OrdinalType, ScalarType>::SerialDenseMatrix(
+  DataAccess CV, const SerialDenseMatrix<OrdinalType, ScalarType> &Source,
+  OrdinalType numRows_in, OrdinalType numCols_in, OrdinalType startRow,
+  OrdinalType startCol
+  )
+  : CompObject(), numRows_(numRows_in), numCols_(numCols_in), stride_(Source.stride_),
+    valuesCopied_(false), values_(Source.values_)
+{
+  if(CV == Copy)
+  {
+    stride_ = numRows_in;
+    values_ = allocateValues(stride_, numCols_in);
+    copyMat(Source.values_, Source.stride_, numRows_in, numCols_in, values_, stride_, startRow, startCol);
+    valuesCopied_ = true;
+  }
+  else // CV == View
+  {
+    values_ = values_ + (stride_ * startCol) + startRow;
+  }
+}
+
+template<typename OrdinalType, typename ScalarType>
+SerialDenseMatrix<OrdinalType, ScalarType>::~SerialDenseMatrix()
+{
+  deleteArrays();
+}
+
+//----------------------------------------------------------------------------------------------------
+//  Shape methods
+//----------------------------------------------------------------------------------------------------
+
+template<typename OrdinalType, typename ScalarType>
+int SerialDenseMatrix<OrdinalType, ScalarType>::shape(
+  OrdinalType numRows_in, OrdinalType numCols_in
+  )
+{
+  deleteArrays(); // Get rid of anything that might be already allocated
+  numRows_ = numRows_in;
+  numCols_ = numCols_in;
+  stride_ = numRows_;
+  values_ = allocateValues(stride_, numCols_);
+  putScalar();
+  valuesCopied_ = true;
+  return(0);
+}
+
+template<typename OrdinalType, typename ScalarType>
+int SerialDenseMatrix<OrdinalType, ScalarType>::shapeUninitialized(
+  OrdinalType numRows_in, OrdinalType numCols_in
+  )
+{
+  deleteArrays(); // Get rid of anything that might be already allocated
+  numRows_ = numRows_in;
+  numCols_ = numCols_in;
+  stride_ = numRows_;
+  values_ = allocateValues(stride_, numCols_);
+  valuesCopied_ = true;
+  return(0);
+}
+
+template<typename OrdinalType, typename ScalarType>
+int SerialDenseMatrix<OrdinalType, ScalarType>::reshape(
+  OrdinalType numRows_in, OrdinalType numCols_in
+  )
+{
+  // Allocate space for new matrix
+  ScalarType* values_tmp = allocateValues(numRows_in, numCols_in);
+  ScalarType zero = ScalarTraits<ScalarType>::zero();
+  for(OrdinalType k = 0; k < numRows_in * numCols_in; k++)
+  {
+    values_tmp[k] = zero;
+  }
+  OrdinalType numRows_tmp = TEUCHOS_MIN(numRows_, numRows_in);
+  OrdinalType numCols_tmp = TEUCHOS_MIN(numCols_, numCols_in);
+  if(values_ != 0)
+  {
+    copyMat(values_, stride_, numRows_tmp, numCols_tmp, values_tmp,
+      numRows_in, 0, 0); // Copy principal submatrix of A to new A
+  }
+  deleteArrays(); // Get rid of anything that might be already allocated
+  numRows_ = numRows_in;
+  numCols_ = numCols_in;
+  stride_ = numRows_;
+  values_ = values_tmp; // Set pointer to new A
+  valuesCopied_ = true;
+  return(0);
+}
+
+//----------------------------------------------------------------------------------------------------
+//   Set methods
+//----------------------------------------------------------------------------------------------------
+
+template<typename OrdinalType, typename ScalarType>
+int SerialDenseMatrix<OrdinalType, ScalarType>::putScalar( const ScalarType value_in )
+{
+  // Set each value of the dense matrix to "value".
+  for(OrdinalType j = 0; j < numCols_; j++)
+  {
+    for(OrdinalType i = 0; i < numRows_; i++)
+          {
+            values_[i + j*stride_] = value_in;
+          }
+  }
+  return 0;
+}
+
+template<typename OrdinalType, typename ScalarType> void
+SerialDenseMatrix<OrdinalType, ScalarType>::swap(
+  SerialDenseMatrix<OrdinalType, ScalarType> &B)
+{
+  // Notes:
+  // > DefaultBLASImpl::SWAP() uses a deep copy. This fn uses a pointer swap.
+  // > this fn covers both Vector and Matrix, such that some care must be
+  //   employed to not swap across types (creating a Vector with non-unitary
+  //   numCols_)
+  // > Inherited data that is not currently swapped (since inactive/deprecated):
+  //   >> Teuchos::CompObject:
+  //        Flops *flopCounter_ [Note: all SerialDenseMatrix ctors initialize a
+  //        NULL flop-counter using CompObject(), such that any flop increments
+  //        that are computed are not accumulated.]
+  //   >> Teuchos::Object: (now removed from inheritance list)
+  //        static int tracebackMode (no swap for statics)
+  //        std::string label_ (has been reported as a cause of memory overhead)
+
+  std::swap(values_ ,      B.values_);
+  std::swap(numRows_,      B.numRows_);
+  std::swap(numCols_,      B.numCols_);
+  std::swap(stride_,       B.stride_);
+  std::swap(valuesCopied_, B.valuesCopied_);
+}
+
+template<typename OrdinalType, typename ScalarType>
+int SerialDenseMatrix<OrdinalType, ScalarType>::random()
+{
+  // Set each value of the dense matrix to a random value.
+  for(OrdinalType j = 0; j < numCols_; j++)
+  {
+    for(OrdinalType i = 0; i < numRows_; i++)
+          {
+            values_[i + j*stride_] = ScalarTraits<ScalarType>::random();
+          }
+  }
+  return 0;
+}
+
+template<typename OrdinalType, typename ScalarType>
+SerialDenseMatrix<OrdinalType,ScalarType>&
+SerialDenseMatrix<OrdinalType, ScalarType>::operator=(
+  const SerialDenseMatrix<OrdinalType,ScalarType>& Source
+  )
+{
+  if(this == &Source)
+    return(*this); // Special case of source same as target
+  if((!valuesCopied_) && (!Source.valuesCopied_) && (values_ == Source.values_))
+    return(*this); // Special case of both are views to same data.
+
+  // If the source is a view then we will return a view, else we will return a copy.
+  if (!Source.valuesCopied_) {
+    if(valuesCopied_)       {
+      // Clean up stored data if this was previously a copy.
+      deleteArrays();
+    }
+    numRows_ = Source.numRows_;
+    numCols_ = Source.numCols_;
+    stride_ = Source.stride_;
+    values_ = Source.values_;
+  }
+  else {
+    // If we were a view, we will now be a copy.
+    if(!valuesCopied_) {
+      numRows_ = Source.numRows_;
+      numCols_ = Source.numCols_;
+      stride_ = Source.numRows_;
+      if(stride_ > 0 && numCols_ > 0) {
+        values_ = allocateValues(stride_, numCols_);
+        valuesCopied_ = true;
+      }
+      else {
+        values_ = 0;
+      }
+    }
+    // If we were a copy, we will stay a copy.
+    else {
+      if((Source.numRows_ <= stride_) && (Source.numCols_ == numCols_)) { // we don't need to reallocate
+        numRows_ = Source.numRows_;
+        numCols_ = Source.numCols_;
+      }
+      else { // we need to allocate more space (or less space)
+        deleteArrays();
+        numRows_ = Source.numRows_;
+        numCols_ = Source.numCols_;
+        stride_ = Source.numRows_;
+        if(stride_ > 0 && numCols_ > 0) {
+          values_ = allocateValues(stride_, numCols_);
+          valuesCopied_ = true;
+        }
+      }
+    }
+    copyMat(Source.values_, Source.stride_, numRows_, numCols_, values_, stride_, 0, 0);
+  }
+  return(*this);
+}
+
+template<typename OrdinalType, typename ScalarType>
+SerialDenseMatrix<OrdinalType, ScalarType>& SerialDenseMatrix<OrdinalType, ScalarType>::operator+= (const SerialDenseMatrix<OrdinalType,ScalarType>& Source )
+{
+  // Check for compatible dimensions
+  if ((numRows_ != Source.numRows_) || (numCols_ != Source.numCols_))
+  {
+    TEUCHOS_CHK_REF(*this); // Return *this without altering it.
+  }
+  copyMat(Source.values_, Source.stride_, numRows_, numCols_, values_, stride_, 0, 0, ScalarTraits<ScalarType>::one());
+  return(*this);
+}
+
+template<typename OrdinalType, typename ScalarType>
+SerialDenseMatrix<OrdinalType, ScalarType>& SerialDenseMatrix<OrdinalType, ScalarType>::operator-= (const SerialDenseMatrix<OrdinalType,ScalarType>& Source )
+{
+  // Check for compatible dimensions
+  if ((numRows_ != Source.numRows_) || (numCols_ != Source.numCols_))
+  {
+    TEUCHOS_CHK_REF(*this); // Return *this without altering it.
+  }
+  copyMat(Source.values_, Source.stride_, numRows_, numCols_, values_, stride_, 0, 0, -ScalarTraits<ScalarType>::one());
+  return(*this);
+}
+
+template<typename OrdinalType, typename ScalarType>
+SerialDenseMatrix<OrdinalType,ScalarType>& SerialDenseMatrix<OrdinalType, ScalarType>::assign (const SerialDenseMatrix<OrdinalType,ScalarType>& Source) {
+  if(this == &Source)
+    return(*this); // Special case of source same as target
+  if((!valuesCopied_) && (!Source.valuesCopied_) && (values_ == Source.values_))
+    return(*this); // Special case of both are views to same data.
+
+  // Check for compatible dimensions
+  if ((numRows_ != Source.numRows_) || (numCols_ != Source.numCols_))
+  {
+    TEUCHOS_CHK_REF(*this); // Return *this without altering it.
+  }
+  copyMat(Source.values_, Source.stride_, numRows_, numCols_, values_, stride_, 0, 0);
+  return(*this);
+}
+
+//----------------------------------------------------------------------------------------------------
+//   Accessor methods
+//----------------------------------------------------------------------------------------------------
+
+template<typename OrdinalType, typename ScalarType>
+inline ScalarType& SerialDenseMatrix<OrdinalType, ScalarType>::operator () (OrdinalType rowIndex, OrdinalType colIndex)
+{
+#ifdef HAVE_TEUCHOS_ARRAY_BOUNDSCHECK
+  checkIndex( rowIndex, colIndex );
+#endif
+  return(values_[colIndex * stride_ + rowIndex]);
+}
+
+template<typename OrdinalType, typename ScalarType>
+inline const ScalarType& SerialDenseMatrix<OrdinalType, ScalarType>::operator () (OrdinalType rowIndex, OrdinalType colIndex) const
+{
+#ifdef HAVE_TEUCHOS_ARRAY_BOUNDSCHECK
+  checkIndex( rowIndex, colIndex );
+#endif
+  return(values_[colIndex * stride_ + rowIndex]);
+}
+
+template<typename OrdinalType, typename ScalarType>
+inline const ScalarType* SerialDenseMatrix<OrdinalType, ScalarType>::operator [] (OrdinalType colIndex) const
+{
+#ifdef HAVE_TEUCHOS_ARRAY_BOUNDSCHECK
+  checkIndex( 0, colIndex );
+#endif
+  return(values_ + colIndex * stride_);
+}
+
+template<typename OrdinalType, typename ScalarType>
+inline ScalarType* SerialDenseMatrix<OrdinalType, ScalarType>::operator [] (OrdinalType colIndex)
+{
+#ifdef HAVE_TEUCHOS_ARRAY_BOUNDSCHECK
+  checkIndex( 0, colIndex );
+#endif
+  return(values_ + colIndex * stride_);
+}
+
+//----------------------------------------------------------------------------------------------------
+//   Norm methods
+//----------------------------------------------------------------------------------------------------
+
+template<typename OrdinalType, typename ScalarType>
+typename ScalarTraits<ScalarType>::magnitudeType SerialDenseMatrix<OrdinalType, ScalarType>::normOne() const
+{
+  OrdinalType i, j;
+  typename ScalarTraits<ScalarType>::magnitudeType anorm = ScalarTraits<ScalarType>::magnitude(ScalarTraits<ScalarType>::zero());
+  typename ScalarTraits<ScalarType>::magnitudeType absSum = ScalarTraits<ScalarType>::magnitude(ScalarTraits<ScalarType>::zero());
+  ScalarType* ptr;
+  for(j = 0; j < numCols_; j++)
+  {
+    ScalarType sum = 0;
+    ptr = values_ + j * stride_;
+    for(i = 0; i < numRows_; i++)
+          {
+            sum += ScalarTraits<ScalarType>::magnitude(*ptr++);
+          }
+    absSum = ScalarTraits<ScalarType>::magnitude(sum);
+    if(absSum > anorm)
+          {
+            anorm = absSum;
+          }
+  }
+  // don't compute flop increment unless there is an accumulator
+  if (flopCounter_!=0) updateFlops(numRows_ * numCols_);
+  return(anorm);
+}
+
+template<typename OrdinalType, typename ScalarType>
+typename ScalarTraits<ScalarType>::magnitudeType SerialDenseMatrix<OrdinalType, ScalarType>::normInf() const
+{
+  OrdinalType i, j;
+  typename ScalarTraits<ScalarType>::magnitudeType sum, anorm = ScalarTraits<ScalarType>::magnitude(ScalarTraits<ScalarType>::zero());
+
+  for (i = 0; i < numRows_; i++) {
+    sum = ScalarTraits<ScalarType>::magnitude(ScalarTraits<ScalarType>::zero());
+    for (j=0; j< numCols_; j++) {
+      sum += ScalarTraits<ScalarType>::magnitude(*(values_+i+j*stride_));
+    }
+    anorm = TEUCHOS_MAX( anorm, sum );
+  }
+  // don't compute flop increment unless there is an accumulator
+  if (flopCounter_!=0) updateFlops(numRows_ * numCols_);
+  return(anorm);
+}
+
+template<typename OrdinalType, typename ScalarType>
+typename ScalarTraits<ScalarType>::magnitudeType SerialDenseMatrix<OrdinalType, ScalarType>::normFrobenius() const
+{
+  OrdinalType i, j;
+  typename ScalarTraits<ScalarType>::magnitudeType anorm = ScalarTraits<ScalarType>::magnitude(ScalarTraits<ScalarType>::zero());
+  for (j = 0; j < numCols_; j++) {
+    for (i = 0; i < numRows_; i++) {
+      anorm += ScalarTraits<ScalarType>::magnitude(values_[i+j*stride_]*values_[i+j*stride_]);
+    }
+  }
+  anorm = ScalarTraits<ScalarType>::magnitude(ScalarTraits<ScalarType>::squareroot(anorm));
+  // don't compute flop increment unless there is an accumulator
+  if (flopCounter_!=0) updateFlops(numRows_ * numCols_);
+  return(anorm);
+}
+
+//----------------------------------------------------------------------------------------------------
+//   Comparison methods
+//----------------------------------------------------------------------------------------------------
+
+template<typename OrdinalType, typename ScalarType>
+bool SerialDenseMatrix<OrdinalType, ScalarType>::operator== (const SerialDenseMatrix<OrdinalType, ScalarType> &Operand) const
+{
+  bool result = 1;
+  if((numRows_ != Operand.numRows_) || (numCols_ != Operand.numCols_))
+  {
+    result = 0;
+  }
+  else
+  {
+    OrdinalType i, j;
+    for(i = 0; i < numRows_; i++)
+          {
+            for(j = 0; j < numCols_; j++)
+      {
+        if((*this)(i, j) != Operand(i, j))
+        {
+          return 0;
+        }
+      }
+          }
+  }
+  return result;
+}
+
+template<typename OrdinalType, typename ScalarType>
+bool SerialDenseMatrix<OrdinalType, ScalarType>::operator!= (const SerialDenseMatrix<OrdinalType, ScalarType> &Operand) const
+{
+  return !((*this) == Operand);
+}
+
+//----------------------------------------------------------------------------------------------------
+//   Multiplication method
+//----------------------------------------------------------------------------------------------------
+
+template<typename OrdinalType, typename ScalarType>
+SerialDenseMatrix<OrdinalType, ScalarType>& SerialDenseMatrix<OrdinalType, ScalarType>::operator*= (const ScalarType alpha )
+{
+  this->scale( alpha );
+  return(*this);
+}
+
+template<typename OrdinalType, typename ScalarType>
+int SerialDenseMatrix<OrdinalType, ScalarType>::scale( const ScalarType alpha )
+{
+  OrdinalType i, j;
+  ScalarType* ptr;
+
+  for (j=0; j<numCols_; j++) {
+    ptr = values_ + j*stride_;
+    for (i=0; i<numRows_; i++) { *ptr = alpha * (*ptr); ptr++; }
+  }
+  // don't compute flop increment unless there is an accumulator
+  if (flopCounter_!=0) updateFlops( numRows_*numCols_ );
+  return(0);
+}
+
+template<typename OrdinalType, typename ScalarType>
+int SerialDenseMatrix<OrdinalType, ScalarType>::scale( const SerialDenseMatrix<OrdinalType,ScalarType>& A )
+{
+  OrdinalType i, j;
+  ScalarType* ptr;
+
+  // Check for compatible dimensions
+  if ((numRows_ != A.numRows_) || (numCols_ != A.numCols_))
+  {
+    TEUCHOS_CHK_ERR(-1); // Return error
+  }
+  for (j=0; j<numCols_; j++) {
+    ptr = values_ + j*stride_;
+    for (i=0; i<numRows_; i++) { *ptr = A(i,j) * (*ptr); ptr++; }
+  }
+  // don't compute flop increment unless there is an accumulator
+  if (flopCounter_!=0) updateFlops( numRows_*numCols_ );
+  return(0);
+}
+
+template<typename OrdinalType, typename ScalarType>
+int  SerialDenseMatrix<OrdinalType, ScalarType>::multiply(ETransp transa, ETransp transb, ScalarType alpha, const SerialDenseMatrix<OrdinalType, ScalarType> &A, const SerialDenseMatrix<OrdinalType, ScalarType> &B, ScalarType beta)
+{
+  // Check for compatible dimensions
+  OrdinalType A_nrows = (ETranspChar[transa]!='N') ? A.numCols() : A.numRows();
+  OrdinalType A_ncols = (ETranspChar[transa]!='N') ? A.numRows() : A.numCols();
+  OrdinalType B_nrows = (ETranspChar[transb]!='N') ? B.numCols() : B.numRows();
+  OrdinalType B_ncols = (ETranspChar[transb]!='N') ? B.numRows() : B.numCols();
+  if ((numRows_ != A_nrows) || (A_ncols != B_nrows) || (numCols_ != B_ncols))
+  {
+    TEUCHOS_CHK_ERR(-1); // Return error
+  }
+  // Call GEMM function
+  this->GEMM(transa, transb, numRows_, numCols_, A_ncols, alpha, A.values(), A.stride(), B.values(), B.stride(), beta, values_, stride_);
+
+  // don't compute flop increment unless there is an accumulator
+  if (flopCounter_!=0) {
+    double nflops = 2 * numRows_;
+    nflops *= numCols_;
+    nflops *= A_ncols;
+    updateFlops(nflops);
+  }
+  return(0);
+}
+
+template<typename OrdinalType, typename ScalarType>
+int SerialDenseMatrix<OrdinalType, ScalarType>::multiply (ESide sideA, ScalarType alpha, const SerialSymDenseMatrix<OrdinalType, ScalarType> &A, const SerialDenseMatrix<OrdinalType, ScalarType> &B, ScalarType beta)
+{
+  // Check for compatible dimensions
+  OrdinalType A_nrows = A.numRows(), A_ncols = A.numCols();
+  OrdinalType B_nrows = B.numRows(), B_ncols = B.numCols();
+
+  if (ESideChar[sideA]=='L') {
+    if ((numRows_ != A_nrows) || (A_ncols != B_nrows) || (numCols_ != B_ncols)) {
+      TEUCHOS_CHK_ERR(-1); // Return error
+    }
+  } else {
+    if ((numRows_ != B_nrows) || (B_ncols != A_nrows) || (numCols_ != A_ncols)) {
+      TEUCHOS_CHK_ERR(-1); // Return error
+    }
+  }
+
+  // Call SYMM function
+  EUplo uplo = (A.upper() ? Teuchos::UPPER_TRI : Teuchos::LOWER_TRI);
+  this->SYMM(sideA, uplo, numRows_, numCols_, alpha, A.values(), A.stride(), B.values(), B.stride(), beta, values_, stride_);
+
+  // don't compute flop increment unless there is an accumulator
+  if (flopCounter_!=0) {
+    double nflops = 2 * numRows_;
+    nflops *= numCols_;
+    nflops *= A_ncols;
+    updateFlops(nflops);
+  }
+  return(0);
+}
+
+template<typename OrdinalType, typename ScalarType>
+std::ostream& SerialDenseMatrix<OrdinalType, ScalarType>::print(std::ostream& os) const
+{
+  os << std::endl;
+  if(valuesCopied_)
+    os << "Values_copied : yes" << std::endl;
+  else
+    os << "Values_copied : no" << std::endl;
+  os << "Rows : " << numRows_ << std::endl;
+  os << "Columns : " << numCols_ << std::endl;
+  os << "LDA : " << stride_ << std::endl;
+  if(numRows_ == 0 || numCols_ == 0) {
+    os << "(matrix is empty, no values to display)" << std::endl;
+  } else {
+    for(OrdinalType i = 0; i < numRows_; i++) {
+      for(OrdinalType j = 0; j < numCols_; j++){
+        os << (*this)(i,j) << " ";
+      }
+      os << std::endl;
+    }
+  }
+  return os;
+}
+
+//----------------------------------------------------------------------------------------------------
+//   Protected methods
+//----------------------------------------------------------------------------------------------------
+
+template<typename OrdinalType, typename ScalarType>
+inline void SerialDenseMatrix<OrdinalType, ScalarType>::checkIndex( OrdinalType rowIndex, OrdinalType colIndex ) const {
+  TEUCHOS_TEST_FOR_EXCEPTION(rowIndex < 0 || rowIndex >= numRows_, std::out_of_range,
+    "SerialDenseMatrix<T>::checkIndex: "
+    "Row index " << rowIndex << " out of range [0, "<< numRows_ << ")");
+  TEUCHOS_TEST_FOR_EXCEPTION(colIndex < 0 || colIndex >= numCols_, std::out_of_range,
+    "SerialDenseMatrix<T>::checkIndex: "
+    "Col index " << colIndex << " out of range [0, "<< numCols_ << ")");
+}
+
+template<typename OrdinalType, typename ScalarType>
+void SerialDenseMatrix<OrdinalType, ScalarType>::deleteArrays(void)
+{
+  if (valuesCopied_)
+  {
+    delete [] values_;
+    values_ = 0;
+    valuesCopied_ = false;
+  }
+}
+
+template<typename OrdinalType, typename ScalarType>
+void SerialDenseMatrix<OrdinalType, ScalarType>::copyMat(
+  ScalarType* inputMatrix, OrdinalType strideInput, OrdinalType numRows_in,
+  OrdinalType numCols_in, ScalarType* outputMatrix, OrdinalType strideOutput,
+  OrdinalType startRow, OrdinalType startCol, ScalarType alpha
+  )
+{
+  OrdinalType i, j;
+  ScalarType* ptr1 = 0;
+  ScalarType* ptr2 = 0;
+  for(j = 0; j < numCols_in; j++) {
+    ptr1 = outputMatrix + (j * strideOutput);
+    ptr2 = inputMatrix + (j + startCol) * strideInput + startRow;
+    if (alpha != Teuchos::ScalarTraits<ScalarType>::zero() ) {
+      for(i = 0; i < numRows_in; i++)
+            {
+              *ptr1++ += alpha*(*ptr2++);
+            }
+    } else {
+      for(i = 0; i < numRows_in; i++)
+            {
+              *ptr1++ = *ptr2++;
+            }
+    }
+  }
+}
+
+/// \brief Ostream manipulator for SerialDenseMatrix
+template<typename OrdinalType, typename ScalarType>
+struct SerialDenseMatrixPrinter {
+public:
+  const SerialDenseMatrix<OrdinalType,ScalarType> &obj;
+  SerialDenseMatrixPrinter(
+        const SerialDenseMatrix<OrdinalType,ScalarType> &obj_in)
+      : obj(obj_in) {}
+};
+
+/// \brief Output SerialDenseMatrix object through its stream manipulator.
+template<typename OrdinalType, typename ScalarType>
+std::ostream&
+operator<<(std::ostream &out,
+           const SerialDenseMatrixPrinter<OrdinalType,ScalarType> printer)
+{
+  printer.obj.print(out);
+  return out;
+}
+
+/// \brief Return SerialDenseMatrix ostream manipulator Use as:
+template<typename OrdinalType, typename ScalarType>
+SerialDenseMatrixPrinter<OrdinalType,ScalarType>
+printMat(const SerialDenseMatrix<OrdinalType,ScalarType> &obj)
+{
+  return SerialDenseMatrixPrinter<OrdinalType,ScalarType>(obj);
+}
+
+
+} // namespace Teuchos
+
+
+#endif /* _TEUCHOS_SERIALDENSEMATRIX_HPP_ */
diff --git a/packages/rol/src/compatibility/teuchos-lite/Teuchos_SerialDenseVector.hpp b/packages/rol/src/compatibility/teuchos-lite/Teuchos_SerialDenseVector.hpp
new file mode 100644
index 000000000000..c2f14836c2e0
--- /dev/null
+++ b/packages/rol/src/compatibility/teuchos-lite/Teuchos_SerialDenseVector.hpp
@@ -0,0 +1,374 @@
+// @HEADER
+// ***********************************************************************
+//
+//                    Teuchos: Common Tools Package
+//                 Copyright (2004) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
+//
+// ***********************************************************************
+// @HEADER
+
+
+#ifndef _TEUCHOS_SERIALDENSEVECTOR_HPP_
+#define _TEUCHOS_SERIALDENSEVECTOR_HPP_
+
+/*! \file Teuchos_SerialDenseVector.hpp
+    \brief Templated serial dense vector class
+*/
+
+#include "Teuchos_ConfigDefs.hpp"
+#include "Teuchos_Object.hpp"
+#include "Teuchos_SerialDenseMatrix.hpp"
+
+/*! \class Teuchos::SerialDenseVector
+    \brief This class creates and provides basic support for dense vectors of templated type as a specialization of Teuchos::SerialDenseMatrix.  Additional methods for the SerialDenseVector class, like mathematical methods, can be found documented in SerialDenseMatrix.
+*/
+namespace Teuchos {
+
+  template<typename OrdinalType, typename ScalarType>
+  class SerialDenseVector : public SerialDenseMatrix<OrdinalType,ScalarType> {
+
+  public:
+    //! @name Constructor/Destructor methods.
+  //@{
+
+    //! Default Constructor
+    /*! Creates an empty vector of no length.  The Sizing methods should be used to size this matrix.  Values of this matrix should be set using the [] or the () operators.
+    */
+    SerialDenseVector();
+
+    //! Shaped Constructor
+    /*!
+	\param length - Number of elements in this vector.
+	\param zeroOut - Initializes values to 0 if true (default)
+
+    Creates a shaped vector of length \c length.  All values are initialized to 0 when \c zeroOut is true.
+    Values of this matrix should be set using the [] or the () operators.
+    */
+    SerialDenseVector(OrdinalType length, bool zeroOut = true);
+
+    //! Shaped Constructor with Values
+    /*!
+	\param CV - Enumerated type set to Teuchos::Copy or Teuchos::View.
+	\param values - Pointer to an array of ScalarType of the given \c length.
+	\param length - Length of vector to be constructed.
+    */
+    SerialDenseVector(DataAccess CV, ScalarType* values, OrdinalType length);
+
+    //! Copy Constructor
+    SerialDenseVector(const SerialDenseVector<OrdinalType,ScalarType>& Source);
+
+    //! Copy Constructor
+    /*! \note Allow explicit control of whether a deep copy or view of \c Source is made with this copy constructor.
+    */
+    SerialDenseVector(DataAccess CV, const SerialDenseVector<OrdinalType, ScalarType> &Source);
+    
+    //! Destructor
+    virtual ~SerialDenseVector ();
+  //@}
+
+  //! @name Sizing methods.
+  //@{
+
+    //! Size method for changing the size of a SerialDenseVector, initializing entries to zero.
+    /*!
+	\param length - The length of the new vector.
+
+	This allows the user to define the length of a SerialDenseVector at any point.
+	This method can be called at any point after construction.  Any values previously in
+	this object will be destroyed and the resized vector starts with all zero values.
+    */
+    int size(OrdinalType length_in)
+      {return(SerialDenseMatrix<OrdinalType, ScalarType>::shape(length_in, 1));}
+
+    //! Same as <tt>size()</tt> except leaves values uninitialized.
+    int sizeUninitialized(OrdinalType length_in)
+      {return(SerialDenseMatrix<OrdinalType, ScalarType>::shapeUninitialized(length_in, 1));}
+
+    //! Resizing method for changing the size of a SerialDenseVector, keeping the entries.
+    /*!
+	\param length - The length of the new vector.
+	This allows the user to redefine the length of a SerialDenseVector at any point.
+	This method can be called at any point after construction.  Any values previously in
+	this object will be copied to the resized vector.
+    */
+    int resize(OrdinalType length_in)
+      {return(SerialDenseMatrix<OrdinalType,ScalarType>::reshape(length_in, 1));}
+  //@}
+
+  //! @name Set methods.
+  //@{
+
+    //! Set all values in the matrix to a constant value.
+    /*!
+      \param value - Value to use;
+    */
+    SerialDenseVector<OrdinalType, ScalarType>& operator= (const ScalarType value) { this->putScalar(value); return(*this); }
+  //@}
+
+  //! @name Comparison methods.
+  //@{
+    //! Equality of two matrices.
+    /*! \return True if \e this vector and \c Operand are of the same length and have the same entries, else False will be returned.
+    */
+    bool operator == (const SerialDenseVector<OrdinalType, ScalarType> &Operand) const;
+
+    //! Inequality of two matrices.
+    /*! \return True if \e this vector and \c Operand are not of the same length or do not have the same entries, else False will be returned.
+    */
+    bool operator != (const SerialDenseVector<OrdinalType, ScalarType> &Operand) const;
+  //@}
+
+  //! @name Set methods.
+  //@{
+
+    //! Copies values from one vector to another.
+    /*!
+	The operator= copies the values from one existing SerialDenseVector to
+	another.  If \c Source is a view (i.e. CV = Teuchos::View), then this
+	method will return a view.  Otherwise, it will return a copy of \c Source.
+	\e this will be resized if it is not large enough to copy \c Source into.
+    */
+    SerialDenseVector<OrdinalType,ScalarType>& operator = (const SerialDenseVector<OrdinalType,ScalarType>& Source);
+  //@}
+
+  //! @name Accessor methods.
+  //@{
+    //! Element access method (non-const).
+    /*! Returns the ith element if x(i) is specified, the expression x[i] will return the same element.
+	\return (*this)(index)
+	\warning The validity of \c index will only be checked if Teuchos is configured with --enable-teuchos-abc.
+    */
+    ScalarType& operator () (OrdinalType index);
+
+    //! Element access method (const).
+    /*! Returns the ith element if x(i) is specified, the expression x[i] will return the same element.
+	\return (*this)(index)
+	\warning The validity of \c index will only be checked if Teuchos is configured with --enable-teuchos-abc.
+    */
+    const ScalarType& operator () (OrdinalType index) const;
+
+    //! Element access method (non-const).
+    /*! Returns the ith element if x[i] is specified, the expression x(i) will return the same element.
+	\return (*this)[index]
+	\warning The validity of \c index will only be checked if Teuchos is configured with --enable-teuchos-abc.
+    */
+    ScalarType& operator [] (OrdinalType index);
+
+    //! Element access method (const).
+    /*! Returns the ith element if x[i] is specified, the expression x(i) will return the same element.
+    	\return (*this)[index]
+	\warning The validity of \c index will only be checked if Teuchos is configured with --enable-teuchos-abc.
+    */
+    const ScalarType& operator [] (OrdinalType index) const;
+
+  //@}
+
+  //! @name Mathematical methods.
+  //@{
+    //! Compute the dot product of \c this vector and \c x.
+    ScalarType dot( const SerialDenseVector<OrdinalType,ScalarType> &x) const;
+  //@}
+
+  //! @name Attribute methods.
+  //@{
+    //! Returns the length of this vector.
+    OrdinalType length() const {return(this->numRows_);}
+  //@}
+
+  //! @name I/O methods.
+  //@{
+    //! Print method.  Define the behavior of the std::ostream << operator inherited from the Object class.
+    std::ostream& print(std::ostream& os) const;
+  //@}
+};
+
+  template<typename OrdinalType, typename ScalarType>
+  SerialDenseVector<OrdinalType, ScalarType>::SerialDenseVector() : SerialDenseMatrix<OrdinalType,ScalarType>() {}
+
+  template<typename OrdinalType, typename ScalarType>
+  SerialDenseVector<OrdinalType, ScalarType>::SerialDenseVector( OrdinalType length_in, bool zeroOut ) : SerialDenseMatrix<OrdinalType,ScalarType>( length_in, 1, zeroOut ) {}
+
+  template<typename OrdinalType, typename ScalarType>
+  SerialDenseVector<OrdinalType, ScalarType>::SerialDenseVector(DataAccess CV, ScalarType* values_in, OrdinalType length_in) :
+    SerialDenseMatrix<OrdinalType,ScalarType>( CV, values_in, length_in, length_in, 1 ) {}
+
+  template<typename OrdinalType, typename ScalarType>
+  SerialDenseVector<OrdinalType, ScalarType>::SerialDenseVector(const SerialDenseVector<OrdinalType, ScalarType> &Source) :
+    SerialDenseMatrix<OrdinalType,ScalarType>( Source ) {}
+
+  template<typename OrdinalType, typename ScalarType>
+  SerialDenseVector<OrdinalType, ScalarType>::SerialDenseVector(DataAccess CV, const SerialDenseVector<OrdinalType, ScalarType> &Source) :
+    SerialDenseMatrix<OrdinalType,ScalarType>( CV, Source ) {}
+
+  template<typename OrdinalType, typename ScalarType>
+  SerialDenseVector<OrdinalType, ScalarType>::~SerialDenseVector() {}
+
+  template<typename OrdinalType, typename ScalarType>
+  SerialDenseVector<OrdinalType, ScalarType>& SerialDenseVector<OrdinalType,ScalarType>::operator = (const SerialDenseVector<OrdinalType, ScalarType>& Source)
+  {
+    SerialDenseMatrix<OrdinalType,ScalarType>::operator=(Source);
+    return(*this);
+  }
+
+  template<typename OrdinalType, typename ScalarType>
+  bool SerialDenseVector<OrdinalType, ScalarType>::operator == (const SerialDenseVector<OrdinalType, ScalarType> &Operand) const
+  {
+    bool result = 1;
+    if(this->numRows_ != Operand.numRows_)
+      {
+	result = 0;
+      }
+    else
+      {
+	OrdinalType i;
+	for(i = 0; i < this->numRows_; i++) {
+	  if((*this)(i) != Operand(i))
+	    {
+	      return 0;
+	    }
+	}
+      }
+    return result;
+  }
+
+  template<typename OrdinalType, typename ScalarType>
+  bool SerialDenseVector<OrdinalType, ScalarType>::operator != (const SerialDenseVector<OrdinalType, ScalarType> &Operand) const
+  {
+    return !((*this)==Operand);
+  }
+
+  template<typename OrdinalType, typename ScalarType>
+  ScalarType SerialDenseVector<OrdinalType, ScalarType>::dot( const SerialDenseVector<OrdinalType, ScalarType> &x) const
+  {
+    TEUCHOS_TEST_FOR_EXCEPTION(this->numRows_!= x.numRows_, std::invalid_argument,
+    "SerialDenseVector<T>::dot : " <<
+    "Number of rows " << this->numRows_ << " not equal to x.numRows_ "<< x.numRows() );
+
+    // Compute the dot product and return the result.
+    return BLAS<OrdinalType, ScalarType>::DOT(this->numRows_, this->values(), 1, x.values(), 1);
+  }
+
+  template<typename OrdinalType, typename ScalarType>
+  std::ostream& SerialDenseVector<OrdinalType, ScalarType>::print(std::ostream& os) const
+  {
+    os << std::endl;
+    if(this->valuesCopied_)
+      os << "Values_copied : yes" << std::endl;
+    else
+      os << "Values_copied : no" << std::endl;
+    os << "Length : " << this->numRows_ << std::endl;
+    if(this->numRows_ == 0) {
+      os << "(std::vector is empty, no values to display)" << std::endl;
+    } else {
+      for(OrdinalType i = 0; i < this->numRows_; i++) {
+        os << (*this)(i) << " ";
+      }
+      os << std::endl;
+    }
+    return os;
+  }
+
+  //----------------------------------------------------------------------------------------------------
+  //   Accessor methods
+  //----------------------------------------------------------------------------------------------------
+
+  template<typename OrdinalType, typename ScalarType>
+  inline ScalarType& SerialDenseVector<OrdinalType, ScalarType>::operator () (OrdinalType index)
+  {
+#ifdef HAVE_TEUCHOS_ARRAY_BOUNDSCHECK
+    this->checkIndex( index );
+#endif
+    return(this->values_[index]);
+  }
+
+  template<typename OrdinalType, typename ScalarType>
+  inline const ScalarType& SerialDenseVector<OrdinalType, ScalarType>::operator () (OrdinalType index) const
+  {
+#ifdef HAVE_TEUCHOS_ARRAY_BOUNDSCHECK
+    this->checkIndex( index );
+#endif
+    return(this->values_[index]);
+  }
+
+  template<typename OrdinalType, typename ScalarType>
+  inline const ScalarType& SerialDenseVector<OrdinalType, ScalarType>::operator [] (OrdinalType index) const
+  {
+#ifdef HAVE_TEUCHOS_ARRAY_BOUNDSCHECK
+    this->checkIndex( index );
+#endif
+    return(this->values_[index]);
+  }
+
+  template<typename OrdinalType, typename ScalarType>
+  inline ScalarType& SerialDenseVector<OrdinalType, ScalarType>::operator [] (OrdinalType index)
+  {
+#ifdef HAVE_TEUCHOS_ARRAY_BOUNDSCHECK
+    this->checkIndex( index );
+#endif
+    return(this->values_[index]);
+  }
+
+/// \brief Ostream manipulator for SerialDenseVector 
+template<typename OrdinalType, typename ScalarType>
+struct SerialDenseVectorPrinter {
+public:
+  const SerialDenseVector<OrdinalType,ScalarType> &obj;
+  SerialDenseVectorPrinter(
+        const SerialDenseVector<OrdinalType,ScalarType> &obj_in)
+      : obj(obj_in) {}
+};
+
+/// \brief Output SerialDenseVector object through its stream manipulator. 
+template<typename OrdinalType, typename ScalarType>
+std::ostream&
+operator<<(std::ostream &out,
+           const SerialDenseVectorPrinter<OrdinalType,ScalarType> printer)
+{
+  printer.obj.print(out);
+  return out;
+}
+
+/// \brief Return SerialDenseVector ostream manipulator Use as:
+template<typename OrdinalType, typename ScalarType>
+SerialDenseVectorPrinter<OrdinalType,ScalarType>
+printMat(const SerialDenseVector<OrdinalType,ScalarType> &obj)
+{
+  return SerialDenseVectorPrinter<OrdinalType,ScalarType>(obj);
+}
+
+
+} // namespace Teuchos
+
+#endif /* _TEUCHOS_SERIALDENSEVECTOR_HPP_ */
diff --git a/packages/rol/src/compatibility/teuchos-lite/Teuchos_SerialSymDenseMatrix.hpp b/packages/rol/src/compatibility/teuchos-lite/Teuchos_SerialSymDenseMatrix.hpp
new file mode 100644
index 000000000000..1b932d0a8c47
--- /dev/null
+++ b/packages/rol/src/compatibility/teuchos-lite/Teuchos_SerialSymDenseMatrix.hpp
@@ -0,0 +1,1209 @@
+
+// @HEADER
+// ***********************************************************************
+//
+//                    Teuchos: Common Tools Package
+//                 Copyright (2004) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// ***********************************************************************
+// @HEADER
+
+#ifndef _TEUCHOS_SERIALSYMDENSEMATRIX_HPP_
+#define _TEUCHOS_SERIALSYMDENSEMATRIX_HPP_
+/*! \file Teuchos_SerialSymDenseMatrix.hpp
+  \brief Templated serial, dense, symmetric matrix class.
+*/
+
+#include "Teuchos_CompObject.hpp"
+#include "Teuchos_BLAS.hpp"
+#include "Teuchos_ScalarTraits.hpp"
+#include "Teuchos_DataAccess.hpp"
+#include "Teuchos_ConfigDefs.hpp"
+#include "Teuchos_Assert.hpp"
+#include <utility>
+#include <vector>
+
+/*! \class Teuchos::SerialSymDenseMatrix
+    \brief This class creates and provides basic support for symmetric, positive-definite dense matrices of templated type.
+
+    The Teuchos_SerialSymDenseMatrix class enables the construction and use of
+    symmetric, positive-definite, dense matrices of templated type.
+
+    The Teuchos::SerialSymDenseMatrix class is intended to provide full-featured support for solving
+    linear and eigen system problems for symmetric positive-definite (SPD) matrices.  It is written on
+    top of BLAS and LAPACK and thus has excellent performance and numerical capabilities.  Using this
+    class, one can either perform simple factorizations and solves or apply all the tricks available in
+    LAPACK to get the best possible solution for very ill-conditioned problems.
+
+    <b>Teuchos::SerialSymDenseMatrix vs. Teuchos::LAPACK</b>
+
+    The Teuchos::LAPACK class provides access to most of the same functionality as Teuchos::SerialSymDenseMatrix.
+    The primary difference is that Teuchos::LAPACK is a "thin" layer on top of LAPACK and
+    Teuchos::SerialSymDenseMatrix attempts to provide easy access to the more sophisticated aspects of
+    solving dense linear and eigensystems.
+<ul>
+<li> When you should use Teuchos::LAPACK:  If you are simply looking for a convenient wrapper around
+the Fortran LAPACK routines and you have a well-conditioned problem, you should probably use Teuchos::LAPACK directly.
+<li> When you should use Teuchos::SerialSymDenseMatrix: If you want to (or potentially want to) solve ill-conditioned
+     problems or want to work with a more object-oriented interface, you should probably use Teuchos::SerialSymDenseMatrix.
+</ul>
+
+<b>Constructing Teuchos::SerialSymDenseMatrix Objects</b>
+
+There are three Teuchos::SerialSymDenseMatrix constructors.  The first constructs a zero-sized object
+which should be made to appropriate length using the Shape() or Reshape() functions and then filled with
+the [] or () operators. The second is a constructor that accepts user data as a 2D array, the third is a
+copy constructor. The second constructor has two data access modes (specified by the Teuchos::DataAccess argument):
+<ol>
+  <li> Copy mode - Allocates memory and makes a copy of the user-provided data. In this case, the
+       user data is not needed after construction.
+  <li> View mode - Creates a "view" of the user data. In this case, the
+       user data is required to remain intact for the life of the object.
+</ol>
+
+\warning View mode is \e extremely dangerous from a data hiding perspective. Therefore, we strongly encourage
+users to develop code using Copy mode first and only use the View mode in a secondary optimization phase.
+
+<b>Extracting Data from Teuchos::SerialSymDenseMatrix Objects</b>
+
+Once a Teuchos::SerialSymDenseMatrix is constructed, it is possible to view the data via access functions.
+
+\warning Use of these access functions cam be \e extremely dangerous from a data hiding perspective.
+
+<b>Vector and Utility Functions</b>
+
+Once a Teuchos::SerialSymDenseMatrix is constructed, several mathematical functions can be applied to
+the object.  Specifically:
+<ul>
+  <li> Multiplication.
+  <li> Norms.
+</ul>
+
+*/
+
+/** \example DenseMatrix/cxx_main_sym.cpp
+    This is an example of how to use the Teuchos::SerialSymDenseMatrix class.
+*/
+
+namespace Teuchos {
+
+template<typename OrdinalType, typename ScalarType>
+class SerialSymDenseMatrix : public CompObject, public BLAS<OrdinalType,ScalarType>
+{
+ public:
+
+  //! Typedef for ordinal type
+  typedef OrdinalType ordinalType;
+  //! Typedef for scalar type
+  typedef ScalarType scalarType;
+
+  //! @name Constructor/Destructor Methods
+  //@{
+  //! Default constructor; defines a zero size object.
+  /*!
+    Teuchos::SerialSymDenseMatrix objects defined by the default constructor
+    should be sized with the Shape() or Reshape() functions.
+    Values should be defined by using the [] or ()operators.
+
+    Note: By default the active part of the matrix is assumed to be the lower triangular part.
+    To set the upper part as active, call SetUpper(). See Detailed Description section for further discussion.
+   */
+  SerialSymDenseMatrix() = default;
+
+  //! Basic constructor; defines a matrix of \c numRowsCols size and (optionally) initializes it.
+  /*!
+    \param numRowsCols - Number of rows and columns in the matrix.
+    \param zeroOut - Initializes values to 0 if true (default)
+
+    Creates a shaped matrix with \c numRowsCols rows and cols.  All values are initialized to 0 when \c zeroOut is true.
+    Values of this matrix should be set using the [] or the () operators.
+
+    \note By default the active part of the matrix is assumed to be the lower triangular part.
+    To set the upper part as active, call SetUpper(). See Detailed Description section for further discussion.
+  */
+  SerialSymDenseMatrix(OrdinalType numRowsCols, bool zeroOut = true);
+
+  //! Set object values from two-dimensional array.
+  /*!
+    \param CV - Enumerated type set to Teuchos::Copy or Teuchos::View.
+
+    \param values - Pointer to an array of ScalarType.  The first column starts at \c values,
+                the second at \c values+stride, etc.
+    \param stride - The stride between the columns of the matrix in memory.
+    \param numRowsCols - Number of rows and columns in the matrix.
+
+    \note By default the active part of the matrix is assumed to be the lower triangular part.
+    To set the upper part as active, call SetUpper(). See Detailed Description section for further discussion.
+  */
+  SerialSymDenseMatrix(DataAccess CV, bool upper, ScalarType* values, OrdinalType stride, OrdinalType numRowsCols);
+
+  //! Teuchos::SerialSymDenseMatrix copy constructor.
+  SerialSymDenseMatrix(const SerialSymDenseMatrix<OrdinalType, ScalarType> &Source);
+
+  //! Submatrix Copy Constructor
+  /*!
+    \param CV - Enumerated type set to Teuchos::Copy or Teuchos::View.
+    \param Source - Reference to another dense matrix from which values are to be copied.
+    \param numRowCols - The number of rows and columns in this matrix.
+    \param startRowCol - The row and column of \c Source from which the submatrix copy should start.
+
+    Creates a shaped matrix with \c numRowCols rows and columns, which is a submatrix of \c Source.
+    If \c startRowCol are not given, then the submatrix is the leading submatrix of \c Source.
+  */
+  SerialSymDenseMatrix(DataAccess CV, const SerialSymDenseMatrix<OrdinalType, ScalarType> &Source, OrdinalType numRowCols, OrdinalType startRowCol=0);
+
+  //! Teuchos::SerialSymDenseMatrix destructor.
+  virtual ~SerialSymDenseMatrix ();
+  //@}
+
+  //! @name Shaping Methods
+  //@{
+
+  //! Set dimensions of a Teuchos::SerialSymDenseMatrix object; init values to zero.
+  /*!
+    \param numRowsCols - Number of rows and columns in object.
+
+    Allows user to define the dimensions of a Teuchos::SerialSymDenseMatrix at any point. This function can
+    be called at any point after construction.  Any values that were previously in this object are
+    destroyed and the resized matrix starts off with all zero values.
+
+    \return Integer error code, set to 0 if successful.
+  */
+  int shape(OrdinalType numRowsCols);
+
+  //! Set dimensions of a Teuchos::SerialSymDenseMatrix object; don't initialize values.
+  /*!
+    \param numRowsCols - Number of rows and columns in object.
+
+    Allows user to define the dimensions of a Teuchos::SerialSymDenseMatrix at any point. This function can
+    be called at any point after construction.  Any values that were previously in this object are
+    destroyed.  The resized matrix has uninitialized values.
+
+    \return Integer error code, set to 0 if successful.
+  */
+  int shapeUninitialized(OrdinalType numRowsCols);
+
+  //! Reshape a Teuchos::SerialSymDenseMatrix object.
+  /*!
+    \param numRowsCols - Number of rows and columns in object.
+
+    Allows user to define the dimensions of a Teuchos::SerialSymDenseMatrix at any point. This function can
+    be called at any point after construction.  Any values that were previously in this object are
+    copied into the new shape.  If the new shape is smaller than the original, the upper left portion
+    of the original matrix (the principal submatrix) is copied to the new matrix.
+
+    \return Integer error code, set to 0 if successful.
+  */
+  int reshape(OrdinalType numRowsCols);
+
+  //! Specify that the lower triangle of the \e this matrix should be used.
+  /*! \warning This may necessitate the matrix values be copied from the upper to lower portion of the matrix.
+  */
+  void setLower();
+
+  //! Specify that the upper triangle of the \e this matrix should be used.
+  /*! \warning This may necessitate the matrix values be copied from the lower to upper portion of the matrix.
+  */
+  void setUpper();
+
+  //@}
+
+  //! @name Set methods.
+  //@{
+
+  //! Copies values from one matrix to another.
+  /*!
+    The operator= copies the values from one existing SerialSymDenseMatrix to another.
+    If \c Source is a view (i.e. CV = Teuchos::View), then this method will
+    return a view.  Otherwise, it will return a copy of \c Source.  \e this object
+    will be resized if it is not large enough to copy \c Source into.
+  */
+  SerialSymDenseMatrix<OrdinalType, ScalarType>& operator= (const SerialSymDenseMatrix<OrdinalType, ScalarType>& Source);
+
+  //! Copies values from one matrix to another.
+  /*!
+    Copies the values from one existing SerialSymDenseMatrix to another if the dimension of both matrices are the same.
+    If not, \e this matrix will be returned unchanged.
+  */
+  SerialSymDenseMatrix<OrdinalType, ScalarType>& assign (const SerialSymDenseMatrix<OrdinalType, ScalarType>& Source);
+
+  //! Set all values in the matrix to a constant value.
+  /*!
+    \param value - Value to use;
+  */
+  SerialSymDenseMatrix<OrdinalType, ScalarType>& operator= (const ScalarType value) { putScalar(value); return(*this); }
+
+  //! Set all values in the matrix to a constant value.
+  /*!
+    \param value - Value to use; zero if none specified.
+    \param fullMatrix - set full matrix entries to \c value, not just active portion of symmetric matrix.
+    \return Integer error code, set to 0 if successful.
+  */
+  int putScalar( const ScalarType value = Teuchos::ScalarTraits<ScalarType>::zero(), bool fullMatrix = false );
+
+  //! Swap values between this matrix and incoming matrix.
+  /*!
+    Swaps pointers and associated state without copying the matrix data.
+  */
+  void swap (SerialSymDenseMatrix<OrdinalType, ScalarType> &B);
+
+  //! Set all values in the active area (upper/lower triangle) of this matrix to be random numbers.
+  /*! \note The diagonal will be the sum of the off diagonal elements, plus a bias, so the matrix is SPD.
+   */
+  int random( const ScalarType bias = 0.1*Teuchos::ScalarTraits<ScalarType>::one() );
+
+  //@}
+
+  //! @name Accessor methods.
+  //@{
+
+  //! Element access method (non-const).
+  /*! Returns the element in the ith row and jth column if A(i,j) is specified.
+
+    \return Element from the specified \c rowIndex row and \c colIndex column.
+    \note If the requested element lies in the inactive part of the matrix, then A(j,i) will be returned.
+    \warning The validity of \c rowIndex and \c colIndex will only be checked if Teuchos is
+    configured with --enable-teuchos-abc.
+  */
+  ScalarType& operator () (OrdinalType rowIndex, OrdinalType colIndex);
+
+  //! Element access method (const).
+  /*! Returns the element in the ith row and jth column if A(i,j) is specified.
+
+    \return Element from the specified \c rowIndex row and \c colIndex column.
+    \note If the requested element lies in the inactive part of the matrix, then A(j,i) will be returned.
+    \warning The validity of \c rowIndex and \c colIndex will only be checked if Teuchos is
+    configured with --enable-teuchos-abc.
+  */
+  const ScalarType& operator () (OrdinalType rowIndex, OrdinalType colIndex) const;
+
+  //! Returns the pointer to the ScalarType data array contained in the object.
+  /*! \note The matrix values are only guaranteed to be stored in the active area of the matrix (upper/lower).
+  */
+  ScalarType* values() const { return(values_); }
+
+  //@}
+
+  //! @name Query methods
+  //@{
+
+  //! Returns true if upper triangular part of \e this matrix has and will be used.
+  bool upper() const {return(upper_);};
+
+  //! Returns character value of UPLO used by LAPACK routines.
+  char UPLO() const {return(UPLO_);};
+  //@}
+
+  //! @name Mathematical Methods
+  //@{
+
+  //! Inplace scalar-matrix product A = \c alpha*A.
+  /*! Scale a matrix, entry-by-entry using the value \e alpha.  This method is sensitive to
+      the UPLO() parameter.
+
+   \param alpha - Scalar to multiply with A.
+
+  */
+  SerialSymDenseMatrix<OrdinalType, ScalarType>& operator*= (const ScalarType alpha);
+
+  //! Add another matrix to \e this matrix.
+  /*! Add \c Source to \e this if the dimension of both matrices are the same.  If not, \e this matrix
+    will be returned unchanged.
+  */
+  SerialSymDenseMatrix<OrdinalType, ScalarType>& operator+= (const SerialSymDenseMatrix<OrdinalType, ScalarType>& Source);
+
+  //! Subtract another matrix from \e this matrix.
+  /*! Subtract \c Source from \e this if the dimension of both matrices are the same.  If not, \e this matrix
+    will be returned unchanged.
+  */
+  SerialSymDenseMatrix<OrdinalType, ScalarType>& operator-= (const SerialSymDenseMatrix<OrdinalType, ScalarType>& Source);
+
+  //@}
+
+  //! @name Comparison methods.
+  //@{
+
+  //! Equality of two matrices.
+  /*! \return True if \e this matrix and \c Operand are of the same shape (rows / columns) and have
+    the same entries in the active (upper / lower triangular) area of the matrix, else False will be returned.
+  */
+  bool operator== (const SerialSymDenseMatrix<OrdinalType, ScalarType> &Operand) const;
+
+  //! Inequality of two matrices.
+  /*! \return True if \e this matrix and \c Operand of not of the same shape (rows / columns) or don't
+    have the same entries in the active (upper / lower triangular), else False will be returned.
+  */
+  bool operator!= (const SerialSymDenseMatrix<OrdinalType, ScalarType> &Operand) const;
+
+  //@}
+
+  //! @name Attribute methods.
+  //@{
+
+  //! Returns the row dimension of this matrix.
+  OrdinalType numRows() const { return(numRowCols_); }
+
+  //! Returns the column dimension of this matrix.
+  OrdinalType numCols() const { return(numRowCols_); }
+
+  //! Returns the stride between the columns of this matrix in memory.
+  OrdinalType stride() const { return(stride_); }
+
+  //! Returns whether this matrix is empty.
+  bool empty() const { return(numRowCols_ == 0); }
+
+  //@}
+
+  //! @name Norm methods.
+  //@{
+
+  //! Returns the 1-norm of the matrix.
+  typename ScalarTraits<ScalarType>::magnitudeType normOne() const;
+
+  //! Returns the Infinity-norm of the matrix.
+  typename ScalarTraits<ScalarType>::magnitudeType normInf() const;
+
+  //! Returns the Frobenius-norm of the matrix.
+  typename ScalarTraits<ScalarType>::magnitudeType normFrobenius() const;
+  //@}
+
+  //! @name I/O methods.
+  //@{
+  //! Print method.  Defines the behavior of the std::ostream << operator
+  virtual std::ostream& print(std::ostream& os) const;
+
+  //@}
+
+ protected:
+
+  // In-place scaling of matrix.
+  void scale( const ScalarType alpha );
+
+  // Copy the values from one matrix to the other.
+  void copyMat(bool inputUpper, ScalarType* inputMatrix, OrdinalType inputStride,
+               OrdinalType numRowCols, bool outputUpper, ScalarType* outputMatrix,
+               OrdinalType outputStride, OrdinalType startRowCol,
+               ScalarType alpha = ScalarTraits<ScalarType>::zero() );
+
+  // Copy the values from the active triangle of the matrix to the other to make the matrix full symmetric.
+  void copyUPLOMat(bool inputUpper, ScalarType* inputMatrix,
+                   OrdinalType inputStride, OrdinalType inputRows);
+
+  void deleteArrays();
+  void checkIndex( OrdinalType rowIndex, OrdinalType colIndex = 0 ) const;
+
+  static ScalarType*
+  allocateValues(const OrdinalType numRows,
+                 const OrdinalType numCols)
+  {
+    const size_t size = size_t(numRows) * size_t(numCols);
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+    return new ScalarType[size];
+#pragma GCC diagnostic pop
+  }
+
+  OrdinalType numRowCols_ = 0;
+  OrdinalType stride_ = 0;
+  bool valuesCopied_ = false;
+  ScalarType* values_ = nullptr;
+  bool upper_ = false;
+  char UPLO_ {'L'};
+};
+
+//----------------------------------------------------------------------------------------------------
+//  Constructors and Destructor
+//----------------------------------------------------------------------------------------------------
+
+template<typename OrdinalType, typename ScalarType>
+SerialSymDenseMatrix<OrdinalType, ScalarType>::SerialSymDenseMatrix(OrdinalType numRowCols_in, bool zeroOut)
+  : numRowCols_(numRowCols_in), stride_(numRowCols_in), valuesCopied_(false), upper_(false), UPLO_('L')
+{
+  values_ = allocateValues(stride_, numRowCols_);
+  valuesCopied_ = true;
+  if (zeroOut) {
+    const ScalarType ZERO = Teuchos::ScalarTraits<ScalarType>::zero();
+    putScalar(ZERO, true);
+  }
+}
+
+template<typename OrdinalType, typename ScalarType>
+SerialSymDenseMatrix<OrdinalType, ScalarType>::SerialSymDenseMatrix(
+  DataAccess CV, bool upper_in, ScalarType* values_in, OrdinalType stride_in, OrdinalType numRowCols_in
+  )
+  : numRowCols_(numRowCols_in), stride_(stride_in), valuesCopied_(false),
+    values_(values_in), upper_(upper_in)
+{
+  if (upper_)
+    UPLO_ = 'U';
+  else
+    UPLO_ = 'L';
+
+  if(CV == Copy)
+  {
+    stride_ = numRowCols_;
+    values_ = allocateValues(stride_, numRowCols_);
+    copyMat(upper_in, values_in, stride_in, numRowCols_, upper_, values_, stride_, 0);
+    valuesCopied_ = true;
+  }
+}
+
+template<typename OrdinalType, typename ScalarType>
+SerialSymDenseMatrix<OrdinalType, ScalarType>::SerialSymDenseMatrix(const SerialSymDenseMatrix<OrdinalType, ScalarType> &Source)
+  : numRowCols_(Source.numRowCols_), stride_(0), valuesCopied_(true),
+    values_(0), upper_(Source.upper_), UPLO_(Source.UPLO_)
+{
+  if (!Source.valuesCopied_)
+  {
+    stride_ = Source.stride_;
+    values_ = Source.values_;
+    valuesCopied_ = false;
+  }
+  else
+  {
+    stride_ = numRowCols_;
+    if(stride_ > 0 && numRowCols_ > 0) {
+      values_ = allocateValues(stride_, numRowCols_);
+      copyMat(Source.upper_, Source.values_, Source.stride_, numRowCols_, upper_, values_, stride_, 0);
+    }
+    else {
+      numRowCols_ = 0;
+      stride_ = 0;
+      valuesCopied_ = false;
+    }
+  }
+}
+
+template<typename OrdinalType, typename ScalarType>
+SerialSymDenseMatrix<OrdinalType, ScalarType>::SerialSymDenseMatrix(
+                                                                    DataAccess CV, const SerialSymDenseMatrix<OrdinalType,
+                                                                    ScalarType> &Source, OrdinalType numRowCols_in, OrdinalType startRowCol )
+  : CompObject(), BLAS<OrdinalType,ScalarType>(),
+    numRowCols_(numRowCols_in), stride_(Source.stride_), valuesCopied_(false), upper_(Source.upper_), UPLO_(Source.UPLO_)
+{
+  if(CV == Copy)
+  {
+    stride_ = numRowCols_in;
+    values_ = allocateValues(stride_, numRowCols_in);
+    copyMat(Source.upper_, Source.values_, Source.stride_, numRowCols_in, upper_, values_, stride_, startRowCol);
+    valuesCopied_ = true;
+  }
+  else // CV == View
+  {
+    values_ = Source.values_ + (stride_ * startRowCol) + startRowCol;
+  }
+}
+
+template<typename OrdinalType, typename ScalarType>
+SerialSymDenseMatrix<OrdinalType, ScalarType>::~SerialSymDenseMatrix()
+{
+  deleteArrays();
+}
+
+//----------------------------------------------------------------------------------------------------
+//  Shape methods
+//----------------------------------------------------------------------------------------------------
+
+template<typename OrdinalType, typename ScalarType>
+int SerialSymDenseMatrix<OrdinalType, ScalarType>::shape( OrdinalType numRowCols_in )
+{
+  deleteArrays(); // Get rid of anything that might be already allocated
+  numRowCols_ = numRowCols_in;
+  stride_ = numRowCols_;
+  values_ = allocateValues(stride_, numRowCols_);
+  putScalar( Teuchos::ScalarTraits<ScalarType>::zero(), true );
+  valuesCopied_ = true;
+  return(0);
+}
+
+template<typename OrdinalType, typename ScalarType>
+int SerialSymDenseMatrix<OrdinalType, ScalarType>::shapeUninitialized( OrdinalType numRowCols_in )
+{
+  deleteArrays(); // Get rid of anything that might be already allocated
+  numRowCols_ = numRowCols_in;
+  stride_ = numRowCols_;
+  values_ = allocateValues(stride_, numRowCols_);
+  valuesCopied_ = true;
+  return(0);
+}
+
+template<typename OrdinalType, typename ScalarType>
+int SerialSymDenseMatrix<OrdinalType, ScalarType>::reshape( OrdinalType numRowCols_in )
+{
+  // Allocate space for new matrix
+  ScalarType* values_tmp = allocateValues(numRowCols_in, numRowCols_in);
+  const ScalarType zero = Teuchos::ScalarTraits<ScalarType>::zero();
+  for(OrdinalType k = 0; k < numRowCols_in * numRowCols_in; k++)
+  {
+    values_tmp[k] = zero;
+  }
+  OrdinalType numRowCols_tmp = TEUCHOS_MIN(numRowCols_, numRowCols_in);
+  if(values_ != 0)
+  {
+    copyMat(upper_, values_, stride_, numRowCols_tmp, upper_, values_tmp, numRowCols_in, 0); // Copy principal submatrix of A to new A
+  }
+  deleteArrays(); // Get rid of anything that might be already allocated
+  numRowCols_ = numRowCols_in;
+  stride_ = numRowCols_;
+  values_ = values_tmp; // Set pointer to new A
+  valuesCopied_ = true;
+  return(0);
+}
+
+//----------------------------------------------------------------------------------------------------
+//   Set methods
+//----------------------------------------------------------------------------------------------------
+
+template<typename OrdinalType, typename ScalarType>
+void SerialSymDenseMatrix<OrdinalType, ScalarType>::setLower()
+{
+  // Do nothing if the matrix is already a lower triangular matrix
+  if (upper_ != false) {
+    copyUPLOMat( true, values_, stride_, numRowCols_ );
+    upper_ = false;
+    UPLO_ = 'L';
+  }
+}
+
+template<typename OrdinalType, typename ScalarType>
+void SerialSymDenseMatrix<OrdinalType, ScalarType>::setUpper()
+{
+  // Do nothing if the matrix is already an upper triangular matrix
+  if (upper_ == false) {
+    copyUPLOMat( false, values_, stride_, numRowCols_ );
+    upper_ = true;
+    UPLO_ = 'U';
+  }
+}
+
+template<typename OrdinalType, typename ScalarType>
+int SerialSymDenseMatrix<OrdinalType, ScalarType>::putScalar( const ScalarType value_in, bool fullMatrix )
+{
+  // Set each value of the dense matrix to "value".
+  if (fullMatrix == true) {
+    for(OrdinalType j = 0; j < numRowCols_; j++)
+      {
+        for(OrdinalType i = 0; i < numRowCols_; i++)
+          {
+            values_[i + j*stride_] = value_in;
+          }
+      }
+  }
+  // Set the active upper or lower triangular part of the matrix to "value"
+  else {
+    if (upper_) {
+      for(OrdinalType j = 0; j < numRowCols_; j++) {
+        for(OrdinalType i = 0; i <= j; i++) {
+          values_[i + j*stride_] = value_in;
+        }
+      }
+    }
+    else {
+      for(OrdinalType j = 0; j < numRowCols_; j++) {
+        for(OrdinalType i = j; i < numRowCols_; i++) {
+          values_[i + j*stride_] = value_in;
+        }
+      }
+    }
+  }
+  return 0;
+}
+
+template<typename OrdinalType, typename ScalarType> void
+SerialSymDenseMatrix<OrdinalType, ScalarType>::swap(
+  SerialSymDenseMatrix<OrdinalType, ScalarType> &B)
+{
+  std::swap(values_ ,      B.values_);
+  std::swap(numRowCols_,   B.numRowCols_);
+  std::swap(stride_,       B.stride_);
+  std::swap(valuesCopied_, B.valuesCopied_);
+  std::swap(upper_,        B.upper_);
+  std::swap(UPLO_,         B.UPLO_);
+}
+
+template<typename OrdinalType, typename ScalarType>
+int SerialSymDenseMatrix<OrdinalType, ScalarType>::random( const ScalarType bias )
+{
+  typedef typename Teuchos::ScalarTraits<ScalarType>::magnitudeType MT;
+
+  // Set each value of the dense matrix to a random value.
+  std::vector<MT> diagSum( numRowCols_, Teuchos::ScalarTraits<MT>::zero() );
+  if (upper_) {
+    for(OrdinalType j = 0; j < numRowCols_; j++) {
+      for(OrdinalType i = 0; i < j; i++) {
+        values_[i + j*stride_] = ScalarTraits<ScalarType>::random();
+        diagSum[i] += Teuchos::ScalarTraits<ScalarType>::magnitude( values_[i + j*stride_] );
+        diagSum[j] += Teuchos::ScalarTraits<ScalarType>::magnitude( values_[i + j*stride_] );
+      }
+    }
+  }
+  else {
+    for(OrdinalType j = 0; j < numRowCols_; j++) {
+      for(OrdinalType i = j+1; i < numRowCols_; i++) {
+        values_[i + j*stride_] = ScalarTraits<ScalarType>::random();
+        diagSum[i] += Teuchos::ScalarTraits<ScalarType>::magnitude( values_[i + j*stride_] );
+        diagSum[j] += Teuchos::ScalarTraits<ScalarType>::magnitude( values_[i + j*stride_] );
+      }
+    }
+  }
+
+  // Set the diagonal.
+  for(OrdinalType i = 0; i < numRowCols_; i++) {
+    values_[i + i*stride_] = diagSum[i] + bias;
+  }
+  return 0;
+}
+
+template<typename OrdinalType, typename ScalarType>
+SerialSymDenseMatrix<OrdinalType,ScalarType>&
+SerialSymDenseMatrix<OrdinalType, ScalarType>::operator=( const SerialSymDenseMatrix<OrdinalType,ScalarType>& Source )
+{
+  if(this == &Source)
+    return(*this); // Special case of source same as target
+  if((!valuesCopied_) && (!Source.valuesCopied_) && (values_ == Source.values_)) {
+    upper_ = Source.upper_;  // Might have to change the active part of the matrix.
+    return(*this); // Special case of both are views to same data.
+  }
+
+  // If the source is a view then we will return a view, else we will return a copy.
+  if (!Source.valuesCopied_) {
+    if(valuesCopied_)       {
+      // Clean up stored data if this was previously a copy.
+      deleteArrays();
+    }
+    numRowCols_ = Source.numRowCols_;
+    stride_ = Source.stride_;
+    values_ = Source.values_;
+    upper_ = Source.upper_;
+    UPLO_ = Source.UPLO_;
+  }
+  else {
+    // If we were a view, we will now be a copy.
+    if(!valuesCopied_) {
+      numRowCols_ = Source.numRowCols_;
+      stride_ = Source.numRowCols_;
+      upper_ = Source.upper_;
+      UPLO_ = Source.UPLO_;
+      if(stride_ > 0 && numRowCols_ > 0) {
+        values_ = allocateValues(stride_, numRowCols_);
+        valuesCopied_ = true;
+      }
+      else {
+        values_ = 0;
+      }
+    }
+    // If we were a copy, we will stay a copy.
+    else {
+      if((Source.numRowCols_ <= stride_) && (Source.numRowCols_ == numRowCols_)) { // we don't need to reallocate
+        numRowCols_ = Source.numRowCols_;
+        upper_ = Source.upper_;
+        UPLO_ = Source.UPLO_;
+      }
+      else { // we need to allocate more space (or less space)
+        deleteArrays();
+        numRowCols_ = Source.numRowCols_;
+        stride_ = Source.numRowCols_;
+        upper_ = Source.upper_;
+        UPLO_ = Source.UPLO_;
+        if(stride_ > 0 && numRowCols_ > 0) {
+          values_ = allocateValues(stride_, numRowCols_);
+          valuesCopied_ = true;
+        }
+      }
+    }
+    copyMat(Source.upper_, Source.values_, Source.stride_, Source.numRowCols_, upper_, values_, stride_, 0);
+  }
+  return(*this);
+}
+
+template<typename OrdinalType, typename ScalarType>
+SerialSymDenseMatrix<OrdinalType, ScalarType>& SerialSymDenseMatrix<OrdinalType, ScalarType>::operator*= (const ScalarType alpha)
+{
+  this->scale(alpha);
+  return(*this);
+}
+
+template<typename OrdinalType, typename ScalarType>
+SerialSymDenseMatrix<OrdinalType, ScalarType>& SerialSymDenseMatrix<OrdinalType, ScalarType>::operator+= (const SerialSymDenseMatrix<OrdinalType,ScalarType>& Source )
+{
+  // Check for compatible dimensions
+  if ((numRowCols_ != Source.numRowCols_))
+    {
+      TEUCHOS_CHK_REF(*this); // Return *this without altering it.
+    }
+  copyMat(Source.upper_, Source.values_, Source.stride_, numRowCols_, upper_, values_, stride_, 0, ScalarTraits<ScalarType>::one());
+  return(*this);
+}
+
+template<typename OrdinalType, typename ScalarType>
+SerialSymDenseMatrix<OrdinalType, ScalarType>& SerialSymDenseMatrix<OrdinalType, ScalarType>::operator-= (const SerialSymDenseMatrix<OrdinalType,ScalarType>& Source )
+{
+  // Check for compatible dimensions
+  if ((numRowCols_ != Source.numRowCols_))
+  {
+    TEUCHOS_CHK_REF(*this); // Return *this without altering it.
+  }
+  copyMat(Source.upper_, Source.values_, Source.stride_, numRowCols_, upper_, values_, stride_, 0, -ScalarTraits<ScalarType>::one());
+  return(*this);
+}
+
+template<typename OrdinalType, typename ScalarType>
+SerialSymDenseMatrix<OrdinalType,ScalarType>& SerialSymDenseMatrix<OrdinalType, ScalarType>::assign (const SerialSymDenseMatrix<OrdinalType,ScalarType>& Source) {
+  if(this == &Source)
+    return(*this); // Special case of source same as target
+  if((!valuesCopied_) && (!Source.valuesCopied_) && (values_ == Source.values_)) {
+    upper_ = Source.upper_; // We may have to change the active part of the matrix.
+    return(*this); // Special case of both are views to same data.
+  }
+
+  // Check for compatible dimensions
+  if ((numRowCols_ != Source.numRowCols_))
+  {
+    TEUCHOS_CHK_REF(*this); // Return *this without altering it.
+  }
+  copyMat(Source.upper_, Source.values_, Source.stride_, numRowCols_, upper_, values_, stride_, 0 );
+  return(*this);
+}
+
+//----------------------------------------------------------------------------------------------------
+//   Accessor methods
+//----------------------------------------------------------------------------------------------------
+
+template<typename OrdinalType, typename ScalarType>
+inline ScalarType& SerialSymDenseMatrix<OrdinalType, ScalarType>::operator () (OrdinalType rowIndex, OrdinalType colIndex)
+{
+#ifdef HAVE_TEUCHOS_ARRAY_BOUNDSCHECK
+  checkIndex( rowIndex, colIndex );
+#endif
+  if ( rowIndex <= colIndex ) {
+    // Accessing upper triangular part of matrix
+    if (upper_)
+      return(values_[colIndex * stride_ + rowIndex]);
+    else
+      return(values_[rowIndex * stride_ + colIndex]);
+  }
+  else {
+    // Accessing lower triangular part of matrix
+    if (upper_)
+      return(values_[rowIndex * stride_ + colIndex]);
+    else
+      return(values_[colIndex * stride_ + rowIndex]);
+  }
+}
+
+template<typename OrdinalType, typename ScalarType>
+inline const ScalarType& SerialSymDenseMatrix<OrdinalType, ScalarType>::operator () (OrdinalType rowIndex, OrdinalType colIndex) const
+{
+#ifdef HAVE_TEUCHOS_ARRAY_BOUNDSCHECK
+  checkIndex( rowIndex, colIndex );
+#endif
+  if ( rowIndex <= colIndex ) {
+    // Accessing upper triangular part of matrix
+    if (upper_)
+      return(values_[colIndex * stride_ + rowIndex]);
+    else
+      return(values_[rowIndex * stride_ + colIndex]);
+  }
+  else {
+    // Accessing lower triangular part of matrix
+    if (upper_)
+      return(values_[rowIndex * stride_ + colIndex]);
+    else
+      return(values_[colIndex * stride_ + rowIndex]);
+  }
+}
+
+//----------------------------------------------------------------------------------------------------
+//   Norm methods
+//----------------------------------------------------------------------------------------------------
+
+template<typename OrdinalType, typename ScalarType>
+typename ScalarTraits<ScalarType>::magnitudeType SerialSymDenseMatrix<OrdinalType, ScalarType>::normOne() const
+{
+  return(normInf());
+}
+
+template<typename OrdinalType, typename ScalarType>
+typename ScalarTraits<ScalarType>::magnitudeType SerialSymDenseMatrix<OrdinalType, ScalarType>::normInf() const
+{
+  typedef typename ScalarTraits<ScalarType>::magnitudeType MT;
+
+  OrdinalType i, j;
+
+  MT sum, anorm = ScalarTraits<MT>::zero();
+  ScalarType* ptr;
+
+  if (upper_) {
+    for (j=0; j<numRowCols_; j++) {
+      sum = ScalarTraits<MT>::zero();
+      ptr = values_ + j*stride_;
+      for (i=0; i<j; i++) {
+        sum += ScalarTraits<ScalarType>::magnitude( *ptr++ );
+      }
+      ptr = values_ + j + j*stride_;
+      for (i=j; i<numRowCols_; i++) {
+        sum += ScalarTraits<ScalarType>::magnitude( *ptr );
+        ptr += stride_;
+      }
+      anorm = TEUCHOS_MAX( anorm, sum );
+    }
+  }
+  else {
+    for (j=0; j<numRowCols_; j++) {
+      sum = ScalarTraits<MT>::zero();
+      ptr = values_ + j + j*stride_;
+      for (i=j; i<numRowCols_; i++) {
+        sum += ScalarTraits<ScalarType>::magnitude( *ptr++ );
+      }
+      ptr = values_ + j;
+      for (i=0; i<j; i++) {
+        sum += ScalarTraits<ScalarType>::magnitude( *ptr );
+        ptr += stride_;
+      }
+      anorm = TEUCHOS_MAX( anorm, sum );
+    }
+  }
+  return(anorm);
+}
+
+template<typename OrdinalType, typename ScalarType>
+typename ScalarTraits<ScalarType>::magnitudeType SerialSymDenseMatrix<OrdinalType, ScalarType>::normFrobenius() const
+{
+  typedef typename ScalarTraits<ScalarType>::magnitudeType MT;
+
+  OrdinalType i, j;
+
+  MT sum = ScalarTraits<MT>::zero(), anorm = ScalarTraits<MT>::zero();
+
+  if (upper_) {
+    for (j = 0; j < numRowCols_; j++) {
+      for (i = 0; i < j; i++) {
+        sum += ScalarTraits<ScalarType>::magnitude(2.0*values_[i+j*stride_]*values_[i+j*stride_]);
+      }
+      sum += ScalarTraits<ScalarType>::magnitude(values_[j + j*stride_]*values_[j + j*stride_]);
+    }
+  }
+  else {
+    for (j = 0; j < numRowCols_; j++) {
+      sum += ScalarTraits<ScalarType>::magnitude(values_[j + j*stride_]*values_[j + j*stride_]);
+      for (i = j+1; i < numRowCols_; i++) {
+        sum += ScalarTraits<ScalarType>::magnitude(2.0*values_[i+j*stride_]*values_[i+j*stride_]);
+      }
+    }
+  }
+  anorm = ScalarTraits<ScalarType>::magnitude(ScalarTraits<ScalarType>::squareroot(sum));
+  return(anorm);
+}
+
+//----------------------------------------------------------------------------------------------------
+//   Comparison methods
+//----------------------------------------------------------------------------------------------------
+
+template<typename OrdinalType, typename ScalarType>
+bool SerialSymDenseMatrix<OrdinalType, ScalarType>::operator== (const SerialSymDenseMatrix<OrdinalType, ScalarType> &Operand) const
+{
+  bool result = 1;
+  if((numRowCols_ != Operand.numRowCols_)) {
+    result = 0;
+  }
+  else {
+    OrdinalType i, j;
+    for(i = 0; i < numRowCols_; i++) {
+      for(j = 0; j < numRowCols_; j++) {
+        if((*this)(i, j) != Operand(i, j)) {
+          return 0;
+        }
+      }
+    }
+  }
+  return result;
+}
+
+template<typename OrdinalType, typename ScalarType>
+bool SerialSymDenseMatrix<OrdinalType, ScalarType>::operator!= (const SerialSymDenseMatrix<OrdinalType, ScalarType> &Operand) const
+{
+  return !((*this) == Operand);
+}
+
+//----------------------------------------------------------------------------------------------------
+//   Multiplication method
+//----------------------------------------------------------------------------------------------------
+
+template<typename OrdinalType, typename ScalarType>
+void SerialSymDenseMatrix<OrdinalType, ScalarType>::scale( const ScalarType alpha )
+{
+  OrdinalType i, j;
+  ScalarType* ptr;
+
+  if (upper_) {
+    for (j=0; j<numRowCols_; j++) {
+      ptr = values_ + j*stride_;
+      for (i=0; i<= j; i++) { *ptr = alpha * (*ptr); ptr++; }
+    }
+  }
+  else {
+    for (j=0; j<numRowCols_; j++) {
+      ptr = values_ + j*stride_ + j;
+      for (i=j; i<numRowCols_; i++) { *ptr = alpha * (*ptr); ptr++; }
+    }
+  }
+}
+
+/*
+template<typename OrdinalType, typename ScalarType>
+int SerialSymDenseMatrix<OrdinalType, ScalarType>::scale( const SerialSymDenseMatrix<OrdinalType,ScalarType>& A )
+{
+  OrdinalType i, j;
+  ScalarType* ptr;
+
+  // Check for compatible dimensions
+  if ((numRowCols_ != A.numRowCols_)) {
+    TEUCHOS_CHK_ERR(-1); // Return error
+  }
+
+  if (upper_) {
+    for (j=0; j<numRowCols_; j++) {
+      ptr = values_ + j*stride_;
+      for (i=0; i<= j; i++) { *ptr = A(i,j) * (*ptr); ptr++; }
+    }
+  }
+  else {
+    for (j=0; j<numRowCols_; j++) {
+      ptr = values_ + j*stride_;
+      for (i=j; i<numRowCols_; i++) { *ptr = A(i,j) * (*ptr); ptr++; }
+    }
+  }
+
+  return(0);
+}
+*/
+
+template<typename OrdinalType, typename ScalarType>
+std::ostream& SerialSymDenseMatrix<OrdinalType, ScalarType>::print(std::ostream& os) const
+{
+  os << std::endl;
+  if(valuesCopied_)
+    os << "Values_copied : yes" << std::endl;
+  else
+    os << "Values_copied : no" << std::endl;
+  os << "Rows / Columns : " << numRowCols_ << std::endl;
+  os << "LDA : " << stride_ << std::endl;
+  if (upper_)
+    os << "Storage: Upper" << std::endl;
+  else
+    os << "Storage: Lower" << std::endl;
+  if(numRowCols_ == 0) {
+    os << "(matrix is empty, no values to display)" << std::endl;
+  } else {
+    for(OrdinalType i = 0; i < numRowCols_; i++) {
+      for(OrdinalType j = 0; j < numRowCols_; j++){
+        os << (*this)(i,j) << " ";
+      }
+      os << std::endl;
+    }
+  }
+  return os;
+}
+
+//----------------------------------------------------------------------------------------------------
+//   Protected methods
+//----------------------------------------------------------------------------------------------------
+
+template<typename OrdinalType, typename ScalarType>
+inline void SerialSymDenseMatrix<OrdinalType, ScalarType>::checkIndex( OrdinalType rowIndex, OrdinalType colIndex ) const {
+  TEUCHOS_TEST_FOR_EXCEPTION(rowIndex < 0 || rowIndex >= numRowCols_, std::out_of_range,
+    "SerialSymDenseMatrix<T>::checkIndex: "
+    "Row index " << rowIndex << " out of range [0, "<< numRowCols_ << ")");
+  TEUCHOS_TEST_FOR_EXCEPTION(colIndex < 0 || colIndex >= numRowCols_, std::out_of_range,
+    "SerialSymDenseMatrix<T>::checkIndex: "
+    "Col index " << colIndex << " out of range [0, "<< numRowCols_ << ")");
+}
+
+template<typename OrdinalType, typename ScalarType>
+void SerialSymDenseMatrix<OrdinalType, ScalarType>::deleteArrays(void)
+{
+  if (valuesCopied_)
+  {
+    delete [] values_;
+    values_ = 0;
+    valuesCopied_ = false;
+  }
+}
+
+template<typename OrdinalType, typename ScalarType>
+void SerialSymDenseMatrix<OrdinalType, ScalarType>::copyMat(
+                                                            bool inputUpper, ScalarType* inputMatrix,
+                                                            OrdinalType inputStride, OrdinalType numRowCols_in,
+                                                            bool outputUpper, ScalarType* outputMatrix,
+                                                            OrdinalType outputStride, OrdinalType startRowCol,
+                                                            ScalarType alpha
+                                                            )
+{
+  OrdinalType i, j;
+  ScalarType* ptr1 = 0;
+  ScalarType* ptr2 = 0;
+
+  for (j = 0; j < numRowCols_in; j++) {
+    if (inputUpper == true) {
+      // The input matrix is upper triangular, start at the beginning of each column.
+      ptr2 = inputMatrix + (j + startRowCol) * inputStride + startRowCol;
+      if (outputUpper == true) {
+        // The output matrix matches the same pattern as the input matrix.
+        ptr1 = outputMatrix + j*outputStride;
+        if (alpha != Teuchos::ScalarTraits<ScalarType>::zero() ) {
+          for(i = 0; i <= j; i++) {
+            *ptr1++ += alpha*(*ptr2++);
+          }
+        } else {
+          for(i = 0; i <= j; i++) {
+            *ptr1++ = *ptr2++;
+          }
+        }
+      }
+      else {
+        // The output matrix has the opposite pattern as the input matrix.
+        // Copy down across rows of the output matrix, but down columns of the input matrix.
+        ptr1 = outputMatrix + j;
+        if (alpha != Teuchos::ScalarTraits<ScalarType>::zero() ) {
+          for(i = 0; i <= j; i++) {
+            *ptr1 += alpha*(*ptr2++);
+            ptr1 += outputStride;
+          }
+        } else {
+          for(i = 0; i <= j; i++) {
+            *ptr1 = *ptr2++;
+            ptr1 += outputStride;
+          }
+        }
+      }
+    }
+    else {
+      // The input matrix is lower triangular, start at the diagonal of each row.
+      ptr2 = inputMatrix + (startRowCol+j) * inputStride + startRowCol + j;
+      if (outputUpper == true) {
+        // The output matrix has the opposite pattern as the input matrix.
+        // Copy across rows of the output matrix, but down columns of the input matrix.
+        ptr1 = outputMatrix + j*outputStride + j;
+        if (alpha != Teuchos::ScalarTraits<ScalarType>::zero() ) {
+          for(i = j; i < numRowCols_in; i++) {
+            *ptr1 += alpha*(*ptr2++);
+            ptr1 += outputStride;
+          }
+        } else {
+          for(i = j; i < numRowCols_in; i++) {
+            *ptr1 = *ptr2++;
+            ptr1 += outputStride;
+          }
+        }
+      }
+      else {
+        // The output matrix matches the same pattern as the input matrix.
+        ptr1 = outputMatrix + j*outputStride + j;
+        if (alpha != Teuchos::ScalarTraits<ScalarType>::zero() ) {
+          for(i = j; i < numRowCols_in; i++) {
+            *ptr1++ += alpha*(*ptr2++);
+          }
+        } else {
+          for(i = j; i < numRowCols_in; i++) {
+            *ptr1++ = *ptr2++;
+          }
+        }
+      }
+    }
+  }
+}
+
+template<typename OrdinalType, typename ScalarType>
+void SerialSymDenseMatrix<OrdinalType, ScalarType>::copyUPLOMat(
+                                                                bool inputUpper, ScalarType* inputMatrix,
+                                                                OrdinalType inputStride, OrdinalType inputRows
+                                                                )
+{
+  OrdinalType i, j;
+  ScalarType * ptr1 = 0;
+  ScalarType * ptr2 = 0;
+
+  if (inputUpper) {
+    for (j=1; j<inputRows; j++) {
+      ptr1 = inputMatrix + j;
+      ptr2 = inputMatrix + j*inputStride;
+      for (i=0; i<j; i++) {
+        *ptr1 = *ptr2++;
+        ptr1+=inputStride;
+      }
+    }
+  }
+  else {
+    for (i=1; i<inputRows; i++) {
+      ptr1 = inputMatrix + i;
+      ptr2 = inputMatrix + i*inputStride;
+      for (j=0; j<i; j++) {
+        *ptr2++ = *ptr1;
+        ptr1+=inputStride;
+      }
+    }
+  }
+}
+
+/// \brief Ostream manipulator for SerialSymDenseMatrix
+template<typename OrdinalType, typename ScalarType>
+struct SerialSymDenseMatrixPrinter {
+public:
+  const SerialSymDenseMatrix<OrdinalType,ScalarType> &obj;
+  SerialSymDenseMatrixPrinter(
+        const SerialSymDenseMatrix<OrdinalType,ScalarType> &obj_in)
+      : obj(obj_in) {}
+};
+
+/// \brief Output SerialSymDenseMatrix object through its stream manipulator.
+template<typename OrdinalType, typename ScalarType>
+std::ostream&
+operator<<(std::ostream &out,
+           const SerialSymDenseMatrixPrinter<OrdinalType,ScalarType> printer)
+{
+  printer.obj.print(out);
+  return out;
+}
+
+/// \brief Return SerialSymDenseMatrix ostream manipulator Use as:
+template<typename OrdinalType, typename ScalarType>
+SerialSymDenseMatrixPrinter<OrdinalType,ScalarType>
+printMat(const SerialSymDenseMatrix<OrdinalType,ScalarType> &obj)
+{
+  return SerialSymDenseMatrixPrinter<OrdinalType,ScalarType>(obj);
+}
+
+
+} // namespace Teuchos
+
+#endif /* _TEUCHOS_SERIALSYMDENSEMATRIX_HPP_ */
diff --git a/packages/rol/src/compatibility/teuchos-lite/Teuchos_TestForException.cpp b/packages/rol/src/compatibility/teuchos-lite/Teuchos_TestForException.cpp
new file mode 100644
index 000000000000..ad072c026177
--- /dev/null
+++ b/packages/rol/src/compatibility/teuchos-lite/Teuchos_TestForException.cpp
@@ -0,0 +1,95 @@
+// @HEADER
+// *****************************************************************************
+//                    Teuchos: Common Tools Package
+//
+// Copyright 2004 NTESS and the Teuchos contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+#include "Teuchos_TestForException.hpp"
+//#include "Teuchos_GlobalMPISession.hpp"
+
+#include <iostream>
+
+//
+// ToDo: Make these functions thread-safe!
+//
+
+
+namespace {
+
+
+int throwNumber = 0;
+
+
+bool& loc_enableStackTrace()
+{
+  static bool static_enableStackTrace =
+#ifdef HAVE_TEUCHOS_DEFAULT_STACKTRACE
+    true
+#else
+    false
+#endif
+    ;
+  return static_enableStackTrace;
+}
+
+
+} // namespace
+
+
+void Teuchos::TestForException_incrThrowNumber()
+{
+  ++throwNumber;
+}
+
+
+int Teuchos::TestForException_getThrowNumber()
+{
+  return throwNumber;
+}
+
+
+void Teuchos::TestForException_break(const std::string &errorMsg, int throwNumber)
+{
+  (void)throwNumber; // Ignore unused arg
+  // Provide a statement to break on
+  size_t break_on_me;
+  break_on_me = errorMsg.length(); // Use errMsg to avoid compiler warning.
+  if (break_on_me) {} // Avoid warning
+  // Above is just some statement for the debugger to break on.  Note: now is
+  // a good time to examine the stack trace and look at the error message in
+  // 'errorMsg' to see what happened.  In GDB just type 'where' or you can go
+  // up by typing 'up' and moving up in the stack trace to see where you are
+  // and how you got to this point in the code where you are throwing this
+  // exception!  Typing in a 'p errorMsg' will show you what the error message
+  // is.  Also, you should consider adding a conditional break-point in this
+  // function based on a specific value of 'throwNumber' if the exception you
+  // want to examine is not the first exception thrown.
+}
+
+
+void Teuchos::TestForException_setEnableStacktrace(bool enableStrackTrace)
+{
+  loc_enableStackTrace() = enableStrackTrace;
+}
+
+
+bool Teuchos::TestForException_getEnableStacktrace()
+{
+  return loc_enableStackTrace();
+}
+
+void Teuchos::TestForTermination_terminate(const std::string &msg) {
+  std::ostringstream omsg;
+  //if (GlobalMPISession::getNProc() > 1) {
+  //  omsg << "p="<<GlobalMPISession::getRank()<<": ";
+  //}
+  omsg << msg << "\n";
+  std::cerr << omsg.str();
+  std::terminate();
+}
+// NOTE: The above usage of ostringstream is so that the output to std::cerr
+// is done as one string.  This should help to avoid jumbled output like is
+// occurring in tests that grep for this output (see #3163).
diff --git a/packages/rol/src/compatibility/teuchos-lite/Teuchos_TestForException.hpp b/packages/rol/src/compatibility/teuchos-lite/Teuchos_TestForException.hpp
new file mode 100644
index 000000000000..858332fd4c19
--- /dev/null
+++ b/packages/rol/src/compatibility/teuchos-lite/Teuchos_TestForException.hpp
@@ -0,0 +1,404 @@
+// @HEADER
+// ***********************************************************************
+//
+//                    Teuchos: Common Tools Package
+//                 Copyright (2004) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
+//
+// ***********************************************************************
+// @HEADER
+
+#ifndef TEUCHOS_TEST_FOR_EXCEPTION_H
+#define TEUCHOS_TEST_FOR_EXCEPTION_H
+
+/*! \file Teuchos_TestForException.hpp
+\brief Standard test and throw macros.
+*/
+
+#include "Teuchos_TypeNameTraits.hpp"
+#include "Teuchos_stacktrace.hpp"
+
+
+namespace Teuchos {
+
+
+/*! \defgroup TestForException_grp Utility code for throwing exceptions and setting breakpoints.
+\ingroup teuchos_language_support_grp
+*/
+
+/** \brief Increment the throw number.  \ingroup TestForException_grp */
+TEUCHOSCORE_LIB_DLL_EXPORT void TestForException_incrThrowNumber();
+
+/** \brief Increment the throw number.  \ingroup TestForException_grp */
+TEUCHOSCORE_LIB_DLL_EXPORT int TestForException_getThrowNumber();
+
+/** \brief The only purpose for this function is to set a breakpoint.
+    \ingroup TestForException_grp */
+TEUCHOSCORE_LIB_DLL_EXPORT void TestForException_break(const std::string &msg,
+  int throwNumber);
+
+/** \brief Set at runtime if stacktracing functionality is enabled when *
+    exceptions are thrown.  \ingroup TestForException_grp */
+TEUCHOSCORE_LIB_DLL_EXPORT void TestForException_setEnableStacktrace(bool enableStrackTrace);
+
+/** \brief Get at runtime if stacktracing functionality is enabled when
+ * exceptions are thrown. */
+TEUCHOSCORE_LIB_DLL_EXPORT bool TestForException_getEnableStacktrace();
+
+/** \brief Prints the message to std::cerr and calls std::terminate. */
+[[noreturn]] TEUCHOSCORE_LIB_DLL_EXPORT void TestForTermination_terminate(const std::string &msg);
+
+
+} // namespace Teuchos
+
+
+#ifdef HAVE_TEUCHOS_STACKTRACE
+#  define TEUCHOS_STORE_STACKTRACE() \
+  if (Teuchos::TestForException_getEnableStacktrace()) { \
+    Teuchos::store_stacktrace(); \
+  }
+#else
+#  define TEUCHOS_STORE_STACKTRACE()
+#endif
+
+
+/** \brief Macro for throwing an exception with breakpointing to ease debugging
+ *
+ * \param throw_exception_test [in] Test for when to throw the exception.
+ * This can and should be an expression that may mean something to the user.
+ * The text verbatim of this expression is included in the formed error
+ * string.
+ *
+ * \param Exception [in] This should be the name of an exception class.  The
+ * only requirement for this class is that it have a constructor that accepts
+ * an std::string object (as all of the standard exception classes do).
+ *
+ * \param msg [in] This is any expression that can be included in an output
+ * stream operation.  This is useful when buinding error messages on the fly.
+ * Note that the code in this argument only gets evaluated if
+ * <tt>throw_exception_test</tt> evaluates to <tt>true</tt> when an exception
+ * is throw.
+ *
+ * The way that this macro is intended to be used is to
+ * call it in the source code like a function.  For example,
+ * suppose that in a piece of code in the file <tt>my_source_file.cpp</tt>
+ * that the exception <tt>std::out_of_range</tt> is thrown if <tt>n > 100</tt>.
+ * To use the macro, the source code would contain (at line 225
+ * for instance):
+ \verbatim
+
+ TEUCHOS_TEST_FOR_EXCEPTION( n > 100, std::out_of_range,
+    "Error, n = " << n << is bad" );
+ \endverbatim
+ * When the program runs and with <tt>n = 125 > 100</tt> for instance,
+ * the <tt>std::out_of_range</tt> exception would be thrown with the
+ * error message:
+ \verbatim
+
+ /home/bob/project/src/my_source_file.cpp:225: n > 100: Error, n = 125 is bad
+ \endverbatim
+ *
+ * In order to debug this, simply open your debugger (gdb for instance),
+ * set a break point at <tt>my_soure_file.cpp:225</tt> and then set the condition
+ * to break for <tt>n > 100</tt> (e.g. in gdb the command
+ * is <tt>cond break_point_number n > 100</tt> and then run the
+ * program.  The program should stop a the point in the source file
+ * right where the exception will be thrown at but before the exception
+ * is thrown.  Try not to use expression for <tt>throw_exception_test</tt> that
+ * includes virtual function calls, etc. as most debuggers will not be able to check
+ * these types of conditions in order to stop at a breakpoint.  For example,
+ * instead of:
+ \verbatim
+
+ TEUCHOS_TEST_FOR_EXCEPTION( obj1->val() > obj2->val(), std::logic_error, "Oh no!" );
+ \endverbatim
+ * try:
+ \verbatim
+
+ double obj1_val = obj1->val(), obj2_val = obj2->val();
+ TEUCHOS_TEST_FOR_EXCEPTION( obj1_val > obj2_val, std::logic_error, "Oh no!" );
+ \endverbatim
+ * If the developer goes to the line in the source file that is contained
+ * in the error message of the exception thrown, he/she will see the
+ * underlying condition.
+ *
+ * As an alternative, you can set a breakpoint for any exception thrown by
+ * setting a breakpoint in the function <tt>ThrowException_break()</tt>.  If
+ * multiple exceptions are thrown, then set a conditional breakpoint to break
+ * on the exact exception using the <tt>throwNumber</tt>.  For example, if the
+ * throw number printed in the exception message is <tt>10</tt> then set the
+ * break point as (assuming this is the first breakpoint):
+ \verbatim
+ (gdb) b 'Teuchos::TestForException_break( [TAB] [ENTER]
+ (gdb) cond 1 thrownNumber==10
+ \endverbatim
+ * NOTE: This macro will only evaluate <tt>throw_exception_test</tt> once
+ * reguardless if the test fails and the exception is thrown or
+ * not. Therefore, it is safe to call a function with side-effects as the
+ * <tt>throw_exception_test</tt> argument.
+ *
+ * NOTE: This macro will result in creating a stacktrace snapshot in some
+ * cases (see the main doc page for details) and will be printed automatically
+ * when main() uses TEUCHOS_STANDARD_CATCH_STATEMENTS() to catch uncaught
+ * excpetions.
+ *
+ * \ingroup TestForException_grp
+ */
+#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg) \
+{ \
+  const bool throw_exception = (throw_exception_test); \
+  if(throw_exception) { \
+    Teuchos::TestForException_incrThrowNumber(); \
+    const int throwNumber = Teuchos::TestForException_getThrowNumber(); \
+    std::ostringstream omsg; \
+    omsg \
+      << __FILE__ << ":" << __LINE__ << ":\n\n" \
+      << "Throw number = " << throwNumber \
+      << "\n\n" \
+      << "Throw test that evaluated to true: "#throw_exception_test \
+      << "\n\n" \
+      << msg; \
+    const std::string &omsgstr = omsg.str(); \
+    TEUCHOS_STORE_STACKTRACE(); \
+    Teuchos::TestForException_break(omsgstr, throwNumber); \
+    throw Exception(omsgstr); \
+  } \
+}
+
+
+/** \brief Macro for throwing an exception from within a class method with
+ * breakpointing to ease debugging.
+ *
+ * \param throw_exception_test [in] Test for when to throw the exception.
+ * This can and should be an expression that may mean something to the user.
+ * The text verbatim of this expression is included in the formed error
+ * string.
+ *
+ * \param Exception [in] This should be the name of an exception class.  The
+ * only requirement for this class is that it have a constructor that accepts
+ * an std::string object (as all of the standard exception classes do).
+ *
+ * \param msg [in] This is any expression that can be included in an output
+ * stream operation.  This is useful when buinding error messages on the fly.
+ * Note that the code in this argument only gets evaluated if
+ * <tt>throw_exception_test</tt> evaluates to <tt>true</tt> when an exception
+ * is throw.
+ *
+ * \param tfecfFuncName [implicit] This is a variable in the current scope that is
+ * required to exist and assumed to contain the name of the current class method.
+ *
+ * \param this [implicit] This is the variable (*this), used for printing the
+ * typename of the enclosing class.
+ *
+ * The way that this macro is intended to be used is to call it from a member
+ * of of a class. It is used similarly to TEUCHOS_TEST_FOR_EXCEPTION, except that it
+ * assumes that the (above) variables <tt>this</tt> and <tt>fecfFuncName</tt>
+ * exist and are properly defined. Example usage is:
+
+ \code
+
+   std::string tfecfFuncName("someMethod");
+    TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( test, std::runtime_error,
+      ": can't call this method in that way.");
+
+ \endcode
+
+ * See <tt>TEUCHOS_TEST_FOR_EXCEPTION()</tt> for more details.
+ *
+ * \ingroup TestForException_grp
+ */
+#define TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(throw_exception_test, Exception, msg) \
+{ \
+   TEUCHOS_TEST_FOR_EXCEPTION( (throw_exception_test), Exception, \
+   Teuchos::typeName(*this) << "::" << tfecfFuncName << msg ) \
+}
+
+
+/** \brief Macro for throwing an exception with breakpointing to ease debugging
+ *
+ * This macro is equivalent to the <tt>TEUCHOS_TEST_FOR_EXCEPTION()</tt> macro except
+ * the file name, line number, and test condition are not printed.
+ *
+ * \ingroup TestForException_grp
+ */
+#define TEUCHOS_TEST_FOR_EXCEPTION_PURE_MSG(throw_exception_test, Exception, msg) \
+{ \
+  const bool throw_exception = (throw_exception_test); \
+  if(throw_exception) { \
+    Teuchos::TestForException_incrThrowNumber(); \
+    const int throwNumber = Teuchos::TestForException_getThrowNumber(); \
+    std::ostringstream omsg; \
+	  omsg << msg; \
+    omsg << "\n\nThrow number = " << throwNumber << "\n\n"; \
+    const std::string &omsgstr = omsg.str(); \
+    Teuchos::TestForException_break(omsgstr, throwNumber); \
+    TEUCHOS_STORE_STACKTRACE(); \
+    throw Exception(omsgstr); \
+  } \
+}
+
+
+/** \brief This macro is the same as <tt>TEUCHOS_TEST_FOR_EXCEPTION()</tt> except that the
+ * exception will be caught, the message printed, and then rethrown.
+ *
+ * \param throw_exception_test [in] See <tt>TEUCHOS_TEST_FOR_EXCEPTION()</tt>.
+ *
+ * \param Exception [in] See <tt>TEUCHOS_TEST_FOR_EXCEPTION()</tt>.
+ *
+ * \param msg [in] See <tt>TEUCHOS_TEST_FOR_EXCEPTION()</tt>.
+ *
+ * \param out_ptr [in] If <tt>out_ptr!=NULL</tt> then <tt>*out_ptr</tt> will
+ * receive a printout of a line of output that gives the exception type and
+ * the error message that is generated.
+ *
+ * See <tt>TEUCHOS_TEST_FOR_EXCEPTION()</tt> for more details.
+ *
+ * \ingroup TestForException_grp
+ */
+#define TEUCHOS_TEST_FOR_EXCEPTION_PRINT(throw_exception_test, Exception, msg, out_ptr) \
+try { \
+  TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg); \
+} \
+catch(const std::exception &except) { \
+  std::ostream *l_out_ptr = (out_ptr); \
+  if(l_out_ptr) { \
+    *l_out_ptr \
+      << "\nThrowing an std::exception of type \'"<<Teuchos::typeName(except) \
+      <<"\' with the error message: " \
+      << except.what(); \
+  } \
+  throw; \
+}
+
+
+/** \brief This macro is designed to be a short version of
+ * <tt>TEUCHOS_TEST_FOR_EXCEPTION()</tt> that is easier to call.
+ *
+ * \param throw_exception_test [in] Test for when to throw the exception.
+ * This can and should be an expression that may mean something to the user.
+ * The text verbatim of this expression is included in the formed error
+ * string.
+ *
+ * \note The exception thrown is <tt>std::logic_error</tt>.
+ *
+ * \ingroup TestForException_grp
+ */
+#define TEUCHOS_TEST_FOR_EXCEPT(throw_exception_test) \
+  TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, std::logic_error, "Error!")
+
+
+/** \brief This macro is designed to be a short version of
+ * <tt>TEUCHOS_TEST_FOR_EXCEPTION()</tt> that is easier to call.
+ *
+ * \param throw_exception_test [in] Test for when to throw the exception.
+ * This can and should be an expression that may mean something to the user.
+ * The text verbatim of this expression is included in the formed error
+ * string.
+ *
+ * \param msg [in] The error message.
+ *
+ * \note The exception thrown is <tt>std::logic_error</tt>.
+ *
+ * See <tt>TEUCHOS_TEST_FOR_EXCEPTION()</tt> for more details.
+ *
+ * \ingroup TestForException_grp
+ */
+#define TEUCHOS_TEST_FOR_EXCEPT_MSG(throw_exception_test, msg) \
+  TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, std::logic_error, msg)
+
+
+/** \brief This macro is the same as <tt>TEUCHOS_TEST_FOR_EXCEPT()</tt> except that the
+ * exception will be caught, the message printed, and then rethrown.
+ *
+ * \param throw_exception_test [in] See <tt>TEUCHOS_TEST_FOR_EXCEPT()</tt>.
+ *
+ * \param out_ptr [in] If <tt>out_ptr!=NULL</tt> then <tt>*out_ptr</tt> will
+ * receive a printout of a line of output that gives the exception type and
+ * the error message that is generated.
+ *
+ * See <tt>TEUCHOS_TEST_FOR_EXCEPTION()</tt> for more details.
+ *
+ * \ingroup TestForException_grp
+ */
+#define TEUCHOS_TEST_FOR_EXCEPT_PRINT(throw_exception_test, out_ptr) \
+  TEUCHOS_TEST_FOR_EXCEPTION_PRINT(throw_exception_test, std::logic_error, "Error!", out_ptr)
+
+
+/** \brief This macro intercepts an exception, prints a standardized message
+ * including the current filename and line number, and then throws the
+ * exception up the stack.
+ *
+ * \param exc [in] the exception that has been caught
+ *
+ * \ingroup TestForException_grp
+ */
+#define TEUCHOS_TRACE(exc)\
+{ \
+  std::ostringstream omsg; \
+	omsg << exc.what() << std::endl \
+       << "caught in " << __FILE__ << ":" << __LINE__ << std::endl ; \
+  throw std::runtime_error(omsg.str()); \
+}
+
+/** \brief This macro is to be used instead of
+ * <tt>TEUCHOS_TEST_FOR_EXCEPTION()</tt> to report an error in situations
+ * where an exception can't be throw (like in an destructor).
+ *
+ * \param terminate_test [in] See <tt>TEUCHOS_TEST_FOR_EXCEPTION()</tt>.
+ *
+ * \param msg [in] See <tt>TEUCHOS_TEST_FOR_EXCEPTION()</tt>.
+ *
+ * If the termination test evaluates to <tt>true</tt>, then
+ * <tt>std::terminate()</tt> is called (which should bring down an entire
+ * multi-process MPI program even if only one process calls
+ * <tt>std::terminate()</tt> with most MPI implementations).
+ *
+ * \ingroup TestForException_grp
+ */
+#define TEUCHOS_TEST_FOR_TERMINATION(terminate_test, msg) \
+{ \
+  const bool call_terminate = (terminate_test); \
+  if (call_terminate) { \
+    std::ostringstream omsg; \
+    omsg \
+      << __FILE__ << ":" << __LINE__ << ":\n\n" \
+      << "Terminate test that evaluated to true: "#terminate_test \
+      << "\n\n" \
+      << msg << "\n\n"; \
+    auto str = omsg.str(); \
+    Teuchos::TestForTermination_terminate(str); \
+  } \
+}
+
+#endif // TEUCHOS_TEST_FOR_EXCEPTION_H
diff --git a/packages/rol/src/compatibility/teuchos-lite/Teuchos_TypeNameTraits.hpp b/packages/rol/src/compatibility/teuchos-lite/Teuchos_TypeNameTraits.hpp
new file mode 100644
index 000000000000..bb3e89f9b537
--- /dev/null
+++ b/packages/rol/src/compatibility/teuchos-lite/Teuchos_TypeNameTraits.hpp
@@ -0,0 +1,233 @@
+// @HEADER
+// ***********************************************************************
+//
+//                    Teuchos: Common Tools Package
+//                 Copyright (2004) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
+//
+// ***********************************************************************
+// @HEADER
+
+#ifndef _TEUCHOS_TYPE_NAME_TRAITS_HPP_
+#define _TEUCHOS_TYPE_NAME_TRAITS_HPP_
+
+/*! \file Teuchos_TypeNameTraits.hpp
+ \brief Defines basic traits returning the
+    name of a type in a portable and readable way.
+*/
+
+// mfh 30 Jan 2013: Thanks to Jim Willenbring for reporting this, and
+// to Mike Glass and Paul Lin for updating the fix for dealing with a
+// bug in IBM's XL C++ compiler.  The update was necessary due to a
+// relapse of the bug in a newer version of the compiler.
+//
+// If you don't have this update, you can fix the problem by defining
+// the macro TEUCHOS_TYPE_NAME_TRAITS_OLD_IBM when compiling anything
+// that includes this header file.  If you have the current version of
+// this file, then you don't need to do anything.
+#if defined(__IBMCPP__) && ( __IBMCPP__ < 900 || __IBMCPP__ == 1210 )
+# define TEUCHOS_TYPE_NAME_TRAITS_OLD_IBM
+#endif
+
+#include <typeinfo>
+
+#include "Teuchos_ConfigDefs.hpp"
+
+namespace  Teuchos {
+
+
+/** \brief Demangle a C++ name if valid.
+ *
+ * The name must have come from <tt>typeid(...).name()</tt> in order to be
+ * valid name to pass to this function.
+ *
+ * \ingroup teuchos_language_support_grp
+ */
+TEUCHOSCORE_LIB_DLL_EXPORT std::string demangleName( const std::string &mangledName );
+
+
+/** \brief Default traits class that just returns <tt>typeid(T).name()</tt>.
+ *
+ * \ingroup teuchos_language_support_grp
+ */
+template<typename T>
+class TypeNameTraits {
+public:
+  /** \brief . */
+  static std::string name()
+    {
+      return demangleName(typeid(T).name());
+    }
+  /** \brief . */
+#ifndef TEUCHOS_TYPE_NAME_TRAITS_OLD_IBM
+  static std::string concreteName( const T& t )
+#else
+  // the IBM compilers on AIX have a problem with const
+  static std::string concreteName( T t )
+#endif
+    {
+      return demangleName(typeid(t).name());
+    }
+};
+
+
+/** \brief Template function for returning the concrete type name of a
+ * passed-in object.
+ *
+ * Uses the traits class TypeNameTraits so the behavior of this function can
+ * be specialized in every possible way.  The default return value is
+ * typically derived from <tt>typeid(t).name()</tt>.
+ *
+ * \ingroup teuchos_language_support_grp
+ */
+template<typename T>
+std::string typeName( const T &t )
+{
+  typedef typename std::remove_const_t<T> ncT;
+#ifndef TEUCHOS_TYPE_NAME_TRAITS_OLD_IBM
+  return TypeNameTraits<ncT>::concreteName(t);
+#else
+  // You can't pass general objects to AIX by value as above.  This means that
+  // you will not get the concrete name printed on AIX but that is life on
+  // such compilers.
+  return TypeNameTraits<ncT>::name();
+#endif
+}
+
+
+/** \brief Template function for returning the type name of the actual
+ * concrete name of a passed-in object.
+ *
+ * Uses the traits class TypeNameTraits so the behavior of this function can
+ * be specialized in every possible way.
+ *
+ * \ingroup teuchos_language_support_grp
+ */
+template<typename T>
+std::string concreteTypeName( const T &t )
+{
+  typedef typename std::remove_const_t<T> ncT;
+  return TypeNameTraits<ncT>::concreteName(t);
+}
+
+
+#define TEUCHOS_TYPE_NAME_TRAITS_BUILTIN_TYPE_SPECIALIZATION(TYPE) \
+template<> \
+class TypeNameTraits<TYPE> { \
+public: \
+  static std::string name() { return (#TYPE); } \
+  static std::string concreteName(const TYPE&) { return name(); } \
+} \
+
+TEUCHOS_TYPE_NAME_TRAITS_BUILTIN_TYPE_SPECIALIZATION(bool);
+TEUCHOS_TYPE_NAME_TRAITS_BUILTIN_TYPE_SPECIALIZATION(char);
+TEUCHOS_TYPE_NAME_TRAITS_BUILTIN_TYPE_SPECIALIZATION(signed char);
+TEUCHOS_TYPE_NAME_TRAITS_BUILTIN_TYPE_SPECIALIZATION(unsigned char);
+TEUCHOS_TYPE_NAME_TRAITS_BUILTIN_TYPE_SPECIALIZATION(short int);
+TEUCHOS_TYPE_NAME_TRAITS_BUILTIN_TYPE_SPECIALIZATION(int);
+TEUCHOS_TYPE_NAME_TRAITS_BUILTIN_TYPE_SPECIALIZATION(long int);
+TEUCHOS_TYPE_NAME_TRAITS_BUILTIN_TYPE_SPECIALIZATION(unsigned short int);
+TEUCHOS_TYPE_NAME_TRAITS_BUILTIN_TYPE_SPECIALIZATION(unsigned int);
+TEUCHOS_TYPE_NAME_TRAITS_BUILTIN_TYPE_SPECIALIZATION(unsigned long int);
+TEUCHOS_TYPE_NAME_TRAITS_BUILTIN_TYPE_SPECIALIZATION(float);
+TEUCHOS_TYPE_NAME_TRAITS_BUILTIN_TYPE_SPECIALIZATION(double);
+
+#ifdef HAVE_TEUCHOSCORE_QUADMATH
+TEUCHOS_TYPE_NAME_TRAITS_BUILTIN_TYPE_SPECIALIZATION(__float128);
+#endif // HAVE_TEUCHOSCORE_QUADMATH
+
+#ifdef HAVE_TEUCHOS_LONG_DOUBLE
+TEUCHOS_TYPE_NAME_TRAITS_BUILTIN_TYPE_SPECIALIZATION(long double);
+#endif // HAVE_TEUCHOS_LONG_DOUBLE
+
+template<typename T>
+class TEUCHOSCORE_LIB_DLL_EXPORT TypeNameTraits<T*> {
+public:
+  typedef T* T_ptr;
+  static std::string name() { return TypeNameTraits<T>::name() + "*"; }
+  static std::string concreteName(T_ptr) { return name(); }
+};
+
+
+template<>
+class TEUCHOSCORE_LIB_DLL_EXPORT TypeNameTraits<std::string> {
+public:
+  static std::string name() { return "string"; }
+  static std::string concreteName(const std::string&)
+    { return name(); }
+};
+
+
+template<>
+class TEUCHOSCORE_LIB_DLL_EXPORT TypeNameTraits<void*> {
+public:
+  static std::string name() { return "void*"; }
+  static std::string concreteName(const std::string&) { return name(); }
+};
+
+// mfh 31 Jul 2012: Specialization for "void" will hopefully fix
+// compile errors on Windows, such as the following:
+//
+// http://testing.sandia.gov/cdash/viewBuildError.php?buildid=611137
+//
+// I'm imitating the specialization of void* above.
+template<>
+class TEUCHOSCORE_LIB_DLL_EXPORT TypeNameTraits<void> {
+public:
+  static std::string name() { return "void"; }
+  static std::string concreteName(const std::string&) { return name(); }
+};
+
+
+#ifdef HAVE_TEUCHOS_COMPLEX
+
+
+template<typename T>
+class TEUCHOSCORE_LIB_DLL_EXPORT TypeNameTraits<std::complex<T> > {
+public:
+  static std::string name()
+    { return "complex<"+TypeNameTraits<T>::name()+">"; }
+  static std::string concreteName(const std::complex<T>&)
+    { return name(); }
+};
+
+
+#endif // HAVE_TEUCHOS_COMPLEX
+
+
+
+} // namespace Teuchos
+
+
+#endif // _TEUCHOS_TYPE_NAME_TRAITS_HPP_
diff --git a/packages/rol/src/compatibility/teuchos-lite/Teuchos_config.h b/packages/rol/src/compatibility/teuchos-lite/Teuchos_config.h
new file mode 100644
index 000000000000..6dba0060ade0
--- /dev/null
+++ b/packages/rol/src/compatibility/teuchos-lite/Teuchos_config.h
@@ -0,0 +1,176 @@
+
+/* #undef TEUCHOS_STANDALONE_PACKAGE */
+
+/* #undef BUILD_SHARED_LIBS */
+
+#define HAVE_TEUCHOS_PARAMETERLIST
+
+/* Define the Fortran name mangling to be used for the BLAS */
+#ifndef F77_BLAS_MANGLE
+ #define F77_BLAS_MANGLE(name,NAME) name ## _
+#endif
+
+/* Define to dummy `main' function (if any) required to link to the Fortran
+   libraries. */
+#ifndef F77_DUMMY_MAIN
+/* #undef F77_DUMMY_MAIN */
+#endif
+
+/* Define to a macro mangling the given C identifier (in lower and upper
+   case), which must not contain underscores, for linking with Fortran. */
+#ifndef F77_FUNC
+ #define F77_FUNC(name,NAME) name ## _
+#endif
+
+/* As F77_FUNC, but for C identifiers containing underscores. */
+#ifndef F77_FUNC_
+ #define F77_FUNC_(name,NAME) name ## _
+#endif
+
+/* Define if F77 and FC dummy `main' functions are identical. */
+#ifndef FC_DUMMY_MAIN_EQ_F77
+/* #undef FC_DUMMY_MAIN_EQ_F77 */
+#endif
+
+/* Define to 1 if you have the <fpu_control.h> header file. */
+/* #undef HAVE_FPU_CONTROL_H */
+
+/* Define if the compiler supports abi::__cxa_demangle(...) */
+#define HAVE_GCC_ABI_DEMANGLE
+
+/* Define if the C++ compiler knows how to compile __attribute__((constructor)) */
+#define HAVE_TEUCHOS_CXX_ATTRIBUTE_CONSTRUCTOR
+
+/* Define if the C++ compiler knows how to compile __attribute__((weak)), and
+   if a program can test weak functions and call them if they are not NULL. */
+/* #undef HAVE_TEUCHOS_CXX_ATTRIBUTE_WEAK */
+
+/* Define if the C++ compiler knows how to compile "#pragma weak", and
+   if a program can test weak functions and call them if they are not NULL. */
+/* #undef HAVE_TEUCHOS_CXX_PRAGMA_WEAK */
+
+/* Define if building dynamic shared libraries (instead of static libraries) */
+/* #undef HAVE_TEUCHOS_DYNAMIC_LIBS */
+
+/* Define if the (Windows) compiler has intrinsic datatype __int64 */
+/* #undef HAVE_TEUCHOS___INT64 */
+
+/* Not namespaced so should be deprecated. */
+/* #undef HAVE_MPI */
+
+/* #undef HAVE_TEUCHOS_MPI */
+
+/* detected problems with the blas and solution methods */
+#define HAVE_TEUCHOS_BLASFLOAT
+/* #define HAVE_TEUCHOS_BLASFLOAT_APPLE_VECLIB_BUGFIX */
+/* #undef HAVE_TEUCHOS_BLASFLOAT_DOUBLE_RETURN */
+
+#define HAVE_SLAPY2_PROBLEM
+#define HAVE_SLAPY2_DOUBLE_RETURN
+
+/* #undef HAVE_COMPLEX_BLAS */
+/* #undef HAVE_COMPLEX_BLAS_PROBLEM */
+/* #undef HAVE_FIXABLE_COMPLEX_BLAS_PROBLEM */
+/* #undef HAVE_VECLIB_COMPLEX_BLAS */
+
+/* define if the compiler supports access of protected templated nested
+   classes in derived classes */
+/* #undef HAVE_PROTECTED_NESTED_TEMPLATE_CLASS_ACCESS */
+
+/* #undef HAVE_TEUCHOS_ARRAY_BOUNDSCHECK */
+
+/* #undef HAVE_TEUCHOS_THREAD_SAFE */
+
+/* #undef HAVE_TEUCHOS_LAPACKLARND */
+
+/* Deprecated */
+/* #undef HAVE_TEUCHOS_BOOST */
+
+/* Deprecated */
+/* #undef HAVE_TEUCHOS_QT */
+
+/* #undef HAVE_TEUCHOS_QD */
+
+/* #undef HAVE_TEUCHOSNUMERICS_EIGEN */
+
+/* #undef HAVE_TEUCHOS_DOUBLE_TO_QD */
+
+/* #undef HAVE_TEUCHOS_ARPREC */
+
+/* #undef HAVE_TEUCHOS_DOUBLE_TO_ARPREC */
+
+#define HAVE_TEUCHOS_ADD_TIME_MONITOR_TO_STACKED_TIMER
+
+/* #undef HAVE_TEUCHOS_COMM_TIMERS */
+
+/* #undef HAVE_TEUCHOS_FLOAT */
+
+/* #undef HAVE_TEUCHOS_LONG_DOUBLE */
+
+#define TEUCHOS_ORDINAL_TYPE ptrdiff_t
+
+/* #undef HAVE_TEUCHOS_COMPLEX */
+
+/* #undef HAVE_TEUCHOS_INST_FLOAT */
+
+/* #undef HAVE_TEUCHOS_INST_COMPLEX_FLOAT */
+
+/* #undef HAVE_TEUCHOS_INST_COMPLEX_DOUBLE */
+
+// This exists only for backwards compatibility.
+#define HAVE_TEUCHOS_LONG_LONG_INT 1
+
+/* #undef HAVE_TEUCHOS_DEBUG */
+
+/* #undef HAVE_TEUCHOS_DEBUG_RCP_NODE_TRACING */
+
+#define HAVE_TEUCHOS_DEMANGLE
+
+/* #undef HAVE_TEUCHOS_EXPAT */
+
+#define HAVE_TEUCHOS_EXPLICIT_INSTANTIATION
+
+#define HAVE_TEUCHOS_EXTENDED
+
+/* #undef HAVE_TEUCHOS_GNU_MP */
+
+/* #undef HAVE_TEUCHOS_LIBXML2 */
+
+/* #undef HAVE_TEUCHOS_C_EXCEPTIONS */
+
+/* #undef HAVE_TEUCHOS_LINK */
+
+/* #undef HAVE_TEUCHOS_BFD */
+
+/* #undef HAVE_TEUCHOS_STACKTRACE */
+
+/* #undef HAVE_TEUCHOS_DEFAULT_STACKTRACE */
+
+/* #undef HAVE_TEUCHOS_GLOBALLY_REDUCE_UNITTEST_RESULTS */
+
+/* #undef HAVE_TEUCHOS_TIME_MASSIF_SNAPSHOTS */
+
+/* #undef HAVE_TEUCHOS_KOKKOS_PROFILING */
+
+/* template qualifier required for calling template methods from non-template
+   code */
+#define INVALID_TEMPLATE_QUALIFIER 
+
+#ifndef TEUCHOS_DEPRECATED
+#  if (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
+#    define TEUCHOS_DEPRECATED  __attribute__((__deprecated__))
+#  else
+#    define TEUCHOS_DEPRECATED
+#  endif
+#endif
+
+#ifndef TEUCHOS_DEPRECATED_MSG
+#  if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5))
+#    define TEUCHOS_DEPRECATED_MSG(MSG)  __attribute__((__deprecated__ (#MSG) ))
+#  elif (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
+#    define TEUCHOS_DEPRECATED_MSG(MSG)  __attribute__((__deprecated__))
+#  else
+#    define TEUCHOS_DEPRECATED_MSG(MSG)
+#  endif
+#endif
+
diff --git a/packages/rol/src/compatibility/teuchos-lite/Teuchos_stacktrace.hpp b/packages/rol/src/compatibility/teuchos-lite/Teuchos_stacktrace.hpp
new file mode 100644
index 000000000000..ffc37a3b1f8b
--- /dev/null
+++ b/packages/rol/src/compatibility/teuchos-lite/Teuchos_stacktrace.hpp
@@ -0,0 +1,90 @@
+/*
+Copyright (c) 2010, Ondrej Certik
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+* Neither the name of the Sandia Corporation nor the names of its contributors
+  may be used to endorse or promote products derived from this software without
+  specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef TEUCHOS_STACKTRACE_HPP
+#define TEUCHOS_STACKTRACE_HPP
+
+/*! \file Teuchos_stacktrace.hpp
+
+\brief Functions for returning stacktrace info (GCC only initially).
+*/
+
+#include "Teuchos_ConfigDefs.hpp"
+
+
+#ifdef HAVE_TEUCHOS_STACKTRACE
+
+
+/*! \defgroup TeuchosStackTrace_grp Utility code for generating stacktraces.
+ *
+ * \ingroup teuchos_language_support_grp
+ */
+
+namespace Teuchos {
+
+
+/** \brief Stores the current stacktrace into an internal global variable.
+ *
+ * \ingroup TeuchosStackTrace_grp
+ */
+void store_stacktrace();
+
+/** \brief Returns the last stored stacktrace as a string.
+ *
+ * \ingroup TeuchosStackTrace_grp
+ */
+std::string get_stored_stacktrace();
+
+/** \brief Returns the current stacktrace as a string.
+ *
+ * \param impl_stacktrace_depth [in] The stacktrace depth to remove from the
+ * stacktrace printout to avoid showing users implementation functions in the
+ * stacktrace.
+ *
+ * \ingroup TeuchosStackTrace_grp
+ */
+std::string get_stacktrace(int impl_stacktrace_depth=0);
+
+/** \brief Prints the current stacktrace to stdout.
+ *
+ * \ingroup TeuchosStackTrace_grp
+ */
+void show_stacktrace();
+
+/** \brief Prints the current stacktrace to stdout on segfault.
+ *
+ * \ingroup TeuchosStackTrace_grp
+ */
+void print_stack_on_segfault();
+
+} // end namespace Teuchos
+
+#endif // HAVE_TEUCHOS_STACKTRACE
+
+#endif // TEUCHOS_STACKTRACE_HPP
+
diff --git a/packages/rol/src/compatibility/teuchos/mpi/ROL_GlobalMPISession.hpp b/packages/rol/src/compatibility/teuchos/mpi/ROL_GlobalMPISession.hpp
new file mode 100644
index 000000000000..f73d8e5ffa69
--- /dev/null
+++ b/packages/rol/src/compatibility/teuchos/mpi/ROL_GlobalMPISession.hpp
@@ -0,0 +1,23 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+#pragma once
+#ifndef ROL_GLOBAL_MPI_SESSION_HPP
+#define ROL_GLOBAL_MPI_SESSION_HPP
+
+#include "Teuchos_GlobalMPISession.hpp"
+
+namespace ROL {
+
+  using Teuchos::GlobalMPISession;
+
+} 
+
+#endif // ROL_GLOBAL_MPI_SESSION_HPP
+
diff --git a/packages/rol/src/compatibility/teuchos/parameterlist/ROL_ParameterList.hpp b/packages/rol/src/compatibility/teuchos/parameterlist/ROL_ParameterList.hpp
index 028615efdf85..6cd391905b84 100644
--- a/packages/rol/src/compatibility/teuchos/parameterlist/ROL_ParameterList.hpp
+++ b/packages/rol/src/compatibility/teuchos/parameterlist/ROL_ParameterList.hpp
@@ -8,50 +8,68 @@
 // @HEADER
 
 #pragma once
+
 #include "ROL_Ptr.hpp"
 #include "ROL_ParameterList.hpp"
 #include "Teuchos_XMLParameterListCoreHelpers.hpp"
+#include "Teuchos_YamlParameterListCoreHelpers.hpp"
 
 namespace ROL {
 using ParameterList = Teuchos::ParameterList;
 
-inline
-void readParametersFromXml( const std::string &filename,
-                            ParameterList& parlist ) {
-  Teuchos::Ptr<ParameterList> p{&parlist};
-  Teuchos::updateParametersFromXmlFile( filename, p );
-}
+  inline ROL::Ptr<ParameterList> getParametersFromXmlFile( const std::string& filename ) {
+    auto parlist = ROL::makePtr<ParameterList>();
+    Teuchos::Ptr<ParameterList> p{&*parlist};
+    Teuchos::updateParametersFromXmlFile( filename, p );
+    return parlist;
+  }
 
-  inline void writeParameterListToXmlFile( ParameterList& parlist,
-                                           const std::string& filename ) {
-    Teuchos::writeParameterListToXmlFile(parlist, filename);
+  inline ROL::Ptr<ParameterList> getParametersFromYamlFile( const std::string& filename ) {
+    auto parlist = ROL::makePtr<ParameterList>();
+    Teuchos::Ptr<ParameterList> p{&*parlist};
+    Teuchos::updateParametersFromYamlFile( filename, p );
+    return parlist;
   }
 
-  inline void updateParametersFromXmlFile(const std::string& filename ,
-                                          ParameterList& parlist) {
+  inline void readParametersFromXml( const std::string &filename,
+                                     ParameterList& parlist ) {
     Teuchos::Ptr<ParameterList> p{&parlist};
     Teuchos::updateParametersFromXmlFile( filename, p );
   }
 
+  inline void readParametersFromYaml( const std::string &filename,
+                                      ParameterList& parlist ) {
+    Teuchos::Ptr<ParameterList> p{&parlist};
+    Teuchos::updateParametersFromYamlFile( filename, p );
+  }
 
-
-inline
-ROL::Ptr<ParameterList> getParametersFromXmlFile( const std::string& filename )
-  {
-    auto parlist = ROL::makePtr<ParameterList>();
-    Teuchos::Ptr<ParameterList> p{&*parlist};
+  inline void updateParametersFromXmlFile( const std::string& filename,
+                                           ParameterList& parlist ) {
+    Teuchos::Ptr<ParameterList> p{&parlist};
     Teuchos::updateParametersFromXmlFile( filename, p );
-    return parlist;
   }
 
+  inline void updateParametersFromYamlFile( const std::string& filename,
+                                            ParameterList& parlist ) {
+    Teuchos::Ptr<ParameterList> p{&parlist};
+    Teuchos::updateParametersFromYamlFile( filename, p );
+  }
 
-template <class T>
-inline std::vector<T> getArrayFromStringParameter( const ParameterList& parlist,
-                                                  const std::string& name ) {
-  auto teuchos_arr = Teuchos::getArrayFromStringParameter<T>( parlist, name );
+  inline void writeParameterListToXmlFile( ParameterList& parlist,
+                                           const std::string& filename ) {
+    Teuchos::writeParameterListToXmlFile(parlist, filename);
+  }
 
-    return teuchos_arr.toVector();
-}
+  inline void writeParameterListToYamlFile( ParameterList& parlist,
+                                           const std::string& filename ) {
+    Teuchos::writeParameterListToYamlFile(parlist, filename);
+  }
 
+  template <class T>
+  inline std::vector<T> getArrayFromStringParameter( const ParameterList& parlist,
+                                                     const std::string& name ) {
+    auto teuchos_arr = Teuchos::getArrayFromStringParameter<T>( parlist, name );
+    return teuchos_arr.toVector();
+  }
 
 } // namespace ROL
diff --git a/packages/rol/src/function/std/ROL_StdLinearOperatorFactory.hpp b/packages/rol/src/function/std/ROL_StdLinearOperatorFactory.hpp
index cc20785314eb..3c4de824d2af 100644
--- a/packages/rol/src/function/std/ROL_StdLinearOperatorFactory.hpp
+++ b/packages/rol/src/function/std/ROL_StdLinearOperatorFactory.hpp
@@ -38,7 +38,7 @@ class StdLinearOperatorFactory {
 private:
 
 
-  Teuchos::BLAS<int,Real>     blas_;
+  ROL::BLAS<int,Real>     blas_;
   ROL::LAPACK<int,Real>   lapack_;
 
   // Fill x with uniformly-distributed random values from [lower,upper]
diff --git a/packages/rol/src/step/krylov/ROL_GMRES.hpp b/packages/rol/src/step/krylov/ROL_GMRES.hpp
index 74ba97ceb59c..7e924359a554 100644
--- a/packages/rol/src/step/krylov/ROL_GMRES.hpp
+++ b/packages/rol/src/step/krylov/ROL_GMRES.hpp
@@ -22,6 +22,48 @@
 
 namespace ROL {
 
+template<class Real>
+class VectorArray {
+private:
+  std::vector<Ptr<Vector<Real>>> data_;
+  unsigned size_;
+
+public:
+  VectorArray() : size_(0u) {}
+
+  void allocate(const Vector<Real> &x, unsigned numVec) {
+    data_.clear();
+    for (unsigned i = 0u; i < numVec; ++i)
+      data_.push_back(x.clone());
+    size_ = numVec;
+  }
+
+  const Ptr<Vector<Real>> get(const Vector<Real> &x, unsigned ind) {
+    if (ind < size_) {
+      return data_[ind];
+    }
+    else if (ind == size_) {
+      data_.push_back(x.clone());
+      size_++;
+      return data_[ind];
+    }
+    else {
+      throw Exception::NotImplemented(">>> GMRES : Index out of range!");
+      return nullPtr;
+    }
+  }
+
+  const Ptr<Vector<Real>> get(unsigned ind) {
+    if (size_ > 0u) {
+      return get(*data_[0],ind);
+    }
+    else {
+      throw Exception::NotImplemented(">>> GMRES : No vectors allocated!");
+      return nullPtr;
+    }
+  }
+};
+
 template<class Real>
 class GMRES : public Krylov<Real> {
 
@@ -30,30 +72,31 @@ class GMRES : public Krylov<Real> {
 
 private:
 
-  ROL::Ptr<Vector<Real> > r_;
-  ROL::Ptr<Vector<Real> > z_;
-  ROL::Ptr<Vector<Real> > w_;
+  Ptr<Vector<Real>> r_;
+  Ptr<Vector<Real>> z_;
+  Ptr<Vector<Real>> w_;
 
-  ROL::Ptr<SDMatrix> H_;      // quasi-Hessenberg matrix
-  ROL::Ptr<SDVector> cs_;     // Givens Rotations cosine components
-  ROL::Ptr<SDVector> sn_;     // Givens Rotations sine components
-  ROL::Ptr<SDVector> s_;
-  ROL::Ptr<SDVector> y_;
-  ROL::Ptr<SDVector> cnorm_;
+  Ptr<SDMatrix> H_;      // quasi-Hessenberg matrix
+  Ptr<SDVector> cs_;     // Givens Rotations cosine components
+  Ptr<SDVector> sn_;     // Givens Rotations sine components
+  Ptr<SDVector> s_;
+  Ptr<SDVector> y_;
+  Ptr<SDVector> cnorm_;
 
-  ROL::Ptr<std::vector<Real> > res_;
+  Ptr<std::vector<Real>> res_;
+  Ptr<VectorArray<Real>> V_, Z_;
 
   bool isInitialized_;
   bool useInexact_;
   bool useInitialGuess_;    // If false, inital x will be ignored and zero vec used
   bool printIters_;
-  ROL::Ptr<std::ostream> outStream_;
+  Ptr<std::ostream> outStream_;
 
-  ROL::LAPACK<int,Real> lapack_;
+  LAPACK<int,Real> lapack_;
 
 public:
 
-  GMRES( ROL::ParameterList &parlist ) : Krylov<Real>(parlist), isInitialized_(false), printIters_(false) {
+  GMRES( ParameterList &parlist ) : Krylov<Real>(parlist), isInitialized_(false), printIters_(false) {
 
     using std::vector;
 
@@ -66,14 +109,16 @@ class GMRES : public Krylov<Real> {
     useInitialGuess_ = kList.get("Use Initial Guess",false);
     int maxit = Krylov<Real>::getMaximumIteration();
 
-    H_     = ROL::makePtr<SDMatrix>( maxit+1, maxit );
-    cs_    = ROL::makePtr<SDVector>( maxit );
-    sn_    = ROL::makePtr<SDVector>( maxit );
-    s_     = ROL::makePtr<SDVector>( maxit+1 );
-    y_     = ROL::makePtr<SDVector>( maxit+1 );
-    cnorm_ = ROL::makePtr<SDVector>( maxit );
-    res_   = ROL::makePtr<std::vector<Real>>(maxit+1,zero);
+    H_     = makePtr<SDMatrix>( maxit+1, maxit );
+    cs_    = makePtr<SDVector>( maxit );
+    sn_    = makePtr<SDVector>( maxit );
+    s_     = makePtr<SDVector>( maxit+1 );
+    y_     = makePtr<SDVector>( maxit+1 );
+    cnorm_ = makePtr<SDVector>( maxit );
+    res_   = makePtr<std::vector<Real>>(maxit+1,zero);
 
+    V_ = makePtr<VectorArray<Real>>();
+    Z_ = makePtr<VectorArray<Real>>();
   }
 
   Real run( Vector<Real> &x, LinearOperator<Real> &A, const Vector<Real> &b,
@@ -92,6 +137,9 @@ class GMRES : public Krylov<Real> {
       w_  = b.clone();
       z_  = x.clone();
 
+      V_->allocate(b,std::min(20u,static_cast<unsigned>(maxit))+1u);
+      Z_->allocate(x,std::min(20u,static_cast<unsigned>(maxit)));
+
       isInitialized_ = true;
     }
 
@@ -112,14 +160,10 @@ class GMRES : public Krylov<Real> {
 
     Real temp  = 0;
 
-    std::vector<ROL::Ptr<Vector<Real > > > V;
-    std::vector<ROL::Ptr<Vector<Real > > > Z;
-
     (*res_)[0] = r_->norm();
 
-    if (printIters_) {
+    if (printIters_)
       *outStream_ << "GMRES Iteration " << 0 << ", Residual = " << (*res_)[0] << "\n";
-    }
 
     // This should be a tolerance check
     Real rtol = std::min(absTol,relTol*(*res_)[0]);
@@ -129,42 +173,36 @@ class GMRES : public Krylov<Real> {
       return (*res_)[0];
     }
 
-    V.push_back(b.clone());
-    (V[0])->set(*r_);
-    (V[0])->scale(one/(*res_)[0]);
+    V_->get(0)->set(*r_);
+    V_->get(0)->scale(one/(*res_)[0]);
 
     (*s_)(0) = (*res_)[0];
 
     for( iter=0; iter<maxit; ++iter ) {
 
-      if( useInexact_ ) {
-        itol = rtol/(maxit*(*res_)[iter]);
-      }
-
-      Z.push_back(x.clone());
+      if( useInexact_ ) itol = rtol/(maxit*(*res_)[iter]);
 
       // Apply right preconditioner
-      M.applyInverse(*(Z[iter]),*(V[iter]),itol);
+      M.applyInverse(*(Z_->get(iter)),*(V_->get(iter)),itol);
 
       // Apply operator
-      A.apply(*w_,*(Z[iter]),itol);
+      A.apply(*w_,*(Z_->get(iter)),itol);
 
       // Evaluate coefficients and orthogonalize using Gram-Schmidt
       for( int k=0; k<=iter; ++k ) {
-        (*H_)(k,iter) = w_->dot(*(V[k]));
-        w_->axpy( -(*H_)(k,iter), *(V[k]) );
+        (*H_)(k,iter) = w_->dot(*(V_->get(k)));
+        w_->axpy( -(*H_)(k,iter), *(V_->get(k)) );
       }
 
       (*H_)(iter+1,iter) = w_->norm();
 
-      V.push_back( b.clone() );
-      (V[iter+1])->set(*w_);
-      (V[iter+1])->scale(one/((*H_)(iter+1,iter)));
+      (V_->get(iter+1))->set(*w_);
+      (V_->get(iter+1))->scale(one/((*H_)(iter+1,iter)));
 
       // Apply Givens rotations
       for( int k=0; k<=iter-1; ++k ) {
         temp            =  (*cs_)(k)*(*H_)(k,iter) + (*sn_)(k)*(*H_)(k+1,iter);
-        (*H_)(k+1,iter) = -(*sn_)(k)*(*H_)(k,iter) + (*cs_)(k)*(*H_)(k+1,iter); 
+        (*H_)(k+1,iter) = -(*sn_)(k)*(*H_)(k,iter) + (*cs_)(k)*(*H_)(k+1,iter);
         (*H_)(k,iter)   = temp;
       }
 
@@ -209,7 +247,7 @@ class GMRES : public Krylov<Real> {
       z_->zero();
 
       for( int k=0; k<=iter;++k ) {
-        z_->axpy((*y_)(k),*(Z[k]));
+        z_->axpy((*y_)(k),*(Z_->get(k)));
       }
 
       if( (*res_)[iter+1] <= rtol ) {
diff --git a/packages/rol/src/vector/ROL_Vector.hpp b/packages/rol/src/vector/ROL_Vector.hpp
index d37b19529979..856379fffc61 100644
--- a/packages/rol/src/vector/ROL_Vector.hpp
+++ b/packages/rol/src/vector/ROL_Vector.hpp
@@ -13,7 +13,8 @@
 
 #define ROL_UNUSED(x) (void) x
 
-#include <ostream>
+#include <iostream>
+#include <iomanip>
 #include <vector>
 #include <algorithm>
 
@@ -37,7 +38,7 @@
     a duality pairing (in general Banach space).
 
     There are additional virtual member functions that can be
-    overloaded for computational efficiency. 
+    overloaded for computational efficiency.
 */
 
 namespace ROL {
@@ -45,8 +46,8 @@ namespace ROL {
 template <class Real>
 class Vector
 #ifdef ENABLE_PYROL
- : public std::enable_shared_from_this<Vector<Real>> 
-#endif 
+ : public std::enable_shared_from_this<Vector<Real>>
+#endif
 {
 public:
 
@@ -100,7 +101,7 @@ class Vector
 
              Provides the means of allocating temporary memory in ROL.
 
-             ---             
+             ---
   */
   virtual ROL::Ptr<Vector> clone() const = 0;
 
@@ -221,7 +222,7 @@ class Vector
   virtual Real reduce( const Elementwise::ReductionOp<Real> &r ) const {
     ROL_UNUSED(r);
     ROL_TEST_FOR_EXCEPTION( true, std::logic_error,
-      "The method reduce was called, but not implemented" << std::endl); 
+      "The method reduce was called, but not implemented" << std::endl);
   }
 
   virtual void print( std::ostream &outStream ) const {
diff --git a/packages/rol/src/zoo/ROL_HelperFunctions.hpp b/packages/rol/src/zoo/ROL_HelperFunctions.hpp
index 0dbdad547d3d..39977e86c07b 100644
--- a/packages/rol/src/zoo/ROL_HelperFunctions.hpp
+++ b/packages/rol/src/zoo/ROL_HelperFunctions.hpp
@@ -19,19 +19,18 @@
 #include "ROL_Objective.hpp"
 #include "ROL_BoundConstraint.hpp"
 #include "ROL_Secant.hpp"
-#include "Teuchos_SerialDenseMatrix.hpp"
-#include "Teuchos_SerialDenseVector.hpp"
-#include "Teuchos_LAPACK.hpp"
+#include "ROL_LinearAlgebra.hpp"
+#include "ROL_LAPACK.hpp"
 
 namespace ROL {
 
   template<class Real>
-  Teuchos::SerialDenseMatrix<int, Real> computeDenseHessian(Objective<Real> &obj, const Vector<Real> &x) {
+  ROL::LA::Matrix< Real> computeDenseHessian(Objective<Real> &obj, const Vector<Real> &x) {
 
     Real tol = std::sqrt(ROL_EPSILON<Real>());
 
     int dim = x.dimension();
-    Teuchos::SerialDenseMatrix<int, Real> H(dim, dim);
+    ROL::LA::Matrix< Real> H(dim, dim);
 
     ROL::Ptr<Vector<Real> > e = x.clone();
     ROL::Ptr<Vector<Real> > h = x.dual().clone();
@@ -52,12 +51,12 @@ namespace ROL {
 
 
   template<class Real>
-  Teuchos::SerialDenseMatrix<int, Real> computeScaledDenseHessian(Objective<Real> &obj, const Vector<Real> &x) {
+  ROL::LA::Matrix< Real> computeScaledDenseHessian(Objective<Real> &obj, const Vector<Real> &x) {
 
     Real tol = std::sqrt(ROL_EPSILON<Real>());
 
     int dim = x.dimension();
-    Teuchos::SerialDenseMatrix<int, Real> H(dim, dim);
+    ROL::LA::Matrix< Real> H(dim, dim);
 
     ROL::Ptr<Vector<Real> > ei = x.clone();
     ROL::Ptr<Vector<Real> > ej = x.dual().clone();
@@ -78,10 +77,10 @@ namespace ROL {
 
 
   template<class Real>
-  Teuchos::SerialDenseMatrix<int, Real> computeDotMatrix(const Vector<Real> &x) {
+  ROL::LA::Matrix< Real> computeDotMatrix(const Vector<Real> &x) {
 
     int dim = x.dimension();
-    Teuchos::SerialDenseMatrix<int, Real> M(dim, dim);
+    ROL::LA::Matrix< Real> M(dim, dim);
 
     ROL::Ptr<Vector<Real> > ei = x.clone();
     ROL::Ptr<Vector<Real> > ej = x.clone();
@@ -99,11 +98,11 @@ namespace ROL {
   }
 
   template<class Real>
-  std::vector<std::vector<Real> > computeEigenvalues(const Teuchos::SerialDenseMatrix<int, Real> & mat) {
+  std::vector<std::vector<Real> > computeEigenvalues(const ROL::LA::Matrix< Real> & mat) {
 
-    Teuchos::LAPACK<int, Real> lapack;
+    ROL::LAPACK<int, Real> lapack;
 
-    Teuchos::SerialDenseMatrix<int, Real> mymat(Teuchos::Copy, mat);
+    ROL::LA::Matrix< Real> mymat(mat);
 
     char jobvl = 'N';
     char jobvr = 'N';
@@ -137,13 +136,13 @@ namespace ROL {
 
 
   template<class Real>
-  std::vector<std::vector<Real> > computeGenEigenvalues(const Teuchos::SerialDenseMatrix<int, Real> & A,
-                                                        const Teuchos::SerialDenseMatrix<int, Real> & B) {
+  std::vector<std::vector<Real> > computeGenEigenvalues(const ROL::LA::Matrix< Real> & A,
+                                                        const ROL::LA::Matrix< Real> & B) {
 
-    Teuchos::LAPACK<int, Real> lapack;
+    ROL::LAPACK<int, Real> lapack;
 
-    Teuchos::SerialDenseMatrix<int, Real> myA(Teuchos::Copy, A);
-    Teuchos::SerialDenseMatrix<int, Real> myB(Teuchos::Copy, B);
+    ROL::LA::Matrix< Real> myA(A);
+    ROL::LA::Matrix< Real> myB(B);
 
     char jobvl = 'N';
     char jobvr = 'N';
@@ -184,11 +183,11 @@ namespace ROL {
 
 
   template<class Real>
-  Teuchos::SerialDenseMatrix<int, Real> computeInverse(const Teuchos::SerialDenseMatrix<int, Real> & mat) {
+  ROL::LA::Matrix< Real> computeInverse(const ROL::LA::Matrix< Real> & mat) {
 
-    Teuchos::LAPACK<int, Real> lapack;
+    ROL::LAPACK<int, Real> lapack;
 
-    Teuchos::SerialDenseMatrix<int, Real> mymat(Teuchos::Copy, mat);
+    ROL::LA::Matrix< Real> mymat(mat);
 
     int n = mat.numRows();
 
diff --git a/packages/rol/src/zoo/testproblems/ROL_ParaboloidCircle.hpp b/packages/rol/src/zoo/testproblems/ROL_ParaboloidCircle.hpp
index 56fe7807a95e..e6f032ccf9c4 100644
--- a/packages/rol/src/zoo/testproblems/ROL_ParaboloidCircle.hpp
+++ b/packages/rol/src/zoo/testproblems/ROL_ParaboloidCircle.hpp
@@ -21,8 +21,7 @@
 
 #include "ROL_TestProblem.hpp"
 #include "ROL_StdVector.hpp"
-#include "Teuchos_SerialDenseVector.hpp"
-#include "Teuchos_SerialDenseSolver.hpp"
+#include "ROL_LinearAlgebra.hpp"
 
 namespace ROL {
 namespace ZOO {
diff --git a/packages/rol/src/zoo/testproblems/ROL_PoissonInversion.hpp b/packages/rol/src/zoo/testproblems/ROL_PoissonInversion.hpp
index 94a8231f1921..70fe3ef72ebd 100644
--- a/packages/rol/src/zoo/testproblems/ROL_PoissonInversion.hpp
+++ b/packages/rol/src/zoo/testproblems/ROL_PoissonInversion.hpp
@@ -23,7 +23,8 @@
 #include "ROL_TestProblem.hpp"
 #include "ROL_HelperFunctions.hpp"
 
-#include "Teuchos_LAPACK.hpp"
+#include "ROL_LAPACK.hpp"
+#include "ROL_LinearAlgebra.hpp"
 
 namespace ROL {
 namespace ZOO {
@@ -214,7 +215,7 @@ class Objective_PoissonInversion : public Objective<Real> {
     }
 
     // Solve Tridiagonal System Using LAPACK's SPD Tridiagonal Solver
-    Teuchos::LAPACK<int,Real> lp;
+    ROL::LAPACK<int,Real> lp;
     int info;
     int ldb  = nu_;
     int nhrs = 1;
@@ -467,7 +468,7 @@ class Objective_PoissonInversion : public Objective<Real> {
 
 
     // Compute dense Hessian.
-    Teuchos::SerialDenseMatrix<int, Real> H(dim, dim);
+    ROL::LA::Matrix<Real> H(dim, dim);
     Objective_PoissonInversion<Real> & obj = *this; 
     H = computeDenseHessian<Real>(obj, x);
 
@@ -496,12 +497,19 @@ class Objective_PoissonInversion : public Objective<Real> {
     }
 
     // Compute dense inverse Hessian.
-    Teuchos::SerialDenseMatrix<int, Real> invH = computeInverse<Real>(H);
+    ROL::LA::Matrix<Real> invH = computeInverse<Real>(H);
 
-    // Apply dense inverse Hessian.
-    Teuchos::SerialDenseVector<int, Real> hv_teuchos(Teuchos::View, &((*hvp)[0]), dim);
-    const Teuchos::SerialDenseVector<int, Real> v_teuchos(Teuchos::View, &(vp[0]), dim);
-    hv_teuchos.multiply(Teuchos::NO_TRANS, Teuchos::NO_TRANS, 1.0, invH, v_teuchos, 0.0);
+    // Apply dense inverse Hessian using view constructors
+    ROL::LA::Vector<Real> hv_la(ROL::LA::View, &((*hvp)[0]), dim);
+    ROL::LA::Vector<Real> v_la(ROL::LA::View, &(vp[0]), dim);
+    
+    // Apply matrix-vector multiply: hv_la = invH * v_la
+    for (int i = 0; i < dim; i++) {
+      hv_la(i) = 0.0;
+      for (int j = 0; j < dim; j++) {
+        hv_la(i) += invH(i,j) * v_la(j);
+      }
+    }
   }
 
 };
@@ -541,7 +549,7 @@ class getPoissonInversion : public TestProblem<Real> {
         (*xp)[li] = std::log(k1);
       }
       else if ( pt >= 0.5 && pt < 1.0 ) {
-        (*xp)[li] = std::log(k2); 
+        (*xp)[li] = std::log(k2);
       }
     }
     return ROL::makePtr<StdVector<Real>>(xp);
diff --git a/packages/rol/src/zoo/testproblems/ROL_SimpleEqConstrained.hpp b/packages/rol/src/zoo/testproblems/ROL_SimpleEqConstrained.hpp
index 88a9639e55e8..209b1a7846f6 100644
--- a/packages/rol/src/zoo/testproblems/ROL_SimpleEqConstrained.hpp
+++ b/packages/rol/src/zoo/testproblems/ROL_SimpleEqConstrained.hpp
@@ -19,8 +19,7 @@
 
 #include "ROL_TestProblem.hpp"
 #include "ROL_StdVector.hpp"
-#include "Teuchos_SerialDenseVector.hpp"
-#include "Teuchos_SerialDenseSolver.hpp"
+#include "ROL_LinearAlgebra.hpp"
 
 namespace ROL {
 namespace ZOO {
diff --git a/packages/rol/src/zoo/testproblems/ROL_TestProblem.hpp b/packages/rol/src/zoo/testproblems/ROL_TestProblem.hpp
index 670ee50899b3..05459c416a41 100644
--- a/packages/rol/src/zoo/testproblems/ROL_TestProblem.hpp
+++ b/packages/rol/src/zoo/testproblems/ROL_TestProblem.hpp
@@ -16,6 +16,8 @@
 #define ROL_TESTPROBLEMS_HPP
 
 #include "ROL_OptimizationProblem.hpp"
+#include "ROL_Bounds.hpp"
+#include "ROL_RandomVector.hpp"
 
 namespace ROL {
 
diff --git a/packages/rol/test/algorithm/CMakeLists.txt b/packages/rol/test/algorithm/CMakeLists.txt
index fc13f0a2b3ee..44fd0c2c38f0 100644
--- a/packages/rol/test/algorithm/CMakeLists.txt
+++ b/packages/rol/test/algorithm/CMakeLists.txt
@@ -4,7 +4,7 @@ ADD_SUBDIRECTORY(TypeE)
 ADD_SUBDIRECTORY(TypeG)
 ADD_SUBDIRECTORY(TypeP)
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   OptimizationSolverStatusTestInput
   SOURCES test_01.cpp
   ARGS PrintItAll
@@ -13,7 +13,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   Problem
   SOURCES test_02.cpp
   ARGS PrintItAll
@@ -22,7 +22,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_COPY_FILES_TO_BINARY_DIR(
+ROL_COPY_FILES_TO_BINARY_DIR(
   AlgorithmTestDataCopy
   SOURCE_FILES
     input.xml
diff --git a/packages/rol/test/algorithm/TypeB/CMakeLists.txt b/packages/rol/test/algorithm/TypeB/CMakeLists.txt
index 335d7d6650ac..11de7314e3d1 100644
--- a/packages/rol/test/algorithm/TypeB/CMakeLists.txt
+++ b/packages/rol/test/algorithm/TypeB/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   ProjectedGradient
   SOURCES test_01.cpp
   ARGS PrintItAll
@@ -8,7 +8,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   LinMore
   SOURCES test_02.cpp
   ARGS PrintItAll
@@ -17,7 +17,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   BoxConstrained_LinMore
   SOURCES test_03.cpp
   ARGS PrintItAll
@@ -26,7 +26,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   BoxConstrained_NewtonKrylov
   SOURCES test_04.cpp
   ARGS PrintItAll
@@ -35,7 +35,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   BoxConstrained_MoreauYosida
   SOURCES test_05.cpp
   ARGS PrintItAll
@@ -44,7 +44,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   MoreauYosida
   SOURCES test_06.cpp
   ARGS PrintItAll
@@ -53,7 +53,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   InteriorPoint
   SOURCES test_07.cpp
   ARGS PrintItAll
@@ -62,7 +62,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   BoxConstrained_PDAS
   SOURCES test_08.cpp
   ARGS PrintItAll
@@ -71,7 +71,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   PDAS
   SOURCES test_09.cpp
   ARGS PrintItAll
@@ -80,7 +80,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   BoxConstrained_KelleySachs
   SOURCES test_10.cpp
   ARGS PrintItAll
@@ -89,7 +89,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   SpectralProjectedGradient
   SOURCES test_11.cpp
   ARGS PrintItAll
@@ -98,7 +98,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   ProjectedQuasiNewton
   SOURCES test_12.cpp
   ARGS PrintItAll
@@ -107,7 +107,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   TrustRegionSPG
   SOURCES test_13.cpp
   ARGS PrintItAll
@@ -116,7 +116,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   BoundConstrained_LSecantB
   SOURCES test_14.cpp
   ARGS PrintItAll
@@ -125,7 +125,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   LSecantB
   SOURCES test_15.cpp
   ARGS PrintItAll
@@ -134,7 +134,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
 BoxConstrained_ColemanLi
   SOURCES test_16.cpp
   ARGS PrintItAll
@@ -143,7 +143,7 @@ BoxConstrained_ColemanLi
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_COPY_FILES_TO_BINARY_DIR(
+ROL_COPY_FILES_TO_BINARY_DIR(
   TypeBTestDataCopy
   SOURCE_FILES
     input.xml
diff --git a/packages/rol/test/algorithm/TypeB/test_01.cpp b/packages/rol/test/algorithm/TypeB/test_01.cpp
index 5d644edf6824..2ce4fdd0fde8 100644
--- a/packages/rol/test/algorithm/TypeB/test_01.cpp
+++ b/packages/rol/test/algorithm/TypeB/test_01.cpp
@@ -16,13 +16,13 @@
 #include "ROL_TypeB_GradientAlgorithm.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a
   // (dummy) command-line argument is provided.
diff --git a/packages/rol/test/algorithm/TypeB/test_02.cpp b/packages/rol/test/algorithm/TypeB/test_02.cpp
index b343c49b5796..14abb8bbcce6 100644
--- a/packages/rol/test/algorithm/TypeB/test_02.cpp
+++ b/packages/rol/test/algorithm/TypeB/test_02.cpp
@@ -17,13 +17,13 @@
 #include "ROL_TypeB_LinMoreAlgorithm.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a
   // (dummy) command-line argument is provided.
diff --git a/packages/rol/test/algorithm/TypeB/test_03.cpp b/packages/rol/test/algorithm/TypeB/test_03.cpp
index 21c08f9e3832..c78b927a57ea 100644
--- a/packages/rol/test/algorithm/TypeB/test_03.cpp
+++ b/packages/rol/test/algorithm/TypeB/test_03.cpp
@@ -17,7 +17,7 @@
 #include "ROL_OptimizationSolver.hpp"
 #include "ROL_TypeB_LinMoreAlgorithm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 #include <unordered_map>
@@ -28,7 +28,7 @@ typedef double RealT;
 int main(int argc, char *argv[]) {
   //feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/algorithm/TypeB/test_04.cpp b/packages/rol/test/algorithm/TypeB/test_04.cpp
index f7feb7531315..f8e46b501a58 100644
--- a/packages/rol/test/algorithm/TypeB/test_04.cpp
+++ b/packages/rol/test/algorithm/TypeB/test_04.cpp
@@ -17,7 +17,7 @@
 #include "ROL_OptimizationSolver.hpp"
 #include "ROL_TypeB_NewtonKrylovAlgorithm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 //#include <fenv.h>
@@ -27,7 +27,7 @@ typedef double RealT;
 int main(int argc, char *argv[]) {
   //feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/algorithm/TypeB/test_05.cpp b/packages/rol/test/algorithm/TypeB/test_05.cpp
index 587d771556f0..22bf7d1d06a1 100644
--- a/packages/rol/test/algorithm/TypeB/test_05.cpp
+++ b/packages/rol/test/algorithm/TypeB/test_05.cpp
@@ -17,7 +17,7 @@
 #include "ROL_OptimizationSolver.hpp"
 #include "ROL_TypeB_MoreauYosidaAlgorithm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 //#include <fenv.h>
@@ -27,7 +27,7 @@ typedef double RealT;
 int main(int argc, char *argv[]) {
   //feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/algorithm/TypeB/test_06.cpp b/packages/rol/test/algorithm/TypeB/test_06.cpp
index 33a548b51bbb..9a0e6abf1eb4 100644
--- a/packages/rol/test/algorithm/TypeB/test_06.cpp
+++ b/packages/rol/test/algorithm/TypeB/test_06.cpp
@@ -17,13 +17,13 @@
 #include "ROL_TypeB_MoreauYosidaAlgorithm.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a
   // (dummy) command-line argument is provided.
diff --git a/packages/rol/test/algorithm/TypeB/test_07.cpp b/packages/rol/test/algorithm/TypeB/test_07.cpp
index 6ee77103afb9..5efa7685e1b5 100644
--- a/packages/rol/test/algorithm/TypeB/test_07.cpp
+++ b/packages/rol/test/algorithm/TypeB/test_07.cpp
@@ -17,13 +17,13 @@
 #include "ROL_TypeB_InteriorPointAlgorithm.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a
   // (dummy) command-line argument is provided.
diff --git a/packages/rol/test/algorithm/TypeB/test_08.cpp b/packages/rol/test/algorithm/TypeB/test_08.cpp
index 250c99f5b5d1..a95071cf0895 100644
--- a/packages/rol/test/algorithm/TypeB/test_08.cpp
+++ b/packages/rol/test/algorithm/TypeB/test_08.cpp
@@ -17,7 +17,7 @@
 #include "ROL_OptimizationSolver.hpp"
 #include "ROL_TypeB_PrimalDualActiveSetAlgorithm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 //#include <fenv.h>
@@ -27,7 +27,7 @@ typedef double RealT;
 int main(int argc, char *argv[]) {
   //feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/algorithm/TypeB/test_09.cpp b/packages/rol/test/algorithm/TypeB/test_09.cpp
index 2a3cc4ab6444..27e49612a762 100644
--- a/packages/rol/test/algorithm/TypeB/test_09.cpp
+++ b/packages/rol/test/algorithm/TypeB/test_09.cpp
@@ -17,13 +17,13 @@
 #include "ROL_TypeB_PrimalDualActiveSetAlgorithm.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a
   // (dummy) command-line argument is provided.
diff --git a/packages/rol/test/algorithm/TypeB/test_10.cpp b/packages/rol/test/algorithm/TypeB/test_10.cpp
index c4eeeb668f43..aeaf609a746d 100644
--- a/packages/rol/test/algorithm/TypeB/test_10.cpp
+++ b/packages/rol/test/algorithm/TypeB/test_10.cpp
@@ -17,7 +17,7 @@
 //#include "ROL_OptimizationSolver.hpp"
 #include "ROL_TypeB_KelleySachsAlgorithm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 //#include <fenv.h>
@@ -27,7 +27,7 @@ typedef double RealT;
 int main(int argc, char *argv[]) {
   //feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/algorithm/TypeB/test_11.cpp b/packages/rol/test/algorithm/TypeB/test_11.cpp
index 16c501dc1cb3..292c6336dfee 100644
--- a/packages/rol/test/algorithm/TypeB/test_11.cpp
+++ b/packages/rol/test/algorithm/TypeB/test_11.cpp
@@ -17,13 +17,13 @@
 #include "ROL_Problem.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a
   // (dummy) command-line argument is provided.
diff --git a/packages/rol/test/algorithm/TypeB/test_12.cpp b/packages/rol/test/algorithm/TypeB/test_12.cpp
index 6f118566dc87..022de139862c 100644
--- a/packages/rol/test/algorithm/TypeB/test_12.cpp
+++ b/packages/rol/test/algorithm/TypeB/test_12.cpp
@@ -17,13 +17,13 @@
 #include "ROL_Problem.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a
   // (dummy) command-line argument is provided.
diff --git a/packages/rol/test/algorithm/TypeB/test_13.cpp b/packages/rol/test/algorithm/TypeB/test_13.cpp
index e0fc7c3b9f7e..dedacca05d54 100644
--- a/packages/rol/test/algorithm/TypeB/test_13.cpp
+++ b/packages/rol/test/algorithm/TypeB/test_13.cpp
@@ -17,13 +17,13 @@
 #include "ROL_Problem.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a
   // (dummy) command-line argument is provided.
diff --git a/packages/rol/test/algorithm/TypeB/test_14.cpp b/packages/rol/test/algorithm/TypeB/test_14.cpp
index 70b3b76bf02c..1d369f7124f9 100644
--- a/packages/rol/test/algorithm/TypeB/test_14.cpp
+++ b/packages/rol/test/algorithm/TypeB/test_14.cpp
@@ -17,7 +17,7 @@
 #include "ROL_OptimizationSolver.hpp"
 #include "ROL_TypeB_LSecantBAlgorithm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 #include <unordered_map>
@@ -28,7 +28,7 @@ typedef double RealT;
 int main(int argc, char *argv[]) {
   //feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/algorithm/TypeB/test_15.cpp b/packages/rol/test/algorithm/TypeB/test_15.cpp
index 445d759dd040..a77b8cda0ac5 100644
--- a/packages/rol/test/algorithm/TypeB/test_15.cpp
+++ b/packages/rol/test/algorithm/TypeB/test_15.cpp
@@ -17,13 +17,13 @@
 #include "ROL_TypeB_LSecantBAlgorithm.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a
   // (dummy) command-line argument is provided.
diff --git a/packages/rol/test/algorithm/TypeB/test_16.cpp b/packages/rol/test/algorithm/TypeB/test_16.cpp
index c14ff0cd67b5..8f38ba64d26b 100644
--- a/packages/rol/test/algorithm/TypeB/test_16.cpp
+++ b/packages/rol/test/algorithm/TypeB/test_16.cpp
@@ -17,7 +17,7 @@
 #include "ROL_OptimizationSolver.hpp"
 #include "ROL_TypeB_ColemanLiAlgorithm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 #include <unordered_map>
@@ -28,7 +28,7 @@ typedef double RealT;
 int main(int argc, char *argv[]) {
   //feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/algorithm/TypeE/CMakeLists.txt b/packages/rol/test/algorithm/TypeE/CMakeLists.txt
index 75423d0a8683..b49559984605 100644
--- a/packages/rol/test/algorithm/TypeE/CMakeLists.txt
+++ b/packages/rol/test/algorithm/TypeE/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   AugmentedLagrangian
   SOURCES test_01.cpp
   ARGS PrintItAll
@@ -8,7 +8,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   Fletcher
   SOURCES test_02.cpp
   ARGS PrintItAll
@@ -17,7 +17,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   CompositeStep
   SOURCES test_03.cpp
   ARGS PrintItAll
@@ -26,7 +26,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   StabilizedLCL
   SOURCES test_04.cpp
   ARGS PrintItAll
@@ -35,7 +35,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_COPY_FILES_TO_BINARY_DIR(
+ROL_COPY_FILES_TO_BINARY_DIR(
   TypeETestDataCopy
   SOURCE_FILES
     input.xml
diff --git a/packages/rol/test/algorithm/TypeE/test_01.cpp b/packages/rol/test/algorithm/TypeE/test_01.cpp
index 038edb05e7c1..811436c27b71 100644
--- a/packages/rol/test/algorithm/TypeE/test_01.cpp
+++ b/packages/rol/test/algorithm/TypeE/test_01.cpp
@@ -18,7 +18,7 @@
 #include "ROL_TypeE_AugmentedLagrangianAlgorithm.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 
 #include <iostream>
@@ -27,7 +27,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   std::string filename = "input.xml";
   auto parlist = ROL::getParametersFromXmlFile( filename );
diff --git a/packages/rol/test/algorithm/TypeE/test_02.cpp b/packages/rol/test/algorithm/TypeE/test_02.cpp
index aaa8980c3237..10a8f01bdeb2 100644
--- a/packages/rol/test/algorithm/TypeE/test_02.cpp
+++ b/packages/rol/test/algorithm/TypeE/test_02.cpp
@@ -18,7 +18,7 @@
 #include "ROL_TypeE_FletcherAlgorithm.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 
 #include <iostream>
@@ -27,7 +27,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   std::string filename = "input.xml";
   auto parlist = ROL::getParametersFromXmlFile( filename );
diff --git a/packages/rol/test/algorithm/TypeE/test_03.cpp b/packages/rol/test/algorithm/TypeE/test_03.cpp
index bf7d8e6cc6f5..02c123ea6a60 100644
--- a/packages/rol/test/algorithm/TypeE/test_03.cpp
+++ b/packages/rol/test/algorithm/TypeE/test_03.cpp
@@ -18,7 +18,7 @@
 #include "ROL_TypeE_CompositeStepAlgorithm.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 
 #include <iostream>
@@ -27,7 +27,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   std::string filename = "input.xml";
   auto parlist = ROL::getParametersFromXmlFile( filename );
diff --git a/packages/rol/test/algorithm/TypeE/test_04.cpp b/packages/rol/test/algorithm/TypeE/test_04.cpp
index 1fddf21f7818..3a3dd1f3a240 100644
--- a/packages/rol/test/algorithm/TypeE/test_04.cpp
+++ b/packages/rol/test/algorithm/TypeE/test_04.cpp
@@ -18,7 +18,7 @@
 #include "ROL_TypeE_StabilizedLCLAlgorithm.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 
 #include <iostream>
@@ -27,7 +27,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   std::string filename = "input.xml";
   auto parlist = ROL::getParametersFromXmlFile( filename );
diff --git a/packages/rol/test/algorithm/TypeG/CMakeLists.txt b/packages/rol/test/algorithm/TypeG/CMakeLists.txt
index facc5450ba75..f23b0e78603b 100644
--- a/packages/rol/test/algorithm/TypeG/CMakeLists.txt
+++ b/packages/rol/test/algorithm/TypeG/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   AugmentedLagrangian
   SOURCES test_01.cpp
   ARGS PrintItAll
@@ -8,7 +8,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   MoreauYosida
   SOURCES test_02.cpp
   ARGS PrintItAll
@@ -17,7 +17,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   InteriorPoint
   SOURCES test_03.cpp
   ARGS PrintItAll
@@ -26,7 +26,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   StabilizedLCL
   SOURCES test_04.cpp
   ARGS PrintItAll
@@ -35,7 +35,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_COPY_FILES_TO_BINARY_DIR(
+ROL_COPY_FILES_TO_BINARY_DIR(
   TypeGTestDataCopy
   SOURCE_FILES
     input.xml
diff --git a/packages/rol/test/algorithm/TypeG/test_01.cpp b/packages/rol/test/algorithm/TypeG/test_01.cpp
index 8b276039e876..66ebda5568e5 100644
--- a/packages/rol/test/algorithm/TypeG/test_01.cpp
+++ b/packages/rol/test/algorithm/TypeG/test_01.cpp
@@ -17,13 +17,13 @@
 #include "ROL_TypeG_AugmentedLagrangianAlgorithm.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a
   // (dummy) command-line argument is provided.
diff --git a/packages/rol/test/algorithm/TypeG/test_02.cpp b/packages/rol/test/algorithm/TypeG/test_02.cpp
index 82696192d86f..a8f3cdd82c7c 100644
--- a/packages/rol/test/algorithm/TypeG/test_02.cpp
+++ b/packages/rol/test/algorithm/TypeG/test_02.cpp
@@ -17,13 +17,13 @@
 #include "ROL_TypeG_MoreauYosidaAlgorithm.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a
   // (dummy) command-line argument is provided.
diff --git a/packages/rol/test/algorithm/TypeG/test_03.cpp b/packages/rol/test/algorithm/TypeG/test_03.cpp
index b619e2cb89b8..e743405556fa 100644
--- a/packages/rol/test/algorithm/TypeG/test_03.cpp
+++ b/packages/rol/test/algorithm/TypeG/test_03.cpp
@@ -17,13 +17,13 @@
 #include "ROL_TypeG_InteriorPointAlgorithm.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a
   // (dummy) command-line argument is provided.
diff --git a/packages/rol/test/algorithm/TypeG/test_04.cpp b/packages/rol/test/algorithm/TypeG/test_04.cpp
index 85a36cba74f0..ba93f0aa8752 100644
--- a/packages/rol/test/algorithm/TypeG/test_04.cpp
+++ b/packages/rol/test/algorithm/TypeG/test_04.cpp
@@ -17,13 +17,13 @@
 #include "ROL_TypeG_StabilizedLCLAlgorithm.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a
   // (dummy) command-line argument is provided.
diff --git a/packages/rol/test/algorithm/TypeP/CMakeLists.txt b/packages/rol/test/algorithm/TypeP/CMakeLists.txt
index 8a8572722d0b..97a74d59148e 100644
--- a/packages/rol/test/algorithm/TypeP/CMakeLists.txt
+++ b/packages/rol/test/algorithm/TypeP/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   ProximalGradient
   SOURCES test_01.cpp
   ARGS PrintItAll
@@ -8,7 +8,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   SpectralGradient
   SOURCES test_02.cpp
   ARGS PrintItAll
@@ -17,7 +17,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   iPiano
   SOURCES test_03.cpp
   ARGS PrintItAll
@@ -26,7 +26,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   TrustRegion
   SOURCES test_04.cpp
   ARGS PrintItAll
@@ -35,7 +35,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   QuasiNewton
   SOURCES test_05.cpp
   ARGS PrintItAll
@@ -44,7 +44,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   InexactNewton
   SOURCES test_06.cpp
   ARGS PrintItAll
@@ -53,7 +53,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   CompareTypeU
   SOURCES test_07.cpp
   ARGS PrintItAll
@@ -62,7 +62,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   TestTypePSolver 
   SOURCES test_08.cpp
   ARGS PrintItAll
@@ -71,7 +71,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
  )
  
-TRIBITS_COPY_FILES_TO_BINARY_DIR(
+ROL_COPY_FILES_TO_BINARY_DIR(
   TypePTestDataCopy
   SOURCE_FILES
     input.xml
diff --git a/packages/rol/test/algorithm/TypeP/test_01.cpp b/packages/rol/test/algorithm/TypeP/test_01.cpp
index 3c2c73b29953..1e07b1220615 100644
--- a/packages/rol/test/algorithm/TypeP/test_01.cpp
+++ b/packages/rol/test/algorithm/TypeP/test_01.cpp
@@ -15,7 +15,7 @@
 #include "ROL_StdObjective.hpp"
 #include "ROL_l1Objective.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include <random>
 
 template<typename Real>
@@ -65,7 +65,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a
   // (dummy) command-line argument is provided.
diff --git a/packages/rol/test/algorithm/TypeP/test_02.cpp b/packages/rol/test/algorithm/TypeP/test_02.cpp
index c72ddf9301a1..bd7f3102485c 100644
--- a/packages/rol/test/algorithm/TypeP/test_02.cpp
+++ b/packages/rol/test/algorithm/TypeP/test_02.cpp
@@ -15,7 +15,7 @@
 #include "ROL_StdObjective.hpp"
 #include "ROL_l1Objective.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include <random>
 
 template<typename Real>
@@ -65,7 +65,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a
   // (dummy) command-line argument is provided.
diff --git a/packages/rol/test/algorithm/TypeP/test_03.cpp b/packages/rol/test/algorithm/TypeP/test_03.cpp
index 51cc3aca469a..75124eb11f6b 100644
--- a/packages/rol/test/algorithm/TypeP/test_03.cpp
+++ b/packages/rol/test/algorithm/TypeP/test_03.cpp
@@ -15,7 +15,7 @@
 #include "ROL_StdObjective.hpp"
 #include "ROL_l1Objective.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include <random>
 
 template<typename Real>
@@ -65,7 +65,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a
   // (dummy) command-line argument is provided.
diff --git a/packages/rol/test/algorithm/TypeP/test_04.cpp b/packages/rol/test/algorithm/TypeP/test_04.cpp
index cdf2335d18a0..521d0cee98e3 100644
--- a/packages/rol/test/algorithm/TypeP/test_04.cpp
+++ b/packages/rol/test/algorithm/TypeP/test_04.cpp
@@ -15,7 +15,7 @@
 #include "ROL_StdObjective.hpp"
 #include "ROL_l1Objective.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include <random>
 #include <chrono>
 
@@ -66,7 +66,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a
   // (dummy) command-line argument is provided.
diff --git a/packages/rol/test/algorithm/TypeP/test_05.cpp b/packages/rol/test/algorithm/TypeP/test_05.cpp
index 3fbabf89c266..4b73cc78baf2 100644
--- a/packages/rol/test/algorithm/TypeP/test_05.cpp
+++ b/packages/rol/test/algorithm/TypeP/test_05.cpp
@@ -15,7 +15,7 @@
 #include "ROL_StdObjective.hpp"
 #include "ROL_l1Objective.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include <random>
 
 template<typename Real>
@@ -65,7 +65,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a
   // (dummy) command-line argument is provided.
diff --git a/packages/rol/test/algorithm/TypeP/test_06.cpp b/packages/rol/test/algorithm/TypeP/test_06.cpp
index d47f1c70b56a..2358b9af10c2 100644
--- a/packages/rol/test/algorithm/TypeP/test_06.cpp
+++ b/packages/rol/test/algorithm/TypeP/test_06.cpp
@@ -15,7 +15,7 @@
 #include "ROL_StdObjective.hpp"
 #include "ROL_l1Objective.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include <random>
 
 template<typename Real>
@@ -65,7 +65,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a
   // (dummy) command-line argument is provided.
diff --git a/packages/rol/test/algorithm/TypeP/test_07.cpp b/packages/rol/test/algorithm/TypeP/test_07.cpp
index e29bf5449986..ae22bf8af0c9 100644
--- a/packages/rol/test/algorithm/TypeP/test_07.cpp
+++ b/packages/rol/test/algorithm/TypeP/test_07.cpp
@@ -17,7 +17,7 @@
 #include "ROL_StdObjective.hpp"
 #include "ROL_l1Objective.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include <random>
 
 template<typename Real>
@@ -67,7 +67,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a
   // (dummy) command-line argument is provided.
diff --git a/packages/rol/test/algorithm/TypeP/test_08.cpp b/packages/rol/test/algorithm/TypeP/test_08.cpp
index 9b459e7f8f08..b5700997c7f3 100644
--- a/packages/rol/test/algorithm/TypeP/test_08.cpp
+++ b/packages/rol/test/algorithm/TypeP/test_08.cpp
@@ -51,7 +51,7 @@
 #include "ROL_Solver.hpp"
 #include "ROL_Problem.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include <random>
 #include <chrono>
 
@@ -102,7 +102,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a
   // (dummy) command-line argument is provided.
diff --git a/packages/rol/test/algorithm/TypeU/CMakeLists.txt b/packages/rol/test/algorithm/TypeU/CMakeLists.txt
index 049aaa33e928..f18451aaa2a4 100644
--- a/packages/rol/test/algorithm/TypeU/CMakeLists.txt
+++ b/packages/rol/test/algorithm/TypeU/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   LineSearch
   SOURCES test_01.cpp
   ARGS PrintItAll
@@ -8,7 +8,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   TrustRegion
   SOURCES test_02.cpp
   ARGS PrintItAll
@@ -17,7 +17,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   Bundle
   SOURCES test_03.cpp
   ARGS PrintItAll
@@ -26,7 +26,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   ReduceLinearConstraints
   SOURCES test_04.cpp
   ARGS PrintItAll
@@ -35,7 +35,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_COPY_FILES_TO_BINARY_DIR(
+ROL_COPY_FILES_TO_BINARY_DIR(
   TypeUTestDataCopy
   SOURCE_FILES
     input.xml
diff --git a/packages/rol/test/algorithm/TypeU/test_01.cpp b/packages/rol/test/algorithm/TypeU/test_01.cpp
index b48b8e15a2ef..cbf5b6be4c7c 100644
--- a/packages/rol/test/algorithm/TypeU/test_01.cpp
+++ b/packages/rol/test/algorithm/TypeU/test_01.cpp
@@ -16,7 +16,7 @@
 #include "ROL_GetTestProblems.hpp"
 #include "ROL_TypeU_LineSearchAlgorithm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 
@@ -24,7 +24,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/algorithm/TypeU/test_02.cpp b/packages/rol/test/algorithm/TypeU/test_02.cpp
index 971b43699801..c082acc37cf2 100644
--- a/packages/rol/test/algorithm/TypeU/test_02.cpp
+++ b/packages/rol/test/algorithm/TypeU/test_02.cpp
@@ -16,7 +16,7 @@
 #include "ROL_GetTestProblems.hpp"
 #include "ROL_TypeU_TrustRegionAlgorithm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 #include <unordered_map>
@@ -27,7 +27,7 @@ typedef double RealT;
 int main(int argc, char *argv[]) {
   //feenableexcept(FE_INVALID | FE_OVERFLOW);
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/algorithm/TypeU/test_03.cpp b/packages/rol/test/algorithm/TypeU/test_03.cpp
index 9ec25bd0fe76..7a4a66b6b075 100644
--- a/packages/rol/test/algorithm/TypeU/test_03.cpp
+++ b/packages/rol/test/algorithm/TypeU/test_03.cpp
@@ -16,7 +16,7 @@
 #include "ROL_GetTestProblems.hpp"
 #include "ROL_TypeU_BundleAlgorithm.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 
@@ -24,7 +24,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/algorithm/TypeU/test_04.cpp b/packages/rol/test/algorithm/TypeU/test_04.cpp
index d787339f7b62..b37e0409901f 100644
--- a/packages/rol/test/algorithm/TypeU/test_04.cpp
+++ b/packages/rol/test/algorithm/TypeU/test_04.cpp
@@ -22,13 +22,13 @@
 #include "ROL_HS52.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a
   // (dummy) command-line argument is provided.
diff --git a/packages/rol/test/algorithm/test_01.cpp b/packages/rol/test/algorithm/test_01.cpp
index 01ece657d3f1..a88cb82efe00 100644
--- a/packages/rol/test/algorithm/test_01.cpp
+++ b/packages/rol/test/algorithm/test_01.cpp
@@ -14,7 +14,7 @@
 #include "ROL_GetTestProblems.hpp"
 #include "ROL_OptimizationSolver.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 
 #include <iostream>
@@ -42,7 +42,7 @@ class myStatusTest : public ROL::StatusTest<Real> {
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/algorithm/test_02.cpp b/packages/rol/test/algorithm/test_02.cpp
index 305f2a03f1e6..b9968c91c31f 100644
--- a/packages/rol/test/algorithm/test_02.cpp
+++ b/packages/rol/test/algorithm/test_02.cpp
@@ -26,13 +26,13 @@
 #include "ROL_HS63.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a
   // (dummy) command-line argument is provided.
diff --git a/packages/rol/test/compatibility/CMakeLists.txt b/packages/rol/test/compatibility/CMakeLists.txt
index f76fe0fea4c9..21af14d21eb3 100644
--- a/packages/rol/test/compatibility/CMakeLists.txt
+++ b/packages/rol/test/compatibility/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   stacktrace 
   SOURCES test_01.cpp
   ARGS PrintItAll
@@ -8,3 +8,5 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
+add_subdirectory(eigen)
+
diff --git a/packages/rol/test/compatibility/eigen/CMakeLists.txt b/packages/rol/test/compatibility/eigen/CMakeLists.txt
new file mode 100644
index 000000000000..d40d63a5f352
--- /dev/null
+++ b/packages/rol/test/compatibility/eigen/CMakeLists.txt
@@ -0,0 +1,8 @@
+# Eigen compatibility tests
+
+ROL_ADD_EXECUTABLE_AND_TEST(
+  eigen_LapackCompatibility
+  SOURCES test_01.cpp
+  PASS_REGULAR_EXPRESSION "TEST PASSED"
+  ADD_DIR_TO_NAME
+)
\ No newline at end of file
diff --git a/packages/rol/test/compatibility/eigen/test_01.cpp b/packages/rol/test/compatibility/eigen/test_01.cpp
new file mode 100644
index 000000000000..1c6001b678d0
--- /dev/null
+++ b/packages/rol/test/compatibility/eigen/test_01.cpp
@@ -0,0 +1,176 @@
+// @HEADER
+// *****************************************************************************
+//               Rapid Optimization Library (ROL) Package
+//
+// Copyright 2014 NTESS and the ROL contributors.
+// SPDX-License-Identifier: BSD-3-Clause
+// *****************************************************************************
+// @HEADER
+
+/*! \file  test_01.cpp
+    \brief Test ROL::LA::Matrix and ROL::LA::Vector compatibility with LAPACK
+*/
+
+#include <iostream>
+#include <vector>
+#include <cmath>
+
+#include "ROL_Stream.hpp"
+#include "ROL_LinearAlgebra.hpp"
+#include "ROL_LAPACK.hpp"
+
+int main(int argc, char* argv[]) {
+    
+    // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
+    auto outStream = ROL::makeStreamPtr(std::cout, argc > 1);
+    
+    int errorFlag = 0;
+    
+    try {
+        
+        *outStream << "Testing ROL::LA::Matrix LAPACK compatibility..." << std::endl;
+        
+        // Test 1: Basic matrix properties
+        *outStream << "\n=== Test 1: Basic Matrix Properties ===" << std::endl;
+        
+        ROL::LA::Matrix<double> A(3, 3);
+        A(0,0) = 1.0; A(0,1) = 2.0; A(0,2) = 3.0;
+        A(1,0) = 4.0; A(1,1) = 5.0; A(1,2) = 6.0;
+        A(2,0) = 7.0; A(2,1) = 8.0; A(2,2) = 10.0; // Make it slightly non-singular
+        
+        *outStream << "Matrix dimensions: " << A.numRows() << "x" << A.numCols() << std::endl;
+        
+        // Test 2: Memory layout and data access
+        *outStream << "\n=== Test 2: Memory Layout ===" << std::endl;
+        
+        double* data = A.values();
+        *outStream << "Raw data pointer validity: " << (data != nullptr ? "OK" : "FAIL") << std::endl;
+        
+        if (data == nullptr) {
+            errorFlag++;
+        }
+        
+        // Check if we can access matrix elements correctly
+        *outStream << "Element access test:" << std::endl;
+        for(int i = 0; i < 3; i++) {
+            for(int j = 0; j < 3; j++) {
+                *outStream << "A(" << i << "," << j << ") = " << A(i,j) << std::endl;
+            }
+        }
+        
+        // Test 3: LAPACK GEEV eigenvalue computation
+        *outStream << "\n=== Test 3: LAPACK GEEV Call ===" << std::endl;
+        
+        ROL::LAPACK<int, double> lapack;
+        ROL::LA::Matrix<double> mymat(A);  // Copy for GEEV (it will be destroyed)
+        
+        char jobvl = 'N';
+        char jobvr = 'N';
+        int n = 3;
+        
+        std::vector<double> real(n, 0);
+        std::vector<double> imag(n, 0);
+        
+        double* vl = nullptr;
+        double* vr = nullptr;
+        int ldvl = 1;
+        int ldvr = 1;
+        int lwork = 4*n;
+        std::vector<double> work(lwork, 0);
+        int info = 0;
+        
+        *outStream << "Matrix leading dimension (stride): " << mymat.stride() << std::endl;
+        *outStream << "Expected leading dimension: " << n << std::endl;
+        
+        if (mymat.stride() != n) {
+            *outStream << "WARNING: Stride mismatch! This could cause LAPACK issues." << std::endl;
+        }
+        
+        // Call GEEV
+        lapack.GEEV(jobvl, jobvr, n, &mymat(0,0), n, &real[0], &imag[0], vl, ldvl, vr, ldvr, &work[0], lwork, &info);
+        
+        *outStream << "GEEV return code (info): " << info << std::endl;
+        if (info != 0) {
+            *outStream << "ERROR: GEEV failed with info = " << info << std::endl;
+            errorFlag++;
+        }
+        
+        *outStream << "Eigenvalues:" << std::endl;
+        for(int i = 0; i < n; i++) {
+            if (std::abs(imag[i]) < 1e-12) {
+                *outStream << "  λ[" << i << "] = " << real[i] << std::endl;
+            } else {
+                *outStream << "  λ[" << i << "] = " << real[i] << " + " << imag[i] << "i" << std::endl;
+            }
+        }
+        
+        // Test 4: Comparison with known values
+        *outStream << "\n=== Test 4: Eigenvalue Validation ===" << std::endl;
+        
+        // For this specific 3x3 matrix, we expect real eigenvalues
+        // Let's just check that they're reasonable (trace = sum of eigenvalues)
+        double trace = A(0,0) + A(1,1) + A(2,2);  // Should be 1 + 5 + 10 = 16
+        double eigenvalue_sum = real[0] + real[1] + real[2];
+        
+        *outStream << "Matrix trace: " << trace << std::endl;
+        *outStream << "Sum of eigenvalues: " << eigenvalue_sum << std::endl;
+        *outStream << "Difference: " << std::abs(trace - eigenvalue_sum) << std::endl;
+        
+        if (std::abs(trace - eigenvalue_sum) > 1e-10) {
+            *outStream << "ERROR: Trace and eigenvalue sum differ significantly!" << std::endl;
+            errorFlag++;
+        }
+        
+        // Test 5: Matrix stride/memory layout verification
+        *outStream << "\n=== Test 5: Memory Layout Verification ===" << std::endl;
+        
+        ROL::LA::Matrix<double> B(2, 3);
+        B(0,0) = 1; B(0,1) = 2; B(0,2) = 3;
+        B(1,0) = 4; B(1,1) = 5; B(1,2) = 6;
+        
+        *outStream << "2x3 Matrix B:" << std::endl;
+        for(int i = 0; i < 2; i++) {
+            for(int j = 0; j < 3; j++) {
+                *outStream << B(i,j) << " ";
+            }
+            *outStream << std::endl;
+        }
+        
+        double* bdata = B.values();
+        *outStream << "Raw memory layout: ";
+        for(int k = 0; k < 6; k++) {
+            *outStream << bdata[k] << " ";
+        }
+        *outStream << std::endl;
+        
+        *outStream << "Stride: " << B.stride() << std::endl;
+        
+        // For column-major storage, we expect: 1, 4, 2, 5, 3, 6
+        // For row-major storage, we expect: 1, 2, 3, 4, 5, 6
+        bool is_column_major = (bdata[0] == 1 && bdata[1] == 4 && bdata[2] == 2);
+        bool is_row_major = (bdata[0] == 1 && bdata[1] == 2 && bdata[2] == 3);
+        
+        *outStream << "Matrix appears to be: ";
+        if (is_column_major) {
+            *outStream << "Column-major" << std::endl;
+        } else if (is_row_major) {
+            *outStream << "Row-major" << std::endl;
+        } else {
+            *outStream << "Unknown layout!" << std::endl;
+            errorFlag++;
+        }
+        
+    }
+    catch (std::exception& e) {
+        *outStream << "Exception caught: " << e.what() << std::endl;
+        errorFlag++;
+    }
+    
+    if (errorFlag == 0) {
+      std::cout << "\nEnd Result: TEST PASSED" << std::endl;
+    } else {
+      std::cout << "\nEnd Result: TEST FAILED (errors: " << errorFlag << ")" << std::endl;
+    }
+    
+    return 0;
+}
diff --git a/packages/rol/test/compatibility/test_01.cpp b/packages/rol/test/compatibility/test_01.cpp
index 4f97e9cb3905..952546b693b3 100644
--- a/packages/rol/test/compatibility/test_01.cpp
+++ b/packages/rol/test/compatibility/test_01.cpp
@@ -12,8 +12,9 @@
 */
 
 #include <exception>
+#include <iostream>
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "ROL_stacktrace.hpp"
 #include "ROL_Stream.hpp"
@@ -27,7 +28,7 @@ int main( int argc, char* argv[] ) {
     ROL_TEST_FOR_EXCEPTION( 0<1, std::logic_error, "End Result: TEST PASSED\n")
   }
   catch( std::exception& e ) {
-    *outStream << e.what() << std::endl;   
+    *outStream << e.what() << std::endl;
   }
 
   return 0;
diff --git a/packages/rol/test/elementwise/CMakeLists.txt b/packages/rol/test/elementwise/CMakeLists.txt
index 26624b1bdeda..be9ab6def067 100644
--- a/packages/rol/test/elementwise/CMakeLists.txt
+++ b/packages/rol/test/elementwise/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   StdVector
   SOURCES test_01.cpp
   ARGS PrintItAll
@@ -10,7 +10,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
 
 IF(${PROJECT_NAME}_ENABLE_Tpetra)
 TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/vector)
-  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  ROL_ADD_EXECUTABLE_AND_TEST(
     TpetraMultiVector
     SOURCES test_02.cpp
     ARGS PrintItAll
@@ -20,7 +20,7 @@ TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/tpetra/src/ve
     )
 ENDIF()
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   BoundConstraint
   SOURCES test_03.cpp
   ARGS PrintItAll
diff --git a/packages/rol/test/elementwise/test_01.cpp b/packages/rol/test/elementwise/test_01.cpp
index 30504c86efc4..4f0799f740ab 100644
--- a/packages/rol/test/elementwise/test_01.cpp
+++ b/packages/rol/test/elementwise/test_01.cpp
@@ -13,7 +13,7 @@
 
 #include "ROL_StdVector.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 
 // Unary function
@@ -57,12 +57,10 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  using namespace Teuchos;
- 
   typedef std::vector<RealT>     vec;
   typedef ROL::StdVector<RealT>  V;
 
-  GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/elementwise/test_02.cpp b/packages/rol/test/elementwise/test_02.cpp
index 8f9a65cdb7e7..85030a801833 100644
--- a/packages/rol/test/elementwise/test_02.cpp
+++ b/packages/rol/test/elementwise/test_02.cpp
@@ -14,7 +14,7 @@
 #include "ROL_TpetraMultiVector.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "Tpetra_Core.hpp"
 
@@ -64,7 +64,7 @@ class Square : public ROL::Elementwise::UnaryFunction<Real> {
 
 int main(int argc, char *argv[]) {
   
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv,0);
+  ROL::GlobalMPISession mpiSession(&argc, &argv,0);
   ROL::Ptr<const Teuchos::Comm<int> > comm = ROL::toPtr(Tpetra::getDefaultComm());
 
   int iprint = argc - 1;
diff --git a/packages/rol/test/elementwise/test_03.cpp b/packages/rol/test/elementwise/test_03.cpp
index eb55115b0557..2dcd7ed355f2 100644
--- a/packages/rol/test/elementwise/test_03.cpp
+++ b/packages/rol/test/elementwise/test_03.cpp
@@ -17,7 +17,7 @@
 #include "ROL_StdBoundConstraint.hpp"
 #include "ROL_StdVector.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 #include <iomanip>
@@ -26,15 +26,13 @@
 
 int main(int argc, char *argv[]) {
 
-  using namespace Teuchos;
-
   typedef double RealT;
 
   typedef std::vector<RealT>      vec;
   typedef ROL::Vector<RealT>      V;
   typedef ROL::StdVector<RealT>   SV;
 
-  GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
  
   int iprint = argc - 1;
   ROL::Ptr<std::ostream>  outStream;
diff --git a/packages/rol/test/elementwise/test_04.cpp b/packages/rol/test/elementwise/test_04.cpp
index 079856f24ee6..42924ad46480 100644
--- a/packages/rol/test/elementwise/test_04.cpp
+++ b/packages/rol/test/elementwise/test_04.cpp
@@ -16,7 +16,7 @@
 #include "ROL_BarrierFunctions.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "ROL_ParameterList.hpp"
 
 int main( int argc, char *argv[] ) {
diff --git a/packages/rol/test/function/CMakeLists.txt b/packages/rol/test/function/CMakeLists.txt
index c1a7ad12c762..ccbcba07ed62 100644
--- a/packages/rol/test/function/CMakeLists.txt
+++ b/packages/rol/test/function/CMakeLists.txt
@@ -1,8 +1,8 @@
-ADD_SUBDIRECTORY(operator)
+ADD_SUBDIRECTORY(operator) 
 ADD_SUBDIRECTORY(constraint)
 ADD_SUBDIRECTORY(sketching)
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   DerivativeCheck
   SOURCES test_01.cpp
   ARGS PrintItAll
@@ -11,7 +11,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   BarrierFunctionCheck
   SOURCES test_02.cpp
   ARGS PrintItAll
@@ -20,7 +20,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   BlockOperatorCheck
   SOURCES test_03.cpp
   ARGS PrintItAll
@@ -29,7 +29,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   SimOptSolveCheck
   SOURCES test_04.cpp
   ARGS PrintItAll
@@ -38,7 +38,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   NonlinearLeastSquaresCheck
   SOURCES test_05.cpp
   ARGS PrintItAll
@@ -47,7 +47,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   StdObjectiveCheck
   SOURCES test_06.cpp
   ARGS PrintItAll
@@ -56,7 +56,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   CompositeObjectiveCheck
   SOURCES test_07.cpp
   ARGS PrintItAll
@@ -65,7 +65,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   CompositeObjectiveSimOptCheck
   SOURCES test_08.cpp
   ARGS PrintItAll
@@ -74,7 +74,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   ConstraintPartionedCheck
   SOURCES test_09.cpp
   ARGS PrintItAll
@@ -83,7 +83,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   CompositeConstraintCheck
   SOURCES test_10.cpp
   ARGS PrintItAll
@@ -92,13 +92,13 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE(
+ROL_ADD_EXECUTABLE(
   ColemanLiModelCheck
   SOURCES test_11.cpp
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   HouseholderReflectorCheck
   SOURCES test_12.cpp
   ARGS PrintItAll
@@ -107,7 +107,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   BinaryConstraintCheck
   SOURCES test_13.cpp
   ARGS PrintItAll
@@ -115,7 +115,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   PolyhedralProjectionCheck
   SOURCES test_14.cpp
   ARGS PrintItAll
@@ -123,7 +123,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   NullSpaceProjectionCheck
   SOURCES test_15.cpp
   ARGS PrintItAll
@@ -131,7 +131,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   PolyhedralProjectionCheck2
   SOURCES test_16.cpp
   ARGS PrintItAll
@@ -139,7 +139,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   BoundConstraintScalingCheck
   SOURCES test_17.cpp
   ARGS PrintItAll
@@ -147,7 +147,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   ChainRuleObjectiveCheck
   SOURCES test_18.cpp
   ARGS PrintItAll
@@ -155,7 +155,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   ReducedConstraintSimOptCheck
   SOURCES test_19.cpp
   ARGS PrintItAll
@@ -163,7 +163,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_COPY_FILES_TO_BINARY_DIR(
+ROL_COPY_FILES_TO_BINARY_DIR(
   BinaryConstraintDataCopy
   SOURCE_FILES
     binary_constraint.xml
diff --git a/packages/rol/test/function/constraint/CMakeLists.txt b/packages/rol/test/function/constraint/CMakeLists.txt
index 10017f46cf68..0626ad9aad7f 100644
--- a/packages/rol/test/function/constraint/CMakeLists.txt
+++ b/packages/rol/test/function/constraint/CMakeLists.txt
@@ -1,4 +1,4 @@
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   ObjectiveConstraintConversion
   SOURCES test_01.cpp
   ARGS PrintItAll
@@ -8,7 +8,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   )
 
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   ScalarLinearConstraint
   SOURCES test_02.cpp
   ARGS PrintItAll
@@ -17,23 +17,23 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-# Moved to adapters/mpi/test/function
-#TRIBITS_ADD_EXECUTABLE_AND_TEST(
-#  Constraint_TimeSimOpt
-#  SOURCES test_03.cpp
-#  ARGS PrintItAll
-#  NUM_MPI_PROCS 1
-#  PASS_REGULAR_EXPRESSION "TEST PASSED"
-#  ADD_DIR_TO_NAME
-#  )
-
-# Moved to adapters/mpi/test/function
-#TRIBITS_ADD_EXECUTABLE_AND_TEST(
-#  Constraint_PinTSimOpt 
-#  SOURCES test_04.cpp
-#  ARGS PrintItAll
-#  COMM mpi
-#  NUM_MPI_PROCS 3
-#  PASS_REGULAR_EXPRESSION "TEST PASSED"
-#  ADD_DIR_TO_NAME
-#  )
+  # Moved to adapters/mpi/test/function
+  #ROL_ADD_EXECUTABLE_AND_TEST(
+  #  Constraint_TimeSimOpt
+  #  SOURCES test_03.cpp
+  #  ARGS PrintItAll
+  #  NUM_MPI_PROCS 1
+  #  PASS_REGULAR_EXPRESSION "TEST PASSED"
+  #  ADD_DIR_TO_NAME
+  #  )
+  #
+  # Moved to adapters/mpi/test/function
+  #ROL_ADD_EXECUTABLE_AND_TEST(
+  #  Constraint_PinTSimOpt 
+  #  SOURCES test_04.cpp
+  #  ARGS PrintItAll
+  #  COMM mpi
+  #  NUM_MPI_PROCS 3
+  #  PASS_REGULAR_EXPRESSION "TEST PASSED"
+  #  ADD_DIR_TO_NAME
+  #  )
diff --git a/packages/rol/test/function/constraint/test_01.cpp b/packages/rol/test/function/constraint/test_01.cpp
index 76ca13c28b5f..a23d01c692e8 100644
--- a/packages/rol/test/function/constraint/test_01.cpp
+++ b/packages/rol/test/function/constraint/test_01.cpp
@@ -22,7 +22,7 @@
 
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 
@@ -30,7 +30,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
    
 
diff --git a/packages/rol/test/function/constraint/test_02.cpp b/packages/rol/test/function/constraint/test_02.cpp
index 718d2dcf11d4..3248f14311bd 100644
--- a/packages/rol/test/function/constraint/test_02.cpp
+++ b/packages/rol/test/function/constraint/test_02.cpp
@@ -16,7 +16,7 @@
 #include "ROL_StdVector.hpp"
 #include "ROL_RandomVector.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 
@@ -24,7 +24,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   using V = ROL::Vector<RealT>;
 
diff --git a/packages/rol/test/function/constraint/test_03.cpp b/packages/rol/test/function/constraint/test_03.cpp
index c909b0f99ea9..160f0a3873f2 100644
--- a/packages/rol/test/function/constraint/test_03.cpp
+++ b/packages/rol/test/function/constraint/test_03.cpp
@@ -19,7 +19,7 @@
 #include "ROL_StdConstraint.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 using RealT   = double;
 using VectorT = std::vector<RealT>;
@@ -154,7 +154,7 @@ class OuterConstraint : public ROL::StdConstraint<RealT> {
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/function/operator/CMakeLists.txt b/packages/rol/test/function/operator/CMakeLists.txt
index 97614b56f88d..ef4ab29739b9 100644
--- a/packages/rol/test/function/operator/CMakeLists.txt
+++ b/packages/rol/test/function/operator/CMakeLists.txt
@@ -1,4 +1,4 @@
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   StdLinearOperator  
   SOURCES test_01.cpp
   ARGS PrintItAll
@@ -7,7 +7,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   StdTridiagonalOperator  
   SOURCES test_02.cpp
   ARGS PrintItAll
diff --git a/packages/rol/test/function/operator/test_01.cpp b/packages/rol/test/function/operator/test_01.cpp
index 506737d46593..312b4d1d13e2 100644
--- a/packages/rol/test/function/operator/test_01.cpp
+++ b/packages/rol/test/function/operator/test_01.cpp
@@ -26,7 +26,7 @@
 
 #include "ROL_StdLinearOperator.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 typedef double RealT;
 
@@ -41,7 +41,7 @@ int main(int argc, char *argv[]) {
   typedef ROL::StdLinearOperator<RealT> StdLinearOperator;
 
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/function/operator/test_02.cpp b/packages/rol/test/function/operator/test_02.cpp
index aef5d9722cb7..aafe0f1477ae 100644
--- a/packages/rol/test/function/operator/test_02.cpp
+++ b/packages/rol/test/function/operator/test_02.cpp
@@ -13,7 +13,7 @@
 
 #include "ROL_StdTridiagonalOperator.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 typedef double RealT;
 
@@ -28,7 +28,7 @@ int main(int argc, char *argv[]) {
   using vector = std::vector<RealT>;
   
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/function/sketching/CMakeLists.txt b/packages/rol/test/function/sketching/CMakeLists.txt
index 6d7ecc023bc3..dd3067f41500 100644
--- a/packages/rol/test/function/sketching/CMakeLists.txt
+++ b/packages/rol/test/function/sketching/CMakeLists.txt
@@ -1,4 +1,4 @@
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   Interface
   SOURCES test_01.cpp
   ARGS PrintItAll
diff --git a/packages/rol/test/function/sketching/test_01.cpp b/packages/rol/test/function/sketching/test_01.cpp
index 0cc9266cc2e6..ad026a6711aa 100644
--- a/packages/rol/test/function/sketching/test_01.cpp
+++ b/packages/rol/test/function/sketching/test_01.cpp
@@ -15,7 +15,7 @@
 #include "ROL_StdVector.hpp"
 #include "ROL_Stream.hpp"
 #include "ROL_Sketch.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 
@@ -23,7 +23,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/function/test_01.cpp b/packages/rol/test/function/test_01.cpp
index d61efd96b313..e7cb44a6b43a 100644
--- a/packages/rol/test/function/test_01.cpp
+++ b/packages/rol/test/function/test_01.cpp
@@ -17,7 +17,7 @@
 #include "ROL_GetTestProblems.hpp"
 #include "ROL_HelperFunctions.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 
@@ -25,7 +25,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
@@ -122,7 +122,7 @@ int main(int argc, char *argv[]) {
                << std::setw(20) << hsymCheck[2]
                << "\n";
 
-    Teuchos::SerialDenseMatrix<int, RealT> H(x.dimension(), x.dimension());
+    ROL::LA::Matrix<RealT> H(x.dimension(), x.dimension());
     H = ROL::computeDenseHessian(obj, x);
     //H.print(*outStream);
 
diff --git a/packages/rol/test/function/test_02.cpp b/packages/rol/test/function/test_02.cpp
index a89a1d896e94..9d9cde9ce96d 100644
--- a/packages/rol/test/function/test_02.cpp
+++ b/packages/rol/test/function/test_02.cpp
@@ -17,7 +17,7 @@
 #include "ROL_ObjectiveFromBoundConstraint.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "ROL_ParameterList.hpp"
 
 
@@ -31,7 +31,7 @@ int main(int argc, char *argv[]) {
 
   typedef typename vector::size_type luint;
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/function/test_03.cpp b/packages/rol/test/function/test_03.cpp
index 163a5066c580..5228bddd9619 100644
--- a/packages/rol/test/function/test_03.cpp
+++ b/packages/rol/test/function/test_03.cpp
@@ -25,7 +25,7 @@
 #include "ROL_Types.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 template<class Real>
 void print_vector( const ROL::Vector<Real> &x ) {
@@ -65,9 +65,7 @@ int main(int argc, char *argv[]) {
 
   typedef typename vector::size_type    luint;
 
-  using namespace Teuchos;
-
-  GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   int iprint = argc - 1;
 
diff --git a/packages/rol/test/function/test_04.cpp b/packages/rol/test/function/test_04.cpp
index 54cb5da70258..49012fac241e 100644
--- a/packages/rol/test/function/test_04.cpp
+++ b/packages/rol/test/function/test_04.cpp
@@ -13,10 +13,10 @@
 
 #include "ROL_Stream.hpp"
 
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_Comm.hpp"
-#include "Teuchos_DefaultComm.hpp"
-#include "Teuchos_CommHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
+//#include "Teuchos_Comm.hpp"
+//#include "Teuchos_DefaultComm.hpp"
+//#include "Teuchos_CommHelpers.hpp"
 
 #include <iostream>
 #include <fstream>
@@ -34,7 +34,7 @@ typedef H1VectorPrimal<RealT> DualConstraintVector;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint = argc - 1;
diff --git a/packages/rol/test/function/test_04.hpp b/packages/rol/test/function/test_04.hpp
index a06baa7cc987..630d6a633682 100644
--- a/packages/rol/test/function/test_04.hpp
+++ b/packages/rol/test/function/test_04.hpp
@@ -17,7 +17,7 @@
 #include "ROL_Constraint_SimOpt.hpp"
 #include "ROL_Objective_SimOpt.hpp"
 
-#include "Teuchos_LAPACK.hpp"
+#include "ROL_LAPACK.hpp"
 
 template<class Real>
 class L2VectorPrimal;
@@ -77,7 +77,7 @@ class BurgersFEM {
     else {
       u.assign(r.begin(),r.end());
       // Perform LDL factorization
-      Teuchos::LAPACK<int,Real> lp;
+      ROL::LAPACK<int,Real> lp;
       std::vector<Real> du2(r.size()-2,0.0);
       std::vector<int> ipiv(r.size(),0);
       int info;
diff --git a/packages/rol/test/function/test_05.cpp b/packages/rol/test/function/test_05.cpp
index 279ded02586c..58f4bd37b3fe 100644
--- a/packages/rol/test/function/test_05.cpp
+++ b/packages/rol/test/function/test_05.cpp
@@ -19,7 +19,7 @@
 #include "ROL_Algorithm.hpp"
 #include "ROL_TrustRegionStep.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 
@@ -28,7 +28,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/function/test_06.cpp b/packages/rol/test/function/test_06.cpp
index 021cb8e5e9bf..3fccdab33699 100644
--- a/packages/rol/test/function/test_06.cpp
+++ b/packages/rol/test/function/test_06.cpp
@@ -8,7 +8,7 @@
 // @HEADER
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "ROL_StdVector.hpp"
 #include "ROL_StdObjective.hpp"
@@ -57,7 +57,7 @@ void setRandomVector(std::vector<RealT> &x) {
 
 int main(int argc, char* argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/function/test_07.cpp b/packages/rol/test/function/test_07.cpp
index 099ceecc4104..fad1bb10bc07 100644
--- a/packages/rol/test/function/test_07.cpp
+++ b/packages/rol/test/function/test_07.cpp
@@ -8,7 +8,7 @@
 // @HEADER
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "ROL_StdVector.hpp"
 #include "ROL_StdObjective.hpp"
@@ -89,7 +89,7 @@ void setRandomVector(std::vector<RealT> &x) {
 
 int main(int argc, char* argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/function/test_08.cpp b/packages/rol/test/function/test_08.cpp
index 98fbec4aacdf..960b4d579e18 100644
--- a/packages/rol/test/function/test_08.cpp
+++ b/packages/rol/test/function/test_08.cpp
@@ -8,7 +8,7 @@
 // @HEADER
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "ROL_StdVector.hpp"
 #include "ROL_StdObjective.hpp"
@@ -170,7 +170,7 @@ void setRandomVector(std::vector<RealT> &x) {
 
 int main(int argc, char* argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/function/test_09.cpp b/packages/rol/test/function/test_09.cpp
index 95f50c2a7e6f..48ceeed88da3 100644
--- a/packages/rol/test/function/test_09.cpp
+++ b/packages/rol/test/function/test_09.cpp
@@ -21,7 +21,7 @@
 #include "ROL_ConstraintStatusTest.hpp"
 #include "ROL_CompositeStep.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 
@@ -38,7 +38,7 @@ int main(int argc, char *argv[]) {
 
   typedef typename vector::size_type       luint;
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/function/test_10.cpp b/packages/rol/test/function/test_10.cpp
index 52b66038cd9b..882729960213 100644
--- a/packages/rol/test/function/test_10.cpp
+++ b/packages/rol/test/function/test_10.cpp
@@ -14,7 +14,7 @@
 #include "ROL_CompositeConstraint_SimOpt.hpp"
 #include "ROL_StdVector.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 
@@ -321,7 +321,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/function/test_11.cpp b/packages/rol/test/function/test_11.cpp
index 1aae9801c090..6b41c1d212d8 100644
--- a/packages/rol/test/function/test_11.cpp
+++ b/packages/rol/test/function/test_11.cpp
@@ -17,7 +17,7 @@
 #include "ROL_StdVector.hpp"
 #include "ROL_RandomVector.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 //#include <fenv.h>
 
 typedef double RealT;
@@ -29,7 +29,7 @@ int main(int argc, char *argv[]) {
   typedef ROL::Objective<RealT>       OBJ;
   typedef ROL::BoundConstraint<RealT> CON; 
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/function/test_12.cpp b/packages/rol/test/function/test_12.cpp
index 24cc7155bc95..0d3b2e0035c3 100644
--- a/packages/rol/test/function/test_12.cpp
+++ b/packages/rol/test/function/test_12.cpp
@@ -17,7 +17,7 @@
 #include "ROL_StdVector.hpp"
 #include "ROL_RandomVector.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 template<class Real> 
 void printVector( const ROL::Vector<Real> &x, std::ostream &outStream ) {
@@ -42,7 +42,7 @@ int main(int argc, char *argv[]) {
 
    
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/function/test_13.cpp b/packages/rol/test/function/test_13.cpp
index bc5a9de0a005..5356a9c39f41 100644
--- a/packages/rol/test/function/test_13.cpp
+++ b/packages/rol/test/function/test_13.cpp
@@ -26,7 +26,7 @@
 #include "ROL_Bounds.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 
 
@@ -55,7 +55,7 @@ int main(int argc, char *argv[]) {
 
    
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a
   // (dummy) command-line argument is provided.
diff --git a/packages/rol/test/function/test_14.cpp b/packages/rol/test/function/test_14.cpp
index d163d088ca64..cd70bdbc5573 100644
--- a/packages/rol/test/function/test_14.cpp
+++ b/packages/rol/test/function/test_14.cpp
@@ -19,13 +19,13 @@
 #include "ROL_PolyhedralProjectionFactory.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a
   // (dummy) command-line argument is provided.
diff --git a/packages/rol/test/function/test_15.cpp b/packages/rol/test/function/test_15.cpp
index 869c0821b6a9..1692e93b6235 100644
--- a/packages/rol/test/function/test_15.cpp
+++ b/packages/rol/test/function/test_15.cpp
@@ -18,7 +18,7 @@
 #include "ROL_StdConstraint.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 template<typename Real>
 class con2d : public ROL::StdConstraint<Real> {
@@ -39,7 +39,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a
   // (dummy) command-line argument is provided.
diff --git a/packages/rol/test/function/test_16.cpp b/packages/rol/test/function/test_16.cpp
index 5a99766ab0f7..44655d3d3793 100644
--- a/packages/rol/test/function/test_16.cpp
+++ b/packages/rol/test/function/test_16.cpp
@@ -18,7 +18,7 @@
 #include "ROL_StdConstraint.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 template<typename Real>
 class con2d : public ROL::StdConstraint<Real> {
@@ -39,7 +39,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a
   // (dummy) command-line argument is provided.
diff --git a/packages/rol/test/function/test_17.cpp b/packages/rol/test/function/test_17.cpp
index 356f72072d58..51efe2197ffd 100644
--- a/packages/rol/test/function/test_17.cpp
+++ b/packages/rol/test/function/test_17.cpp
@@ -13,7 +13,7 @@
 */
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "ROL_Bounds.hpp"
 #include "ROL_Constraint.hpp"
@@ -266,7 +266,7 @@ int testCases(RealT tol, ROL::Ptr<std::ostream> outStream) {
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a
   // (dummy) command-line argument is provided.
diff --git a/packages/rol/test/function/test_18.cpp b/packages/rol/test/function/test_18.cpp
index 95e2aec05b49..9d9c13e61174 100644
--- a/packages/rol/test/function/test_18.cpp
+++ b/packages/rol/test/function/test_18.cpp
@@ -33,7 +33,7 @@
 #include "ROL_ChainRuleObjective.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "ROL_SimpleEqConstrained.hpp"
 #include "ROL_Zakharov.hpp"
@@ -46,7 +46,7 @@ int main(int argc, char *argv[]) {
   using ObjectiveT   = ROL::ZOO::Objective_Zakharov<RealT>;
   using ConstraintT  = ROL::ZOO::EqualityConstraint_SimpleEqConstrained<RealT>;
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/function/test_19.cpp b/packages/rol/test/function/test_19.cpp
index 3d1b5f0c68fa..3728382d374f 100644
--- a/packages/rol/test/function/test_19.cpp
+++ b/packages/rol/test/function/test_19.cpp
@@ -50,7 +50,7 @@
 #include "ROL_Constraint_SimOpt.hpp"
 #include "ROL_Reduced_Constraint_SimOpt.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include <cassert>
 
 template<typename Real>
@@ -386,7 +386,7 @@ class constraint2 : public ROL::Constraint_SimOpt<Real> {
 int main(int argc, char *argv[]) {
   using RealT = double;
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/parameterlist/CMakeLists.txt b/packages/rol/test/parameterlist/CMakeLists.txt
index 7981bd703463..a32c003adabe 100644
--- a/packages/rol/test/parameterlist/CMakeLists.txt
+++ b/packages/rol/test/parameterlist/CMakeLists.txt
@@ -1,7 +1,7 @@
 #GET_PROPERTY( PTR_STRING GLOBAL PROPERTY PTR_IMPL )
 IF( ROL_ParameterList STREQUAL "simple")
   message("Enabling tests of ROL's native ParameterList implementation")
-  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  ROL_ADD_EXECUTABLE_AND_TEST(
     ParameterList 
     SOURCES test_01.cpp
     ARGS PrintItAll
diff --git a/packages/rol/test/parameters/CMakeLists.txt b/packages/rol/test/parameters/CMakeLists.txt
index f18b336bd67b..b264a6636460 100644
--- a/packages/rol/test/parameters/CMakeLists.txt
+++ b/packages/rol/test/parameters/CMakeLists.txt
@@ -1,12 +1,12 @@
 
-TRIBITS_ADD_EXECUTABLE(
+ROL_ADD_EXECUTABLE(
   ParameterCheck
   SOURCES test_01.cpp
   ADD_DIR_TO_NAME
   )
 
 
-TRIBITS_COPY_FILES_TO_BINARY_DIR(
+ROL_COPY_FILES_TO_BINARY_DIR(
   ParameterDataCopy
   SOURCE_FILES
     parameters.xml
diff --git a/packages/rol/test/parameters/test_01.cpp b/packages/rol/test/parameters/test_01.cpp
index ed9145f5ac54..a535b978a451 100644
--- a/packages/rol/test/parameters/test_01.cpp
+++ b/packages/rol/test/parameters/test_01.cpp
@@ -8,15 +8,13 @@
 // @HEADER
 
 #include <iostream>
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "ROL_Types.hpp"
 #include "ROL_ParameterListConverters.hpp"
 
 int main(int argc, char *argv[]) {
 
-  using namespace Teuchos;
-
   std::string infile  = "parameters.xml";
   std::string outfile = "tiered_parameters.xml";
 
diff --git a/packages/rol/test/sol/CMakeLists.txt b/packages/rol/test/sol/CMakeLists.txt
index eb0f1c8c5933..12690fc0230e 100644
--- a/packages/rol/test/sol/CMakeLists.txt
+++ b/packages/rol/test/sol/CMakeLists.txt
@@ -1,4 +1,4 @@
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   solInterface
   SOURCES test_01.cpp
   ARGS PrintItAll
@@ -7,7 +7,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   solDistributions
   SOURCES test_02.cpp
   ARGS PrintItAll
@@ -17,8 +17,8 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   )
 
 # moved to adapters/teuchos/tests/sol
-#TRIBITS_ADD_EXECUTABLE_AND_TEST(
-#  solTeuchosBatchManager
+#ROL_ADD_EXECUTABLE_AND_TEST(
+#  solROLBatchManager
 #  SOURCES test_03.cpp
 #  ARGS PrintItAll
 #  NUM_MPI_PROCS 4
@@ -27,7 +27,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
 #  )
 
 # moved to adapters/teuchos/tests/sol
-#TRIBITS_ADD_EXECUTABLE_AND_TEST(
+#ROL_ADD_EXECUTABLE_AND_TEST(
 #  solSROMGenerator
 #  SOURCES test_04.cpp
 #  ARGS PrintItAll
@@ -37,7 +37,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
 #  )
 
 # moved to adapters/teuchos/tests/sol
-#TRIBITS_ADD_EXECUTABLE_AND_TEST(
+#ROL_ADD_EXECUTABLE_AND_TEST(
 #  checkParametrizedSimOptSolve
 #  SOURCES test_05.cpp
 #  ARGS PrintItAll
@@ -47,7 +47,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
 #  )
 
 # moved to adapters/teuchos/tests/sol
-#TRIBITS_ADD_EXECUTABLE_AND_TEST(
+#ROL_ADD_EXECUTABLE_AND_TEST(
 #  checkSROMVector
 #  SOURCES test_06.cpp
 #  ARGS PrintItAll
@@ -57,7 +57,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
 #  )
 
 # moved to adapters/teuchos/tests/sol
-#TRIBITS_ADD_EXECUTABLE_AND_TEST(
+#ROL_ADD_EXECUTABLE_AND_TEST(
 #  checkHMCR
 #  SOURCES test_07.cpp
 #  ARGS PrintItAll
@@ -66,7 +66,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
 #  ADD_DIR_TO_NAME
 #  )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   checkSuperQuantileQuadrangle
   SOURCES test_08.cpp
   ARGS PrintItAll
@@ -75,7 +75,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   checkSpectralRisk
   SOURCES test_09.cpp
   ARGS PrintItAll
@@ -84,7 +84,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   checkAlmostSureConstraint
   SOURCES test_10.cpp
   ARGS PrintItAll
@@ -93,7 +93,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   checkMeanValueConstraint
   SOURCES test_11.cpp
   ARGS PrintItAll
@@ -102,7 +102,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   checkRiskNeutralConstraint
   SOURCES test_12.cpp
   ARGS PrintItAll
@@ -111,7 +111,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   checkLinearRegression
   SOURCES test_13.cpp
   ARGS PrintItAll
@@ -120,7 +120,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   checkProgressiveHedging
   SOURCES test_14.cpp
   ARGS PrintItAll
@@ -129,7 +129,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   checkPrimalDualRisk
   SOURCES test_15.cpp
   ARGS PrintItAll
@@ -138,7 +138,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   checkStochasticProblem
   SOURCES test_16.cpp
   ARGS PrintItAll
@@ -147,7 +147,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_COPY_FILES_TO_BINARY_DIR(
+ROL_COPY_FILES_TO_BINARY_DIR(
   solDataCopy
   SOURCE_FILES
     input_01.xml
diff --git a/packages/rol/test/sol/test_01.cpp b/packages/rol/test/sol/test_01.cpp
index 527f6627bc47..7c684b1fa0f2 100644
--- a/packages/rol/test/sol/test_01.cpp
+++ b/packages/rol/test/sol/test_01.cpp
@@ -10,7 +10,7 @@
 #include "ROL_ParameterList.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "ROL_StdVector.hpp"
 #include "ROL_StdBoundConstraint.hpp"
@@ -111,7 +111,7 @@ void printSolution(const std::vector<RealT> &x,
 
 int main(int argc, char* argv[]) {
   //feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/sol/test_02.cpp b/packages/rol/test/sol/test_02.cpp
index 7150b7188f02..286d28190937 100644
--- a/packages/rol/test/sol/test_02.cpp
+++ b/packages/rol/test/sol/test_02.cpp
@@ -10,7 +10,7 @@
 #include "ROL_ParameterList.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 #include "ROL_Ptr.hpp"
 #include "ROL_DistributionFactory.hpp"
 
@@ -18,7 +18,7 @@ typedef double RealT;
 
 int main(int argc, char* argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/sol/test_08.cpp b/packages/rol/test/sol/test_08.cpp
index 995baa261970..15acfa1f7d8c 100644
--- a/packages/rol/test/sol/test_08.cpp
+++ b/packages/rol/test/sol/test_08.cpp
@@ -10,7 +10,7 @@
 #include "ROL_ParameterList.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "ROL_StdVector.hpp"
 #include "ROL_StdBoundConstraint.hpp"
@@ -109,7 +109,7 @@ void printSolution(const std::vector<RealT> &x,
 
 int main(int argc, char* argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/sol/test_09.cpp b/packages/rol/test/sol/test_09.cpp
index f20e028143ab..27b13bd38a91 100644
--- a/packages/rol/test/sol/test_09.cpp
+++ b/packages/rol/test/sol/test_09.cpp
@@ -10,7 +10,7 @@
 #include "ROL_ParameterList.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "ROL_StdVector.hpp"
 #include "ROL_StdBoundConstraint.hpp"
@@ -109,7 +109,7 @@ void printSolution(const std::vector<RealT> &x,
 
 int main(int argc, char* argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/sol/test_10.cpp b/packages/rol/test/sol/test_10.cpp
index bba872acde63..c3c4eac1fc5f 100644
--- a/packages/rol/test/sol/test_10.cpp
+++ b/packages/rol/test/sol/test_10.cpp
@@ -10,7 +10,7 @@
 #include "ROL_ParameterList.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "ROL_ScaledStdVector.hpp"
 #include "ROL_StdBoundConstraint.hpp"
@@ -162,7 +162,7 @@ void printSolution(const std::vector<Real> &x,
 
 int main(int argc, char* argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/sol/test_11.cpp b/packages/rol/test/sol/test_11.cpp
index 0c0eae23ee88..a854ba6e3c44 100644
--- a/packages/rol/test/sol/test_11.cpp
+++ b/packages/rol/test/sol/test_11.cpp
@@ -10,7 +10,7 @@
 #include "ROL_ParameterList.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "ROL_ScaledStdVector.hpp"
 #include "ROL_StdBoundConstraint.hpp"
@@ -162,7 +162,7 @@ void printSolution(const std::vector<Real> &x,
 
 int main(int argc, char* argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/sol/test_12.cpp b/packages/rol/test/sol/test_12.cpp
index ff946151abf0..8b99454bc35a 100644
--- a/packages/rol/test/sol/test_12.cpp
+++ b/packages/rol/test/sol/test_12.cpp
@@ -10,7 +10,7 @@
 #include "ROL_ParameterList.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "ROL_ScaledStdVector.hpp"
 #include "ROL_StdBoundConstraint.hpp"
@@ -162,7 +162,7 @@ void printSolution(const std::vector<Real> &x,
 
 int main(int argc, char* argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/sol/test_13.cpp b/packages/rol/test/sol/test_13.cpp
index 457888bfbf13..c969bb22363c 100644
--- a/packages/rol/test/sol/test_13.cpp
+++ b/packages/rol/test/sol/test_13.cpp
@@ -8,7 +8,7 @@
 // @HEADER
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "ROL_ParameterList.hpp"
 #include "ROL_LinearRegression.hpp"
diff --git a/packages/rol/test/sol/test_14.cpp b/packages/rol/test/sol/test_14.cpp
index 615617ff5de4..d7458572fcef 100644
--- a/packages/rol/test/sol/test_14.cpp
+++ b/packages/rol/test/sol/test_14.cpp
@@ -7,10 +7,9 @@
 // *****************************************************************************
 // @HEADER
 
-#include "Teuchos_ParameterList.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
-#include "Teuchos_oblackholestream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_ParameterList.hpp"
+#include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "ROL_StdVector.hpp"
 #include "ROL_StdBoundConstraint.hpp"
@@ -84,12 +83,12 @@ void printSolution(const std::vector<RealT> &x,
 
 int main(int argc, char* argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
   ROL::Ptr<std::ostream> outStream;
-  Teuchos::oblackholestream bhs; // outputs nothing
+  ROL::nullstream bhs; // outputs nothing
   if (iprint > 0)
     outStream = ROL::makePtrFromRef(std::cout);
   else
@@ -103,9 +102,9 @@ int main(int argc, char* argv[]) {
     /**********************************************************************************************/
     // Get ROL parameterlist
     std::string filename = "input_14.xml";
-//    Teuchos::RCP<Teuchos::ParameterList> parlist = Teuchos::rcp( new Teuchos::ParameterList() );
-//    Teuchos::updateParametersFromXmlFile( filename, parlist.ptr() );
-//    Teuchos::ParameterList list = *parlist;
+//    ROL::RCP<ROL::ParameterList> parlist = ROL::rcp( new ROL::ParameterList() );
+//    ROL::updateParametersFromXmlFile( filename, parlist.ptr() );
+//    ROL::ParameterList list = *parlist;
       ROL::Ptr<ROL::ParameterList> parlist = ROL::getParametersFromXmlFile( filename );
     /**********************************************************************************************/
     /************************* CONSTRUCT SOL COMPONENTS *******************************************/
diff --git a/packages/rol/test/sol/test_15.cpp b/packages/rol/test/sol/test_15.cpp
index bf12d33f58b9..58fec2b26c28 100644
--- a/packages/rol/test/sol/test_15.cpp
+++ b/packages/rol/test/sol/test_15.cpp
@@ -7,10 +7,9 @@
 // *****************************************************************************
 // @HEADER
 
-#include "Teuchos_ParameterList.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
-#include "Teuchos_oblackholestream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_ParameterList.hpp"
+#include "ROL_Stream.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "ROL_StdVector.hpp"
 #include "ROL_StdBoundConstraint.hpp"
@@ -84,12 +83,12 @@ void printSolution(const std::vector<RealT> &x,
 
 int main(int argc, char* argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
   ROL::Ptr<std::ostream> outStream;
-  Teuchos::oblackholestream bhs; // outputs nothing
+  ROL::nullstream bhs; // outputs nothing
   if (iprint > 0)
     outStream = ROL::makePtrFromRef(std::cout);
   else
@@ -103,9 +102,9 @@ int main(int argc, char* argv[]) {
     /**********************************************************************************************/
     // Get ROL parameterlist
     std::string filename = "input_15.xml";
-//    Teuchos::RCP<Teuchos::ParameterList> parlist = Teuchos::rcp( new Teuchos::ParameterList() );
-//    Teuchos::updateParametersFromXmlFile( filename, parlist.ptr() );
-//    Teuchos::ParameterList list = *parlist;
+//    ROL::RCP<ROL::ParameterList> parlist = ROL::rcp( new ROL::ParameterList() );
+//    ROL::updateParametersFromXmlFile( filename, parlist.ptr() );
+//    ROL::ParameterList list = *parlist;
       ROL::Ptr<ROL::ParameterList> parlist = ROL::getParametersFromXmlFile( filename );
     /**********************************************************************************************/
     /************************* CONSTRUCT SOL COMPONENTS *******************************************/
diff --git a/packages/rol/test/sol/test_16.cpp b/packages/rol/test/sol/test_16.cpp
index ae3bdb1777fb..02e141fe9689 100644
--- a/packages/rol/test/sol/test_16.cpp
+++ b/packages/rol/test/sol/test_16.cpp
@@ -7,7 +7,7 @@
 // *****************************************************************************
 // @HEADER
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "ROL_ParameterList.hpp"
 #include "ROL_Stream.hpp"
@@ -82,7 +82,7 @@ void printSolution(const std::vector<RealT> &x,
 
 int main(int argc, char* argv[]) {
   //feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/step/CMakeLists.txt b/packages/rol/test/step/CMakeLists.txt
index df0917bb0311..0c5b3b272c21 100644
--- a/packages/rol/test/step/CMakeLists.txt
+++ b/packages/rol/test/step/CMakeLists.txt
@@ -1,9 +1,9 @@
-ADD_SUBDIRECTORY(interiorpoint)
+# ADD_SUBDIRECTORY(interiorpoint) # uses TriBITS
 ADD_SUBDIRECTORY(fletcher)
 ADD_SUBDIRECTORY(trustregion)
 
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
    LineSearch
    SOURCES test_01.cpp
    ARGS PrintItAll
@@ -12,7 +12,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
    ADD_DIR_TO_NAME
    )
  
- TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ ROL_ADD_EXECUTABLE_AND_TEST(
    TrustRegion
    SOURCES test_02.cpp
    ARGS PrintItAll
@@ -21,7 +21,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
    ADD_DIR_TO_NAME
    )
  
- TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ ROL_ADD_EXECUTABLE_AND_TEST(
    BoxConstrained_LineSearch
    SOURCES test_03.cpp
    ARGS PrintItAll
@@ -30,7 +30,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
    ADD_DIR_TO_NAME
    )
  
- TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ ROL_ADD_EXECUTABLE_AND_TEST(
    BoxConstrained_TrustRegion
    SOURCES test_04.cpp
    ARGS PrintItAll
@@ -39,7 +39,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
    ADD_DIR_TO_NAME
    )
  
- TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ ROL_ADD_EXECUTABLE_AND_TEST(
    BoxConstrained_PrimalDualActiveSet
    SOURCES test_05.cpp
    ARGS PrintItAll
@@ -48,7 +48,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
    ADD_DIR_TO_NAME
    )
  
- TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ ROL_ADD_EXECUTABLE_AND_TEST(
    LineSearchTypes
    SOURCES test_06.cpp
    ARGS PrintItAll
@@ -57,43 +57,51 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
    ADD_DIR_TO_NAME
    )
  
- #TRIBITS_ADD_EXECUTABLE_AND_TEST(
- #  InteriorPoint
- #  SOURCES test_07.cpp
- #  ARGS PrintItAll
- #  NUM_MPI_PROCS 1
- #  PASS_REGULAR_EXPRESSION "TEST PASSED"
- #  ADD_DIR_TO_NAME
- #  )
- 
- #TRIBITS_ADD_EXECUTABLE_AND_TEST(
- #  InteriorPointIneq
- #  SOURCES test_08.cpp
- #  ARGS PrintItAll
- #  NUM_MPI_PROCS 1
- #  PASS_REGULAR_EXPRESSION "TEST PASSED"
- #  ADD_DIR_TO_NAME
- #  )
- 
- #TRIBITS_ADD_EXECUTABLE_AND_TEST(
- #  InteriorPointBound
- #  SOURCES test_10.cpp
- #  ARGS PrintItAll
- #  NUM_MPI_PROCS 1
- #  PASS_REGULAR_EXPRESSION "TEST PASSED"
- #  ADD_DIR_TO_NAME
- #  )
- 
- #TRIBITS_ADD_EXECUTABLE_AND_TEST(
- #  InteriorPointBoundIneq
- #  SOURCES test_11.cpp
- #  ARGS PrintItAll
- #  NUM_MPI_PROCS 1
- #  PASS_REGULAR_EXPRESSION "TEST PASSED"
- #  ADD_DIR_TO_NAME
- #  )
- 
- TRIBITS_ADD_EXECUTABLE_AND_TEST(
+   # ROL_ADD_EXECUTABLE_AND_TEST(
+   #   InteriorPoint
+   #   SOURCES test_07.cpp
+   #   ARGS PrintItAll
+   #   NUM_MPI_PROCS 1
+   #   PASS_REGULAR_EXPRESSION "TEST PASSED"
+   #   ADD_DIR_TO_NAME
+   #   )
+ 
+   # ROL_ADD_EXECUTABLE_AND_TEST(
+   #   InteriorPointIneq
+   #   SOURCES test_08.cpp
+   #   ARGS PrintItAll
+   #   NUM_MPI_PROCS 1
+   #   PASS_REGULAR_EXPRESSION "TEST PASSED"
+   #   ADD_DIR_TO_NAME
+   #   )
+   #  ROL_ADD_EXECUTABLE_AND_TEST(
+   #   InteriorPointKKT
+   #   SOURCES test_09.cpp
+   #   ARGS PrintItAll
+   #   NUM_MPI_PROCS 1
+   #   PASS_REGULAR_EXPRESSION "TEST PASSED"
+   #   ADD_DIR_TO_NAME
+   #   )
+   # 
+   # ROL_ADD_EXECUTABLE_AND_TEST(
+   #   InteriorPointBound
+   #   SOURCES test_10.cpp
+   #   ARGS PrintItAll
+   #   NUM_MPI_PROCS 1
+   #   PASS_REGULAR_EXPRESSION "TEST PASSED"
+   #   ADD_DIR_TO_NAME
+   #   )
+   # 
+   # ROL_ADD_EXECUTABLE_AND_TEST(
+   #   InteriorPointBoundIneq
+   #   SOURCES test_11.cpp
+   #   ARGS PrintItAll
+   #   NUM_MPI_PROCS 1
+   #   PASS_REGULAR_EXPRESSION "TEST PASSED"
+   #   ADD_DIR_TO_NAME
+   #   )
+ 
+ ROL_ADD_EXECUTABLE_AND_TEST(
    StepNotImplemented
    SOURCES test_12.cpp
    ARGS PrintItAll
@@ -102,16 +110,8 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
    ADD_DIR_TO_NAME
    )
  
- #TRIBITS_ADD_EXECUTABLE_AND_TEST(
- #  InteriorPointKKT
- #  SOURCES test_09.cpp
- #  ARGS PrintItAll
- #  NUM_MPI_PROCS 1
- #  PASS_REGULAR_EXPRESSION "TEST PASSED"
- #  ADD_DIR_TO_NAME
- #  )
- 
- TRIBITS_ADD_EXECUTABLE_AND_TEST(
+
+ ROL_ADD_EXECUTABLE_AND_TEST(
    InteriorPointStep
    SOURCES test_13.cpp
    ARGS PrintItAll
@@ -120,7 +120,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
    ADD_DIR_TO_NAME
    )
  
- TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ ROL_ADD_EXECUTABLE_AND_TEST(
    MoreauYosidaPenaltyStep
    SOURCES test_14.cpp
    ARGS PrintItAll
@@ -129,7 +129,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
    ADD_DIR_TO_NAME
    )
  
- TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ ROL_ADD_EXECUTABLE_AND_TEST(
    BoxConstrained_LM_TrustRegion
    SOURCES test_15.cpp
    ARGS PrintItAll
@@ -138,7 +138,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
    ADD_DIR_TO_NAME
    )
  
- TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ ROL_ADD_EXECUTABLE_AND_TEST(
    AugmentedLagrangianStep
    SOURCES test_16.cpp
    ARGS PrintItAll
@@ -147,7 +147,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
    ADD_DIR_TO_NAME
    )
  
- TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ ROL_ADD_EXECUTABLE_AND_TEST(
    FletcherStep
    SOURCES test_17.cpp
    ARGS PrintItAll
@@ -156,7 +156,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
    ADD_DIR_TO_NAME
    )
  
- TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ ROL_ADD_EXECUTABLE_AND_TEST(
    CubicTest
    SOURCES test_18.cpp
    ARGS PrintItAll
@@ -165,7 +165,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
    ADD_DIR_TO_NAME
    )
  
- TRIBITS_COPY_FILES_TO_BINARY_DIR(
+ ROL_COPY_FILES_TO_BINARY_DIR(
    StepTestDataCopy
    SOURCE_FILES
      input.xml
diff --git a/packages/rol/test/step/fletcher/CMakeLists.txt b/packages/rol/test/step/fletcher/CMakeLists.txt
index d374769d4d4d..4af80c2ee5f7 100644
--- a/packages/rol/test/step/fletcher/CMakeLists.txt
+++ b/packages/rol/test/step/fletcher/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   FLETCHER
   SOURCES test_01.cpp
   ARGS PrintItAll
@@ -8,7 +8,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   BOUNDFLETCHER
   SOURCES test_02.cpp
   ARGS PrintItAll
@@ -17,7 +17,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   ALLPROBLEMS
   SOURCES test_03.cpp
   ARGS PrintItAll
@@ -26,7 +26,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
- TRIBITS_COPY_FILES_TO_BINARY_DIR(
+ ROL_COPY_FILES_TO_BINARY_DIR(
    StepFletcherTestDataCopy
    SOURCE_FILES
      input.xml input_ex02.xml input_ex03.xml
diff --git a/packages/rol/test/step/fletcher/test_01.cpp b/packages/rol/test/step/fletcher/test_01.cpp
index 9ff34c647485..18523690982b 100644
--- a/packages/rol/test/step/fletcher/test_01.cpp
+++ b/packages/rol/test/step/fletcher/test_01.cpp
@@ -18,7 +18,7 @@
 #include "ROL_Fletcher.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 
 #include <iostream>
@@ -27,7 +27,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   std::string filename = "input.xml";
   
diff --git a/packages/rol/test/step/fletcher/test_02.cpp b/packages/rol/test/step/fletcher/test_02.cpp
index 92508a715427..0d9a93dd7106 100644
--- a/packages/rol/test/step/fletcher/test_02.cpp
+++ b/packages/rol/test/step/fletcher/test_02.cpp
@@ -19,7 +19,7 @@
 #include "ROL_OptimizationSolver.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 
 typedef double RealT;
@@ -31,7 +31,7 @@ int main(int argc, char *argv[]) {
   typedef ROL::StdVector<RealT>         SV;
   typedef ROL::Ptr<ROL::Vector<RealT> > PtrV;
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   std::string filename = "input_ex02.xml";
   
diff --git a/packages/rol/test/step/fletcher/test_03.cpp b/packages/rol/test/step/fletcher/test_03.cpp
index 1073c10a0a07..82dc226a747a 100644
--- a/packages/rol/test/step/fletcher/test_03.cpp
+++ b/packages/rol/test/step/fletcher/test_03.cpp
@@ -15,7 +15,7 @@
 #include "ROL_GetTestProblems.hpp"
 #include "ROL_OptimizationSolver.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 
 #include <iostream>
@@ -24,7 +24,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/step/interiorpoint/CMakeLists.txt b/packages/rol/test/step/interiorpoint/CMakeLists.txt
index 6c787af72a60..57a9da675b52 100644
--- a/packages/rol/test/step/interiorpoint/CMakeLists.txt
+++ b/packages/rol/test/step/interiorpoint/CMakeLists.txt
@@ -1,7 +1,7 @@
 TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/sacado/src/function)
 TRIBITS_INCLUDE_DIRECTORIES(${${PACKAGE_NAME}_SOURCE_DIR}/adapters/sacado/test/HS)
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   BarrierObjective  
   SOURCES test_01.cpp
   ARGS PrintItAll
@@ -11,7 +11,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   )
 
 IF(${PROJECT_NAME}_ENABLE_Sacado)
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   PrimalDualResidual
   SOURCES test_02.cpp
   ARGS PrintItAll
@@ -20,7 +20,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   PrimalDualNewtonKrylov
   SOURCES test_03.cpp
   ARGS PrintItAll
diff --git a/packages/rol/test/step/interiorpoint/test_01.cpp b/packages/rol/test/step/interiorpoint/test_01.cpp
index 7dea14bfc033..fc09b0f69ea7 100644
--- a/packages/rol/test/step/interiorpoint/test_01.cpp
+++ b/packages/rol/test/step/interiorpoint/test_01.cpp
@@ -7,9 +7,9 @@
 // *****************************************************************************
 // @HEADER
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "ROL_InteriorPointPenalty.hpp"
 #include "ROL_RandomVector.hpp"
@@ -66,7 +66,7 @@ int main(int argc, char *argv[]) {
 
   typedef ROL::ParameterList      PL;
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   int iprint = argc - 1;
   ROL::Ptr<std::ostream> outStream;
diff --git a/packages/rol/test/step/interiorpoint/test_02.cpp b/packages/rol/test/step/interiorpoint/test_02.cpp
index af9228dd4d42..23c58aadd31b 100644
--- a/packages/rol/test/step/interiorpoint/test_02.cpp
+++ b/packages/rol/test/step/interiorpoint/test_02.cpp
@@ -9,7 +9,7 @@
 
 #define OPTIMIZATION_PROBLEM_REFACTOR
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "ROL_RandomVector.hpp"
 #include "ROL_StdVector.hpp"
@@ -191,7 +191,7 @@ int main(int argc, char *argv[]) {
 
    
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   int iprint = argc - 1;
   ROL::Ptr<std::ostream> outStream;
diff --git a/packages/rol/test/step/interiorpoint/test_03.cpp b/packages/rol/test/step/interiorpoint/test_03.cpp
index 09e03ab8161e..7e4c70c00dd7 100644
--- a/packages/rol/test/step/interiorpoint/test_03.cpp
+++ b/packages/rol/test/step/interiorpoint/test_03.cpp
@@ -9,7 +9,7 @@
 
 #define OPTIMIZATION_PROBLEM_REFACTOR
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "ROL_RandomVector.hpp"
 #include "ROL_StdVector.hpp"
@@ -112,7 +112,7 @@ int main(int argc, char *argv[]) {
 
    
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   int iprint = argc - 1;
   ROL::Ptr<std::ostream> outStream;
diff --git a/packages/rol/test/step/krylov/CMakeLists.txt b/packages/rol/test/step/krylov/CMakeLists.txt
index ad8807dc2dea..bc8fe4024412 100644
--- a/packages/rol/test/step/krylov/CMakeLists.txt
+++ b/packages/rol/test/step/krylov/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   GMRES
   SOURCES test_01.cpp
   ARGS PrintItAll
@@ -8,7 +8,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-#TRIBITS_ADD_EXECUTABLE_AND_TEST(
+#ROL_ADD_EXECUTABLE_AND_TEST(
 #  Lanczos
 #  SOURCES test_02.cpp
 #  ARGS PrintItAll
@@ -17,7 +17,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
 #  ADD_DIR_TO_NAME
 #  )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   MINRES
   SOURCES test_03.cpp
   ARGS PrintItAll
diff --git a/packages/rol/test/step/krylov/test_01.cpp b/packages/rol/test/step/krylov/test_01.cpp
index a01938795954..e85be714529a 100644
--- a/packages/rol/test/step/krylov/test_01.cpp
+++ b/packages/rol/test/step/krylov/test_01.cpp
@@ -18,7 +18,7 @@
 #include "ROL_ParameterList.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include<iomanip>
 
@@ -128,7 +128,7 @@ int main(int argc, char *argv[]) {
 
   typedef typename vector::size_type    luint;
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   int iprint     = argc - 1;
   ROL::Ptr<std::ostream> outStream;
diff --git a/packages/rol/test/step/krylov/test_02.cpp b/packages/rol/test/step/krylov/test_02.cpp
index 94331e424984..12c93f3ddcc7 100644
--- a/packages/rol/test/step/krylov/test_02.cpp
+++ b/packages/rol/test/step/krylov/test_02.cpp
@@ -15,7 +15,7 @@
 #include "ROL_Lanczos.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include<iomanip>
 
@@ -31,7 +31,7 @@ int main(int argc, char *argv[]) {
 
   typedef typename vector::size_type    uint;
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   int iprint     = argc - 1;
   ROL::Ptr<std::ostream> outStream;
diff --git a/packages/rol/test/step/krylov/test_03.cpp b/packages/rol/test/step/krylov/test_03.cpp
index 4a259a2733ab..8efd4c9cbe78 100644
--- a/packages/rol/test/step/krylov/test_03.cpp
+++ b/packages/rol/test/step/krylov/test_03.cpp
@@ -17,7 +17,7 @@
 #include "ROL_RandomVector.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include<iomanip>
 
@@ -122,7 +122,7 @@ int main(int argc, char *argv[]) {
 
   typedef typename vector::size_type    luint;
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   int iprint     = argc - 1;
   ROL::Ptr<std::ostream> outStream;
diff --git a/packages/rol/test/step/test_01.cpp b/packages/rol/test/step/test_01.cpp
index f7b5481613ad..c21614bcce43 100644
--- a/packages/rol/test/step/test_01.cpp
+++ b/packages/rol/test/step/test_01.cpp
@@ -16,7 +16,7 @@
 #include "ROL_GetTestProblems.hpp"
 #include "ROL_OptimizationSolver.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 
 #include <iostream>
@@ -25,7 +25,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/step/test_02.cpp b/packages/rol/test/step/test_02.cpp
index 04fed1fe0376..0ec8c7a461b6 100644
--- a/packages/rol/test/step/test_02.cpp
+++ b/packages/rol/test/step/test_02.cpp
@@ -16,7 +16,7 @@
 #include "ROL_GetTestProblems.hpp"
 #include "ROL_OptimizationSolver.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 
 #include <iostream>
@@ -25,7 +25,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/step/test_03.cpp b/packages/rol/test/step/test_03.cpp
index 09d87e63c830..940d208e96f4 100644
--- a/packages/rol/test/step/test_03.cpp
+++ b/packages/rol/test/step/test_03.cpp
@@ -16,7 +16,7 @@
 #include "ROL_GetTestProblems.hpp"
 #include "ROL_OptimizationSolver.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 
 #include <iostream>
@@ -25,7 +25,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/step/test_04.cpp b/packages/rol/test/step/test_04.cpp
index 951881c8d49b..2b0eab9784b6 100644
--- a/packages/rol/test/step/test_04.cpp
+++ b/packages/rol/test/step/test_04.cpp
@@ -16,7 +16,7 @@
 #include "ROL_GetTestProblems.hpp"
 #include "ROL_OptimizationSolver.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 
 #include <iostream>
@@ -27,7 +27,7 @@ typedef double RealT;
 int main(int argc, char *argv[]) {
   //feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/step/test_05.cpp b/packages/rol/test/step/test_05.cpp
index 97c0fecda9e3..c23ca02056b9 100644
--- a/packages/rol/test/step/test_05.cpp
+++ b/packages/rol/test/step/test_05.cpp
@@ -16,7 +16,7 @@
 #include "ROL_GetTestProblems.hpp"
 #include "ROL_OptimizationSolver.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 
 #include <iostream>
@@ -25,7 +25,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/step/test_06.cpp b/packages/rol/test/step/test_06.cpp
index 71a0e4af8545..be3f32dc9cd5 100644
--- a/packages/rol/test/step/test_06.cpp
+++ b/packages/rol/test/step/test_06.cpp
@@ -16,7 +16,7 @@
 #include "ROL_GetTestProblems.hpp"
 #include "ROL_OptimizationSolver.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 
 #include <iostream>
@@ -25,7 +25,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/step/test_07.cpp b/packages/rol/test/step/test_07.cpp
index 49fc7f3e73b4..20a19466be2b 100644
--- a/packages/rol/test/step/test_07.cpp
+++ b/packages/rol/test/step/test_07.cpp
@@ -11,7 +11,7 @@
     \brief Interior Point test using Hock & Schittkowski problem 32.
 */
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "ROL_HS32.hpp"
 #include "ROL_Algorithm.hpp"
@@ -31,7 +31,7 @@ int main(int argc, char *argv[]) {
 //  typedef ROL::PartitionedVector<RealT> PV;
 
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   int iprint     = argc - 1;
   ROL::Ptr<std::ostream> outStream;
diff --git a/packages/rol/test/step/test_08.cpp b/packages/rol/test/step/test_08.cpp
index d802b4ba5e21..978a1c7a83a1 100644
--- a/packages/rol/test/step/test_08.cpp
+++ b/packages/rol/test/step/test_08.cpp
@@ -11,7 +11,7 @@
     \brief Interior Point test using Hock & Schittkowski problem 29.
 */
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "ROL_HS29.hpp"
 #include "ROL_Algorithm.hpp"
@@ -25,9 +25,8 @@ int main(int argc, char *argv[]) {
 
   typedef std::vector<RealT>            vec;
   typedef ROL::StdVector<RealT>         SV;
-  typedef ROL::Ptr<ROL::Vector<RealT> >      ROL::PtrV;
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   int iprint     = argc - 1;
   ROL::Ptr<std::ostream> outStream;
@@ -55,8 +54,8 @@ int main(int argc, char *argv[]) {
     using ROL::ZOO::Objective_HS29;
     using ROL::ZOO::InequalityConstraint_HS29;
     
-    ROL::Ptr<ROL::Objective<RealT> >             obj_hs29 = ROL::makePtr<Objective_HS29<RealT>>();
-    ROL::Ptr<ROL::InequalityConstraint<RealT> >  incon_hs29 = ROL::makePtr<InequalityConstraint_HS29<RealT>>();
+    auto obj_hs29 = ROL::makePtr<Objective_HS29<RealT>>();
+    auto incon_hs29 = ROL::makePtr<InequalityConstraint_HS29<RealT>>();
 
     
     std::string stepname = "Interior Point"; 
diff --git a/packages/rol/test/step/test_09.cpp b/packages/rol/test/step/test_09.cpp
index 912ee253ee4d..617f574bc433 100644
--- a/packages/rol/test/step/test_09.cpp
+++ b/packages/rol/test/step/test_09.cpp
@@ -11,7 +11,7 @@
     \brief Test of Primal Dual Interior Point KKT system
 */
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "ROL_HS32.hpp" 
 #include "ROL_InteriorPointPrimalDualResidual.hpp"
@@ -37,7 +37,7 @@ int main(int argc, char *argv[]) {
   
   
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   int iprint     = argc - 1;
   ROL::Ptr<std::ostream> outStream;
diff --git a/packages/rol/test/step/test_10.cpp b/packages/rol/test/step/test_10.cpp
index fded9e18659d..5378b0ef6221 100644
--- a/packages/rol/test/step/test_10.cpp
+++ b/packages/rol/test/step/test_10.cpp
@@ -18,7 +18,7 @@
 #include "ROL_Stream.hpp"
 #include "ROL_InteriorPointStep.hpp"
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 
@@ -26,7 +26,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/step/test_11.cpp b/packages/rol/test/step/test_11.cpp
index 7ab5d1455c02..3f8d2eff3fe3 100644
--- a/packages/rol/test/step/test_11.cpp
+++ b/packages/rol/test/step/test_11.cpp
@@ -11,7 +11,7 @@
     \brief Interior Point test using Hock & Schittkowski problem 24.
 */
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "ROL_HS24.hpp"
 #include "ROL_Algorithm.hpp"
@@ -27,11 +27,10 @@ int main(int argc, char *argv[]) {
   typedef ROL::Vector<RealT>               V;
   typedef ROL::BoundConstraint<RealT>      BC;
   typedef ROL::Objective<RealT>            OBJ;
-  typedef ROL::InequalityConstraint<RealT> INEQ; 
 
    
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   int iprint     = argc - 1;
   ROL::Ptr<std::ostream> outStream;
@@ -46,14 +45,14 @@ int main(int argc, char *argv[]) {
   try { 
 
     ROL::ZOO::getHS24<RealT> HS24;
-    ROL::Ptr<V>    x     = HS24.getInitialGuess();
-    ROL::Ptr<V>    xs    = HS24.getSolution();
-    ROL::Ptr<V>    inmul = HS24.getInequalityMultiplier();    
-
-    ROL::Ptr<BC>   bnd   = HS24.getBoundConstraint();
-    ROL::Ptr<OBJ>  obj   = HS24.getObjective();
-    ROL::Ptr<INEQ> incon = HS24.getInequalityConstraint();
-    ROL::Ptr<BC>   inbnd = HS24.getSlackBoundConstraint();
+    auto x     = HS24.getInitialGuess();
+    auto xs    = HS24.getSolution();
+    auto inmul = HS24.getInequalityMultiplier();    
+
+    auto bnd   = HS24.getBoundConstraint();
+    auto obj   = HS24.getObjective();
+    auto incon = HS24.getInequalityConstraint();
+    auto inbnd = HS24.getSlackBoundConstraint();
    
     
 
diff --git a/packages/rol/test/step/test_12.cpp b/packages/rol/test/step/test_12.cpp
index 1ed85e9a5650..12a75ec24dcd 100644
--- a/packages/rol/test/step/test_12.cpp
+++ b/packages/rol/test/step/test_12.cpp
@@ -13,7 +13,7 @@
 
 #define USE_HESSVEC 0
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "ROL_Step.hpp"
 #include "ROL_GetTestProblems.hpp"
@@ -24,7 +24,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/step/test_13.cpp b/packages/rol/test/step/test_13.cpp
index 9fbf78d8390e..2306619e5379 100644
--- a/packages/rol/test/step/test_13.cpp
+++ b/packages/rol/test/step/test_13.cpp
@@ -14,7 +14,7 @@
 #include "ROL_GetTestProblems.hpp"
 #include "ROL_OptimizationSolver.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 
 #include <iostream>
@@ -23,7 +23,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/step/test_14.cpp b/packages/rol/test/step/test_14.cpp
index 5f74b4c4e6c9..2aa808b2e771 100644
--- a/packages/rol/test/step/test_14.cpp
+++ b/packages/rol/test/step/test_14.cpp
@@ -14,7 +14,7 @@
 #include "ROL_GetTestProblems.hpp"
 #include "ROL_OptimizationSolver.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 
 #include <iostream>
@@ -23,7 +23,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/step/test_15.cpp b/packages/rol/test/step/test_15.cpp
index 75325150337c..d7fd96f14f94 100644
--- a/packages/rol/test/step/test_15.cpp
+++ b/packages/rol/test/step/test_15.cpp
@@ -16,7 +16,7 @@
 #include "ROL_GetTestProblems.hpp"
 #include "ROL_OptimizationSolver.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 //#include <fenv.h>
@@ -26,7 +26,7 @@ typedef double RealT;
 int main(int argc, char *argv[]) {
   //feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/step/test_16.cpp b/packages/rol/test/step/test_16.cpp
index ecf534132c15..06c50d8ec642 100644
--- a/packages/rol/test/step/test_16.cpp
+++ b/packages/rol/test/step/test_16.cpp
@@ -14,8 +14,7 @@
 #include "ROL_GetTestProblems.hpp"
 #include "ROL_OptimizationSolver.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 
@@ -23,7 +22,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/step/test_17.cpp b/packages/rol/test/step/test_17.cpp
index e1e20ebb97cd..69b2dff62883 100644
--- a/packages/rol/test/step/test_17.cpp
+++ b/packages/rol/test/step/test_17.cpp
@@ -14,8 +14,7 @@
 #include "ROL_GetTestProblems.hpp"
 #include "ROL_OptimizationSolver.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 
@@ -23,7 +22,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/step/test_18.cpp b/packages/rol/test/step/test_18.cpp
index 374205b727f3..b8c98fea6400 100644
--- a/packages/rol/test/step/test_18.cpp
+++ b/packages/rol/test/step/test_18.cpp
@@ -14,8 +14,7 @@
 #include "ROL_GetTestProblems.hpp"
 #include "ROL_OptimizationSolver.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
-#include "Teuchos_XMLParameterListHelpers.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 
@@ -23,7 +22,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/step/trustregion/CMakeLists.txt b/packages/rol/test/step/trustregion/CMakeLists.txt
index fbee5778da5c..b1abf4b48d0c 100644
--- a/packages/rol/test/step/trustregion/CMakeLists.txt
+++ b/packages/rol/test/step/trustregion/CMakeLists.txt
@@ -1,4 +1,4 @@
-#TRIBITS_ADD_EXECUTABLE(
+#ROL_ADD_EXECUTABLE(
 #   ConicModel
 #   SOURCES test_01.cpp
 #   ARGS PrintItAll
diff --git a/packages/rol/test/step/trustregion/test_01.cpp b/packages/rol/test/step/trustregion/test_01.cpp
index 81749e4096a5..919f8819612d 100644
--- a/packages/rol/test/step/trustregion/test_01.cpp
+++ b/packages/rol/test/step/trustregion/test_01.cpp
@@ -12,7 +12,7 @@
 */
 
 #include "ROL_OptimizationSolver.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "ROL_Stream.hpp"
 #include "ROL_RandomVector.hpp"
@@ -27,7 +27,7 @@ int main(int argc, char *argv[]) {
   using RealT = double;
   using namespace ROL;
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   auto outStream = makeStreamPtr( std::cout, argc > 1 );
diff --git a/packages/rol/test/utils/CMakeLists.txt b/packages/rol/test/utils/CMakeLists.txt
index 3b5c5e7c0315..dcd2b7451691 100644
--- a/packages/rol/test/utils/CMakeLists.txt
+++ b/packages/rol/test/utils/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   ScalarMinimizationTest01
   SOURCES test_01.cpp
   ARGS PrintItAll
@@ -8,7 +8,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   ScalarMinimizationTest02
   SOURCES test_02.cpp
   ARGS PrintItAll
@@ -17,13 +17,12 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+# Uses SerialDenseVector
+ROL_ADD_EXECUTABLE_AND_TEST(
   FiniteDifferenceTest
   SOURCES test_03.cpp
   ARGS PrintItAll
   NUM_MPI_PROCS 1
   PASS_REGULAR_EXPRESSION "TEST PASSED"
   ADD_DIR_TO_NAME
-  )
-
-
+)
diff --git a/packages/rol/test/utils/function_bindings/test_01.cpp b/packages/rol/test/utils/function_bindings/test_01.cpp
index bac64259841a..284a5fbeb7dd 100644
--- a/packages/rol/test/utils/function_bindings/test_01.cpp
+++ b/packages/rol/test/utils/function_bindings/test_01.cpp
@@ -11,7 +11,7 @@
     \brief Test scalar minimization algorithms on test 01.
 */
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "ROL_ScalarMinimizationTest01.hpp"
 
@@ -19,7 +19,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/utils/test_01.cpp b/packages/rol/test/utils/test_01.cpp
index 8e7f1127aa65..2d021cc35c28 100644
--- a/packages/rol/test/utils/test_01.cpp
+++ b/packages/rol/test/utils/test_01.cpp
@@ -10,9 +10,9 @@
 /*! \file  test_01.cpp
     \brief Test scalar minimization algorithms on test 01.
 */
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "ROL_ScalarMinimizationTest01.hpp"
 
@@ -20,7 +20,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/utils/test_02.cpp b/packages/rol/test/utils/test_02.cpp
index 0556ad12ded4..bc9a6846d466 100644
--- a/packages/rol/test/utils/test_02.cpp
+++ b/packages/rol/test/utils/test_02.cpp
@@ -11,9 +11,9 @@
     \brief Test creating a LineSearch using an externally provided 
            scalar minimization function. 
 */
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "ROL_Algorithm.hpp"
 #include "ROL_BisectionScalarMinimization.hpp"
@@ -27,7 +27,7 @@ typedef double RealT;
 
 int main(int argc, char *argv[] ) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
    
   using namespace ROL;
diff --git a/packages/rol/test/utils/test_03.cpp b/packages/rol/test/utils/test_03.cpp
index 069151ca08ac..ac296ecaf42b 100644
--- a/packages/rol/test/utils/test_03.cpp
+++ b/packages/rol/test/utils/test_03.cpp
@@ -11,7 +11,7 @@
     \brief Test of FiniteDifference class for performing derivative checks
 */
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "ROL_ValidateFunction.hpp"
 
@@ -32,7 +32,7 @@ int main(int argc, char *argv[]) {
   using std::cref;
   using namespace std::placeholders;
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   int iprint     = argc - 1;
   Ptr<std::ostream> os;
diff --git a/packages/rol/test/vector/CMakeLists.txt b/packages/rol/test/vector/CMakeLists.txt
index 63145a82739e..20d7e92e94d9 100644
--- a/packages/rol/test/vector/CMakeLists.txt
+++ b/packages/rol/test/vector/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   StdVectorInterface
   SOURCES test_01.cpp
   ARGS PrintItAll
@@ -9,7 +9,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   )
 
 # Moved to adapters/teuchos/test/vector
-#TRIBITS_ADD_EXECUTABLE_AND_TEST(
+#ROL_ADD_EXECUTABLE_AND_TEST(
 #  CArrayVectorInterface
 #  SOURCES test_02.cpp
 #  ARGS PrintItAll
@@ -20,7 +20,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
 
 
 # Moved to adapters/teuchos/test/vector
-#TRIBITS_ADD_EXECUTABLE_AND_TEST(
+#ROL_ADD_EXECUTABLE_AND_TEST(
 #  MultiVectorInterface
 #  SOURCES test_03.cpp
 #  ARGS PrintItAll
@@ -29,7 +29,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
 #  ADD_DIR_TO_NAME
 #  )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   PartitionedVectorInterface
   SOURCES test_04.cpp
   ARGS PrintItAll
@@ -38,7 +38,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   ScaledStdVectorInterface
   SOURCES test_05.cpp
   ARGS PrintItAll
@@ -47,17 +47,17 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
-  ProfiledVectorInterface
-  SOURCES test_06.cpp
-  ARGS PrintItAll
-  NUM_MPI_PROCS 1
-  PASS_REGULAR_EXPRESSION "TEST PASSED"
-  ADD_DIR_TO_NAME
-  )
+  ROL_ADD_EXECUTABLE_AND_TEST(
+    ProfiledVectorInterface
+    SOURCES test_06.cpp
+    ARGS PrintItAll
+    NUM_MPI_PROCS 1
+    PASS_REGULAR_EXPRESSION "TEST PASSED"
+    ADD_DIR_TO_NAME
+    )
 
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   RieszVectorInterface
   SOURCES test_07.cpp
   ARGS PrintItAll
@@ -67,7 +67,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   )
 
 # moved to adapters/mpi/test/vector
-#TRIBITS_ADD_EXECUTABLE_AND_TEST(
+#ROL_ADD_EXECUTABLE_AND_TEST(
 #  PinTVector
 #  SOURCES test_08.cpp
 #  ARGS PrintItAll
@@ -77,7 +77,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
 #  ADD_DIR_TO_NAME
 #  )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   VectorCloneInterface
   SOURCES test_09.cpp
   ARGS PrintItAll
@@ -86,13 +86,13 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE (
+ROL_ADD_EXECUTABLE (
   VectorWorkspace
   SOURCES test_10.cpp
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
+ROL_ADD_EXECUTABLE_AND_TEST(
   StdArrayInterface
   SOURCES test_11.cpp
   ARGS PrintItAll
@@ -101,7 +101,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   ADD_DIR_TO_NAME
   )
 
-TRIBITS_COPY_FILES_TO_BINARY_DIR(
+ROL_COPY_FILES_TO_BINARY_DIR(
   VectorTestDataCopy
   SOURCE_FILES
     parameters.xml
diff --git a/packages/rol/test/vector/test_01.cpp b/packages/rol/test/vector/test_01.cpp
index 21df3ebdf01e..cbfe7da956f0 100644
--- a/packages/rol/test/vector/test_01.cpp
+++ b/packages/rol/test/vector/test_01.cpp
@@ -14,7 +14,7 @@
 #include "ROL_StdVector.hpp"
 #include "ROL_Types.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 
@@ -23,7 +23,7 @@ typedef double ElementT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/vector/test_04.cpp b/packages/rol/test/vector/test_04.cpp
index c6fd8f004a56..ada5b0a54832 100644
--- a/packages/rol/test/vector/test_04.cpp
+++ b/packages/rol/test/vector/test_04.cpp
@@ -15,7 +15,7 @@
 #include "ROL_StdVector.hpp"
 #include "ROL_Types.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 typedef double RealT;
 
@@ -45,13 +45,11 @@ void print_vector( const ROL::Vector<Real> &x ) {
 
 int main(int argc, char *argv[]) {
 
-  using namespace Teuchos;
-
   typedef ROL::Vector<RealT>            V;
   typedef ROL::StdVector<RealT>         SV;
   typedef ROL::PartitionedVector<RealT> PV;
 
-  GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   int iprint = argc - 1;
 
diff --git a/packages/rol/test/vector/test_05.cpp b/packages/rol/test/vector/test_05.cpp
index b570f0ad9d56..f6218a7707ef 100644
--- a/packages/rol/test/vector/test_05.cpp
+++ b/packages/rol/test/vector/test_05.cpp
@@ -15,14 +15,14 @@
 #include "ROL_ScaledStdVector.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 typedef double RealT;
 typedef double ElementT;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv,0);
+  ROL::GlobalMPISession mpiSession(&argc, &argv,0);
 
   int iprint = argc - 1;
   ROL::nullstream bhs; // outputs nothing
diff --git a/packages/rol/test/vector/test_06.cpp b/packages/rol/test/vector/test_06.cpp
index 43f6e39affb5..895024de647b 100644
--- a/packages/rol/test/vector/test_06.cpp
+++ b/packages/rol/test/vector/test_06.cpp
@@ -19,7 +19,7 @@
 #include "ROL_StatusTest.hpp"
 #include "ROL_Stream.hpp"
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 typedef int    OrdinalT;
 typedef double RealT;
@@ -38,7 +38,7 @@ int main(int argc, char *argv[]) {
   typedef ROL::Vector<RealT>                  V;
   typedef ROL::StdVector<RealT>               SV;
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv,0);
+  ROL::GlobalMPISession mpiSession(&argc, &argv,0);
 
   int iprint     = argc - 1;
   ROL::Ptr<std::ostream> outStream;
diff --git a/packages/rol/test/vector/test_07.cpp b/packages/rol/test/vector/test_07.cpp
index 2c413b5e3aa5..ba0162895e76 100644
--- a/packages/rol/test/vector/test_07.cpp
+++ b/packages/rol/test/vector/test_07.cpp
@@ -19,7 +19,7 @@
 #include "ROL_DiagonalOperator.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 typedef double RealT;
 
@@ -38,7 +38,7 @@ int main(int argc, char *argv[]) {
   typedef ROL::RieszDualVector<RealT>    DualVector;
 
   
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv,0);
+  ROL::GlobalMPISession mpiSession(&argc, &argv,0);
 
   int iprint = argc - 1;
   ROL::nullstream bhs; // outputs nothing
diff --git a/packages/rol/test/vector/test_09.cpp b/packages/rol/test/vector/test_09.cpp
index d12f57347b78..44927d5da020 100644
--- a/packages/rol/test/vector/test_09.cpp
+++ b/packages/rol/test/vector/test_09.cpp
@@ -8,7 +8,7 @@
 // @HEADER
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "ROL_Objective.hpp"
 
@@ -69,7 +69,7 @@ class TestMulti {
 
 int main( int argc, char* argv[] ) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   int iprint     = argc - 1;
diff --git a/packages/rol/test/vector/test_10.cpp b/packages/rol/test/vector/test_10.cpp
index a3273b208376..cfe930c8c139 100644
--- a/packages/rol/test/vector/test_10.cpp
+++ b/packages/rol/test/vector/test_10.cpp
@@ -7,7 +7,7 @@
 // *****************************************************************************
 // @HEADER
 
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include "ROL_Objective.hpp"
 
@@ -60,7 +60,7 @@ int main( int argc, char* argv[] ) {
 
   using namespace ROL;
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   // This little trick lets us print to std::cout only if a (dummy) command-line argument is provided.
   auto outStream = makeStreamPtr( std::cout, argc > 1 );
diff --git a/packages/rol/test/vector/test_11.cpp b/packages/rol/test/vector/test_11.cpp
index f57b036ff9a1..524b1d8fb207 100644
--- a/packages/rol/test/vector/test_11.cpp
+++ b/packages/rol/test/vector/test_11.cpp
@@ -14,7 +14,7 @@
 #include "ROL_StdArray.hpp"
 #include "ROL_StdVector.hpp"
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 #include <iostream>
 
@@ -24,7 +24,7 @@ constexpr auto dim = 100u;
 
 int main(int argc, char *argv[]) {
 
-  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
+  ROL::GlobalMPISession mpiSession(&argc, &argv);
 
   ROL::StdArray<RealT,dim>::initialize_pool();
 
diff --git a/packages/rol/tutorial/example_ineq_bnd.cpp b/packages/rol/tutorial/example_ineq_bnd.cpp
index b57aa9fdc6d8..701c39e70323 100644
--- a/packages/rol/tutorial/example_ineq_bnd.cpp
+++ b/packages/rol/tutorial/example_ineq_bnd.cpp
@@ -21,7 +21,7 @@
 #include "ROL_Bounds.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 
 /* OBJECTIVE FUNCTION */
diff --git a/packages/rol/tutorial/example_unc.cpp b/packages/rol/tutorial/example_unc.cpp
index 266e43b7e947..547aa41fab3e 100644
--- a/packages/rol/tutorial/example_unc.cpp
+++ b/packages/rol/tutorial/example_unc.cpp
@@ -19,7 +19,7 @@
 #include "ROL_StdObjective.hpp"
 
 #include "ROL_Stream.hpp"
-#include "Teuchos_GlobalMPISession.hpp"
+#include "ROL_GlobalMPISession.hpp"
 
 
 /* OBJECTIVE FUNCTION */