diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6bd5cc39fe..24f4baa097 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -30,13 +30,13 @@ project(sundials C)
 # Set some variables with info on the SUNDIALS project
 set(PACKAGE_BUGREPORT "woodward6@llnl.gov")
 set(PACKAGE_NAME "SUNDIALS")
-set(PACKAGE_STRING "SUNDIALS 5.2.0")
+set(PACKAGE_STRING "SUNDIALS 5.3.0")
 set(PACKAGE_TARNAME "sundials")
 
 # set SUNDIALS version numbers
 # (use "" for the version label if none is needed)
 set(PACKAGE_VERSION_MAJOR "5")
-set(PACKAGE_VERSION_MINOR "2")
+set(PACKAGE_VERSION_MINOR "3")
 set(PACKAGE_VERSION_PATCH "0")
 set(PACKAGE_VERSION_LABEL "")
 
@@ -63,37 +63,37 @@ mark_as_advanced(CLEAR
 
 # Specify the VERSION and SOVERSION for shared libraries
 
-set(arkodelib_VERSION "4.2.0")
+set(arkodelib_VERSION "4.3.0")
 set(arkodelib_SOVERSION "4")
 
-set(cvodelib_VERSION "5.2.0")
+set(cvodelib_VERSION "5.3.0")
 set(cvodelib_SOVERSION "5")
 
-set(cvodeslib_VERSION "5.2.0")
+set(cvodeslib_VERSION "5.3.0")
 set(cvodeslib_SOVERSION "5")
 
-set(idalib_VERSION "5.2.0")
+set(idalib_VERSION "5.3.0")
 set(idalib_SOVERSION "5")
 
-set(idaslib_VERSION "4.2.0")
+set(idaslib_VERSION "4.3.0")
 set(idaslib_SOVERSION "4")
 
-set(kinsollib_VERSION "5.2.0")
+set(kinsollib_VERSION "5.3.0")
 set(kinsollib_SOVERSION "5")
 
 set(cpodeslib_VERSION "0.0.0")
 set(cpodeslib_SOVERSION "0")
 
-set(nveclib_VERSION "5.2.0")
+set(nveclib_VERSION "5.3.0")
 set(nveclib_SOVERSION "5")
 
-set(sunmatrixlib_VERSION "3.2.0")
+set(sunmatrixlib_VERSION "3.3.0")
 set(sunmatrixlib_SOVERSION "3")
 
-set(sunlinsollib_VERSION "3.2.0")
+set(sunlinsollib_VERSION "3.3.0")
 set(sunlinsollib_SOVERSION "3")
 
-set(sunnonlinsollib_VERSION "2.2.0")
+set(sunnonlinsollib_VERSION "2.3.0")
 set(sunnonlinsollib_SOVERSION "2")
 
 # Specify the location of additional CMAKE modules
@@ -232,6 +232,13 @@ show_variable(SUNDIALS_INDEX_TYPE STRING "${DOCSTR}" "")
 mark_as_advanced(SUNDIALS_INDEX_TYPE)
 include(SundialsIndexSize)
 
+# ---------------------------------------------------------------
+# Option to enable the simulation monitoring capabilities
+# ---------------------------------------------------------------
+
+set(DOCSTR "Build with simulation monitoring capabilities enabled")
+sundials_option(SUNDIALS_BUILD_WITH_MONITORING BOOL "${DOCSTR}" OFF)
+
 # ---------------------------------------------------------------
 # Enable Fortran interface?
 # ---------------------------------------------------------------
@@ -858,6 +865,17 @@ endif(CUDA_ENABLE)
 # Now that all languages are setup, we can configure them more.
 # ---------------------------------------------------------------
 
+# ---------------------------------------------------------------
+# Option to use specialized fused kernels (currently CVODE only). User-settable
+# (default FALSE) when CUDA and CVODE are enabled; otherwise FORCEd to FALSE.
+# ---------------------------------------------------------------
+
+if(CUDA_ENABLE AND CMAKE_CUDA_COMPILER AND BUILD_CVODE)
+  set(SUNDIALS_BUILD_PACKAGE_FUSED_KERNELS FALSE CACHE BOOL "Build specialized fused CUDA kernels")
+else()
+  set(SUNDIALS_BUILD_PACKAGE_FUSED_KERNELS FALSE CACHE BOOL "Build specialized fused CUDA kernels" FORCE)
+endif()
+
 # ---------------------------------------------------------------
 # Decide how to compile MPI codes. We must check for MPI if
 # MPI is enabled or if Trilinos is enabled because the Trilinos
diff --git a/INSTALL_GUIDE.pdf b/INSTALL_GUIDE.pdf
index 82f9cd87e2..dfcb21916e 100644
Binary files a/INSTALL_GUIDE.pdf and b/INSTALL_GUIDE.pdf differ
diff --git a/README.md b/README.md
index 781dc4d75a..18aa60f264 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
 # SUNDIALS: SUite of Nonlinear and DIfferential/ALgebraic equation Solvers #
-### Version 5.2.0 (Mar 2020) ###
+### Version 5.3.0 (May 2020) ###
 
 **Center for Applied Scientific Computing, Lawrence Livermore National Laboratory**
 
diff --git a/doc/arkode/ark_examples.pdf b/doc/arkode/ark_examples.pdf
index 35351fd2ef..1303ff3d0d 100644
Binary files a/doc/arkode/ark_examples.pdf and b/doc/arkode/ark_examples.pdf differ
diff --git a/doc/arkode/ark_guide.pdf b/doc/arkode/ark_guide.pdf
index 89382838d4..d3d78e4b5e 100644
Binary files a/doc/arkode/ark_guide.pdf and b/doc/arkode/ark_guide.pdf differ
diff --git a/doc/cvode/cv_examples.pdf b/doc/cvode/cv_examples.pdf
index 23c84191dd..2e25f21e0a 100644
Binary files a/doc/cvode/cv_examples.pdf and b/doc/cvode/cv_examples.pdf differ
diff --git a/doc/cvode/cv_guide.pdf b/doc/cvode/cv_guide.pdf
index ea9d2c5cec..8c7eecac22 100644
Binary files a/doc/cvode/cv_guide.pdf and b/doc/cvode/cv_guide.pdf differ
diff --git a/doc/cvodes/cvs_examples.pdf b/doc/cvodes/cvs_examples.pdf
index daa44e4bf2..0f7ceec7a8 100644
Binary files a/doc/cvodes/cvs_examples.pdf and b/doc/cvodes/cvs_examples.pdf differ
diff --git a/doc/cvodes/cvs_guide.pdf b/doc/cvodes/cvs_guide.pdf
index 0d8bb9680a..793e1fcb1d 100644
Binary files a/doc/cvodes/cvs_guide.pdf and b/doc/cvodes/cvs_guide.pdf differ
diff --git a/doc/ida/ida_examples.pdf b/doc/ida/ida_examples.pdf
index cdbee2c72d..2bd00076b2 100644
Binary files a/doc/ida/ida_examples.pdf and b/doc/ida/ida_examples.pdf differ
diff --git a/doc/ida/ida_guide.pdf b/doc/ida/ida_guide.pdf
index 48e708e395..371d860486 100644
Binary files a/doc/ida/ida_guide.pdf and b/doc/ida/ida_guide.pdf differ
diff --git a/doc/idas/idas_examples.pdf b/doc/idas/idas_examples.pdf
index ba0d8d4347..e601ff8716 100644
Binary files a/doc/idas/idas_examples.pdf and b/doc/idas/idas_examples.pdf differ
diff --git a/doc/idas/idas_guide.pdf b/doc/idas/idas_guide.pdf
index 4b72003d96..2cf26f9558 100644
Binary files a/doc/idas/idas_guide.pdf and b/doc/idas/idas_guide.pdf differ
diff --git a/doc/kinsol/kin_examples.pdf b/doc/kinsol/kin_examples.pdf
index 47d6d670be..97133def05 100644
Binary files a/doc/kinsol/kin_examples.pdf and b/doc/kinsol/kin_examples.pdf differ
diff --git a/doc/kinsol/kin_guide.pdf b/doc/kinsol/kin_guide.pdf
index 0d164150cd..25dbeb2938 100644
Binary files a/doc/kinsol/kin_guide.pdf and b/doc/kinsol/kin_guide.pdf differ
diff --git a/examples/arkode/C_serial/CMakeLists.txt b/examples/arkode/C_serial/CMakeLists.txt
index 88a0a585e8..0b7cbfcf23 100644
--- a/examples/arkode/C_serial/CMakeLists.txt
+++ b/examples/arkode/C_serial/CMakeLists.txt
@@ -15,29 +15,33 @@
 # CMakeLists.txt file for ARKODE serial examples
 # ---------------------------------------------------------------
 
-# Example lists are tuples "name\;type" where the type is
+# Example lists are tuples "name\;args\;type" where the type is
 # 'develop' for examples excluded from 'make test' in releases
 
 # Examples using SUNDIALS linear solvers
 set(ARKODE_examples
-  "ark_analytic\;"
-  "ark_analytic_nonlin\;develop"
-  "ark_brusselator\;develop"
-  "ark_brusselator_fp\;develop"
-  "ark_brusselator1D\;develop"
-  "ark_heat1D\;develop"
-  "ark_heat1D_adapt\;develop"
-  "ark_KrylovDemo_prec\;develop"
-  "ark_robertson\;develop"
-  "ark_robertson_constraints\;develop"
-  "ark_robertson_root\;develop"
-  "ark_brusselator_mri\;develop"
-  "ark_onewaycouple_mri\;develop"
-  "ark_twowaycouple_mri\;develop"
-  "ark_reaction_diffusion_mri\;develop"
-  "ark_brusselator_1D_mri\;develop"
+  "ark_analytic\;\;"
+  "ark_analytic_nonlin\;\;develop"
+  "ark_brusselator\;\;develop"
+  "ark_brusselator_fp\;\;develop"
+  "ark_brusselator1D\;\;develop"
+  "ark_heat1D\;\;develop"
+  "ark_heat1D_adapt\;\;develop"
+  "ark_KrylovDemo_prec\;\;develop"
+  "ark_robertson\;\;develop"
+  "ark_robertson_constraints\;\;develop"
+  "ark_robertson_root\;\;develop"
+  "ark_brusselator_mri\;\;develop"
+  "ark_onewaycouple_mri\;\;develop"
+  "ark_twowaycouple_mri\;\;develop"
+  "ark_reaction_diffusion_mri\;\;develop"
+  "ark_brusselator_1D_mri\;\;develop"
   )
 
+if(SUNDIALS_BUILD_WITH_MONITORING)
+  list(APPEND ARKODE_examples "ark_brusselator_fp\;1\;develop")
+endif()
+
 # Examples using LAPACK linear solvers
 set(ARKODE_examples_BL
   )
@@ -81,22 +85,34 @@ foreach(example_tuple ${ARKODE_examples})
 
   # parse the example tuple
   list(GET example_tuple 0 example)
-  list(GET example_tuple 1 example_type)
+  list(GET example_tuple 1 example_args)
+  list(GET example_tuple 2 example_type)
+
+  if (NOT TARGET ${example})
+    # example source files
+    add_executable(${example} ${example}.c)
+
+    # folder for IDEs
+    set_target_properties(${example} PROPERTIES FOLDER "Examples")
 
-  # example source files
-  add_executable(${example} ${example}.c)
+    # libraries to link against
+    target_link_libraries(${example} ${SUNDIALS_LIBS})
+  endif()
 
-  set_target_properties(${example} PROPERTIES FOLDER "Examples")
+  # derive the test name: the example name, plus its args with spaces -> underscores
+  if("${example_args}" STREQUAL "")
+    set(test_name ${example})
+  else()
+    string(REPLACE " " "_" test_name "${example}_${example_args}")
+  endif()
 
   # add example to regression tests
-  sundials_add_test(${example} ${example}
+  sundials_add_test(${test_name} ${example}
+    TEST_ARGS ${example_args}
     ANSWER_DIR ${CMAKE_CURRENT_SOURCE_DIR}
-    ANSWER_FILE ${example}.out
+    ANSWER_FILE ${test_name}.out
     EXAMPLE_TYPE ${example_type})
 
-  # libraries to link against
-  target_link_libraries(${example} ${SUNDIALS_LIBS})
-
   # install example source and out files
   if(EXAMPLES_INSTALL)
     install(FILES ${example}.c ${example}.out
diff --git a/examples/arkode/C_serial/ark_brusselator_fp.c b/examples/arkode/C_serial/ark_brusselator_fp.c
index 150def1652..52e516a3c8 100644
--- a/examples/arkode/C_serial/ark_brusselator_fp.c
+++ b/examples/arkode/C_serial/ark_brusselator_fp.c
@@ -78,7 +78,7 @@ static int fi(realtype t, N_Vector y, N_Vector ydot, void *user_data);
 static int check_flag(void *flagvalue, const char *funcname, int opt);
 
 /* Main Program */
-int main()
+int main(int argc, char *argv[])
 {
   /* general problem parameters */
   realtype T0 = RCONST(0.0);     /* initial time */
@@ -93,17 +93,23 @@ int main()
   int maxcor = 10;               /* maximum # of nonlinear iterations/step */
   realtype a, b, ep, u0, v0, w0;
   realtype rdata[3];
+  int monitor = 0;               /* turn on/off monitoring */
 
   /* general problem variables */
   int flag;                       /* reusable error-checking flag */
   N_Vector y = NULL;              /* empty vector for storing solution */
   SUNNonlinearSolver NLS = NULL;  /* empty nonlinear solver object */
   void *arkode_mem = NULL;        /* empty ARKode memory structure */
-  FILE *UFID;
+  FILE *UFID, *INFOFID;
   realtype t, tout;
   int iout;
   long int nst, nst_a, nfe, nfi, nni, ncfn, netf;
 
+  /* read inputs */
+  if (argc == 2) {
+    monitor = atoi(argv[1]);
+  }
+
   /* set up the test problem according to the desired test */
   if (test == 1) {
     u0 = RCONST(3.9);
@@ -134,6 +140,9 @@ int main()
   printf("    problem parameters:  a = %"GSYM",  b = %"GSYM",  ep = %"GSYM"\n",a,b,ep);
   printf("    reltol = %.1"ESYM",  abstol = %.1"ESYM"\n\n",reltol,abstol);
 
+  /* Open up info output file */
+  if (monitor) INFOFID = fopen("ark_brusselator_fp-info.txt","w");
+
   /* Initialize data structures */
   rdata[0] = a;    /* set user data  */
   rdata[1] = b;
@@ -153,6 +162,12 @@ int main()
   /* Initialize fixed-point nonlinear solver and attach to ARKStep */
   NLS = SUNNonlinSol_FixedPoint(y, fp_m);
   if (check_flag((void *)NLS, "SUNNonlinSol_FixedPoint", 0)) return 1;
+  if (monitor) {  /* enable fixed-point solver info output and send it to INFOFID */
+    flag = SUNNonlinSolSetPrintLevel_FixedPoint(NLS, 1);
+    if (check_flag(&flag, "SUNNonlinSolSetPrintLevel_FixedPoint", 1)) return(1);
+    flag = SUNNonlinSolSetInfoFile_FixedPoint(NLS, INFOFID);
+    if (check_flag(&flag, "SUNNonlinSolSetInfoFile_FixedPoint", 1)) return(1);
+  }
   flag = ARKStepSetNonlinearSolver(arkode_mem, NLS);
   if (check_flag(&flag, "ARKStepSetNonlinearSolver", 1)) return 1;
 
@@ -196,6 +211,7 @@ int main()
   }
   printf("   ----------------------------------------------\n");
   fclose(UFID);
+  if (monitor) fclose(INFOFID);
 
   /* Print some final statistics */
   flag = ARKStepGetNumSteps(arkode_mem, &nst);
diff --git a/examples/arkode/C_serial/ark_brusselator_fp_1.out b/examples/arkode/C_serial/ark_brusselator_fp_1.out
new file mode 100644
index 0000000000..041ebe3d2a
--- /dev/null
+++ b/examples/arkode/C_serial/ark_brusselator_fp_1.out
@@ -0,0 +1,27 @@
+
+Brusselator ODE test problem, fixed-point solver:
+    initial conditions:  u0 = 3,  v0 = 3,  w0 = 3.5
+    problem parameters:  a = 0.5,  b = 3,  ep = 0.0005
+    reltol = 1.0e-06,  abstol = 1.0e-10
+
+        t           u           v           w
+   ----------------------------------------------
+    1.000000    1.897255    1.274939    2.997155
+    2.000000    0.346125    2.366448    2.999481
+    3.000000    0.147442    2.862061    2.999781
+    4.000000    0.140733    3.226731    2.999788
+    5.000000    0.142659    3.583206    2.999788
+    6.000000    0.145095    3.936910    2.999782
+    7.000000    0.147720    4.287893    2.999780
+    8.000000    0.150542    4.635957    2.999775
+    9.000000    0.153590    4.980863    2.999768
+   10.000000    0.156901    5.322330    2.999763
+   ----------------------------------------------
+
+Final Solver Statistics:
+   Internal solver steps = 729 (attempted = 730)
+   Total RHS evals:  Fe = 4383,  Fi = 18793
+   Total number of fixed-point iterations = 14410
+   Total number of nonlinear solver convergence failures = 0
+   Total number of error test failures = 1
+
diff --git a/examples/cvode/C_mpimanyvector/CMakeLists.txt b/examples/cvode/C_mpimanyvector/CMakeLists.txt
index 7f3aa5dac2..e85a799677 100644
--- a/examples/cvode/C_mpimanyvector/CMakeLists.txt
+++ b/examples/cvode/C_mpimanyvector/CMakeLists.txt
@@ -38,6 +38,11 @@ else()
   set(NVECP_LIB sundials_nvecmpimanyvector_shared sundials_nvecparallel_shared)
 endif()
 
+if(SUNDIALS_BUILD_PACKAGE_FUSED_KERNELS)
+  list(APPEND CVODE_LIB
+       sundials_cvode_fused_stubs_${LINK_LIBRARY_TYPE})
+endif()
+
 # Set-up linker flags and link libraries
 set(SUNDIALS_LIBS ${CVODE_LIB} ${NVECP_LIB} ${EXTRA_LINK_LIBS})
 
@@ -87,6 +92,9 @@ if(EXAMPLES_INSTALL)
   # Prepare substitution variables for Makefile and/or CMakeLists templates
   set(SOLVER "CVODE")
   set(SOLVER_LIB "sundials_cvode")
+  if(SUNDIALS_BUILD_PACKAGE_FUSED_KERNELS)
+    set(LIBS "-lsundials_cvode_fused_stubs ${LIBS}")
+  endif()
 
   examples2string(CVODE_examples EXAMPLES)
 
diff --git a/examples/cvode/C_openmp/CMakeLists.txt b/examples/cvode/C_openmp/CMakeLists.txt
index 92939c1ebe..c6910b49ca 100644
--- a/examples/cvode/C_openmp/CMakeLists.txt
+++ b/examples/cvode/C_openmp/CMakeLists.txt
@@ -33,6 +33,11 @@ else()
   set(NVECOMP_LIB sundials_nvecopenmp_shared)
 endif()
 
+if(SUNDIALS_BUILD_PACKAGE_FUSED_KERNELS)
+  list(APPEND CVODE_LIB
+       sundials_cvode_fused_stubs_${LINK_LIBRARY_TYPE})
+endif()
+
 # Set-up linker flags and link libraries
 set(SUNDIALS_LIBS ${CVODE_LIB} ${NVECOMP_LIB} ${EXTRA_LINK_LIBS})
 
@@ -81,6 +86,9 @@ if(EXAMPLES_INSTALL)
   # Prepare substitution variables for Makefile and/or CMakeLists templates
   set(SOLVER "CVODE")
   set(SOLVER_LIB "sundials_cvode")
+  if(SUNDIALS_BUILD_PACKAGE_FUSED_KERNELS)
+    set(LIBS "-lsundials_cvode_fused_stubs ${LIBS}")
+  endif()
 
   examples2string(CVODE_examples EXAMPLES)
 
diff --git a/examples/cvode/C_openmpdev/CMakeLists.txt b/examples/cvode/C_openmpdev/CMakeLists.txt
index 2e5d496c6b..f3bc1b0a08 100644
--- a/examples/cvode/C_openmpdev/CMakeLists.txt
+++ b/examples/cvode/C_openmpdev/CMakeLists.txt
@@ -32,6 +32,11 @@ else()
   set(NVECOMP_LIB sundials_nvecopenmpdev_shared)
 endif()
 
+if(SUNDIALS_BUILD_PACKAGE_FUSED_KERNELS)
+  list(APPEND CVODE_LIB
+       sundials_cvode_fused_stubs_${LINK_LIBRARY_TYPE})
+endif()
+
 # Set-up linker flags and link libraries
 set(SUNDIALS_LIBS ${CVODE_LIB} ${NVECOMP_LIB} ${EXTRA_LINK_LIBS})
 
@@ -81,6 +86,9 @@ if(EXAMPLES_INSTALL)
   # Prepare substitution variables for Makefile and/or CMakeLists templates
   set(SOLVER "CVODE")
   set(SOLVER_LIB "sundials_cvode")
+  if(SUNDIALS_BUILD_PACKAGE_FUSED_KERNELS)
+    set(LIBS "-lsundials_cvode_fused_stubs ${LIBS}")
+  endif()
 
   examples2string(CVODE_examples EXAMPLES)
 
diff --git a/examples/cvode/F2003_serial/CMakeLists.txt b/examples/cvode/F2003_serial/CMakeLists.txt
index acca42ae3f..4b694d76b2 100644
--- a/examples/cvode/F2003_serial/CMakeLists.txt
+++ b/examples/cvode/F2003_serial/CMakeLists.txt
@@ -46,6 +46,12 @@ else()
   set(CVODE_LIB sundials_fcvode_mod_shared)
 endif()
 
+if(SUNDIALS_BUILD_PACKAGE_FUSED_KERNELS)
+  list(APPEND CVODE_LIB
+       sundials_cvode_${LINK_LIBRARY_TYPE}
+       sundials_cvode_fused_stubs_${LINK_LIBRARY_TYPE})
+endif()
+
 # Set-up linker flags and link libraries
 set(SUNDIALS_LIBS ${CVODE_LIB} ${EXTRA_LINK_LIBS})
 
@@ -139,6 +145,9 @@ if(EXAMPLES_INSTALL)
   set(SOLVER_FLIB "sundials_fcvode_mod")
   set(NVEC_LIB "sundials_nvecserial")
   set(NVEC_FLIB "sundials_fnvecserial_mod")
+  if(SUNDIALS_BUILD_PACKAGE_FUSED_KERNELS)
+    set(LIBS "-lsundials_cvode_fused_stubs ${LIBS}")
+  endif()
 
   examples2string(FCVODE_examples EXAMPLES)
 
diff --git a/examples/cvode/cuda/CMakeLists.txt b/examples/cvode/cuda/CMakeLists.txt
index cc0dac3b1a..2ee41ea0bb 100644
--- a/examples/cvode/cuda/CMakeLists.txt
+++ b/examples/cvode/cuda/CMakeLists.txt
@@ -21,13 +21,16 @@
 
 # Examples using SUNDIALS linear solvers
 set(CVODE_examples
-  "cvAdvDiff_kry_cuda\;develop"
-  "cvAdvDiff_kry_cuda_managed\;develop"
+  "cvAdvDiff_kry_cuda\;\;develop"
+  "cvAdvDiff_kry_cuda_managed\;\;develop"
+  "cvAdvDiff_diag_cuda\;0 0\;develop"
+  "cvAdvDiff_diag_cuda\;0 1\;develop"
+  "cvAdvDiff_diag_cuda\;1 1\;develop"
   )
 
 # Examples using cuSolverSP linear solvers
 set(CVODE_examples_cusolver
-  "cvRoberts_block_cusolversp_batchqr\;develop"
+  "cvRoberts_block_cusolversp_batchqr\;\;develop"
 )
 
 if(SUNDIALS_INDEX_SIZE MATCHES "32")
@@ -56,6 +59,11 @@ else()
                     ${EXTRA_LINK_LIBS})
 endif()
 
+if(SUNDIALS_BUILD_PACKAGE_FUSED_KERNELS)
+  set(SUNDIALS_LIBS ${SUNDIALS_LIBS}
+    sundials_cvode_fused_cuda_${LINK_LIBRARY_TYPE})
+endif()
+
 # Add source directory to include directories
 include_directories(.)
 
@@ -64,30 +72,41 @@ foreach(example_tuple ${all_examples})
 
   # parse the example tuple
   list(GET example_tuple 0 example)
-  list(GET example_tuple 1 example_type)
+  list(GET example_tuple 1 example_args)
+  list(GET example_tuple 2 example_type)
 
   set_source_files_properties(${example}.cu PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ)
 
-  # example source files
-  add_executable(${example} ${example}.cu)
+  if (NOT TARGET ${example})
+    # example source files
+    add_executable(${example} ${example}.cu)
 
-  set_target_properties(${example} PROPERTIES FOLDER "Examples")
+    # folder for IDEs
+    set_target_properties(${example} PROPERTIES FOLDER "Examples")
+
+    # libraries to link against
+    target_link_libraries(${example} PRIVATE ${SUNDIALS_LIBS})
+  endif()
+
+  # derive the test name: the example name, plus its args with spaces -> underscores
+  if("${example_args}" STREQUAL "")
+    set(test_name ${example})
+  else()
+    string(REPLACE " " "_" test_name "${example}_${example_args}")
+  endif()
 
   # add example to regression tests
-  sundials_add_test(${example} ${example}
+  sundials_add_test(${test_name} ${example}
+    TEST_ARGS ${example_args}
     ANSWER_DIR ${CMAKE_CURRENT_SOURCE_DIR}
-    ANSWER_FILE ${example}.out
+    ANSWER_FILE ${test_name}.out
     EXAMPLE_TYPE ${example_type})
 
-  # libraries to link against
-  target_link_libraries(${example} PRIVATE ${SUNDIALS_LIBS})
-
   # install example source and out files
   if(EXAMPLES_INSTALL)
-    install(FILES ${example}.cu ${example}.out
+    install(FILES ${example}.cu ${test_name}.out
       DESTINATION ${EXAMPLES_INSTALL_PATH}/cvode/cuda)
   endif()
-
 endforeach(example_tuple ${CVODE_examples})
 
 
@@ -101,11 +120,15 @@ if(EXAMPLES_INSTALL)
   set(SOLVER "CVODE")
   set(SOLVER_LIB "sundials_cvode")
   set(NVECTOR_LIB "sundials_nveccuda")
+  if(SUNDIALS_BUILD_PACKAGE_FUSED_KERNELS)
+    set(LIBS "-lsundials_cvode_fused_cuda ${LIBS}")
+  endif()
 
   examples2string(CVODE_examples EXAMPLES)
 
-  if(SUNDIALS_INDEX_SIZE MATCHES "32" AND cuda_arch_ok)
+  if(SUNDIALS_INDEX_SIZE MATCHES "32")
     set(SUNLS_LIB "sundials_sunlinsolcusolversp")
+    set(SUNMAT_LIB "sundials_sunmatrixcusparse")
     examples2string(CVODE_examples_cusolver EXAMPLES_CUSOLVER)
   endif()
 
diff --git a/examples/cvode/cuda/cvAdvDiff_diag_cuda.cu b/examples/cvode/cuda/cvAdvDiff_diag_cuda.cu
new file mode 100644
index 0000000000..c9eba6ce72
--- /dev/null
+++ b/examples/cvode/cuda/cvAdvDiff_diag_cuda.cu
@@ -0,0 +1,364 @@
+/*
+ * -----------------------------------------------------------------
+ * Programmer(s): Cody J. Balos @ LLNL
+ * -----------------------------------------------------------------
+ * SUNDIALS Copyright Start
+ * Copyright (c) 2002-2020, Lawrence Livermore National Security
+ * and Southern Methodist University.
+ * All rights reserved.
+ *
+ * See the top-level LICENSE and NOTICE files for details.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ * SUNDIALS Copyright End
+ * -----------------------------------------------------------------
+ * Example problem:
+ *
+ * The following is a simple example problem, with the program for
+ * its solution by CVODE. The problem is the semi-discrete
+ * form of the advection-diffusion equation in 1-D:
+ *   du/dt = d^2 u / dx^2 + .5 du/dx
+ * on the interval 0 <= x <= 2, and the time interval 0 <= t <= 5.
+ * Homogeneous Dirichlet boundary conditions are posed, and the
+ * initial condition is the following:
+ *   u(x,t=0) = x(2-x)exp(2x) .
+ * The PDE is discretized on a uniform grid of size MX+2 with
+ * central differencing, and with boundary values eliminated,
+ * leaving an ODE system of size NEQ = MX.
+ * This program solves the problem with the ADAMS integration method,
+ * and with Newton iteration using diagonal approximate Jacobians.
+ * It can use scalar (default) relative and absolute tolerances or a
+ * vector of absolute tolerances (controlled by a runtime argument).
+ * No inequality constraints are imposed on the solution components.
+ * Output is printed at t = .5, 1.0, ..., 5.
+ * Run statistics (optional outputs) are printed at the end.
+ *
+ * ./cvAdvDiff_diag_cuda [0 (scalar atol) | 1 (vector atol)]
+ *                       [0 (unfused) | 1 (fused)]
+ * -----------------------------------------------------------------
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include <cuda_runtime.h>
+
+#include <cvode/cvode.h>                  /* prototypes for CVODE fcts., consts.  */
+#include <cvode/cvode_diag.h>             /* prototypes for CVODE diagonal solver */
+#include <nvector/nvector_cuda.h>         /* access to cuda N_Vector              */
+#include <sundials/sundials_types.h>      /* definition of type realtype          */
+
+/* Problem Constants */
+
+#define ZERO  RCONST(0.0)
+
+#define XMAX  RCONST(2.0)    /* domain boundary           */
+#define MX    10             /* mesh dimension            */
+#define NEQ   MX             /* number of equations       */
+#define ATOL  RCONST(1e-10)  /* scalar absolute tolerance */
+#define T0    ZERO           /* initial time              */
+#define T1    RCONST(0.5)    /* first output time         */
+#define DTOUT RCONST(0.5)    /* output time increment     */
+#define NOUT  10             /* number of output times    */
+
+/* Type : UserData
+   contains mesh spacing and problem parameters. */
+
+typedef struct {
+  realtype dx;
+  realtype hdcoef;
+  realtype hacoef;
+} *UserData;
+
+/* Private Helper Functions */
+
+static void SetIC(N_Vector u, realtype dx);
+
+static void PrintIntro(int toltype, int usefused);
+
+static void PrintData(realtype t, realtype umax, long int nst);
+
+static void PrintFinalStats(void *cvode_mem);
+
+/* Functions Called by the Solver */
+
+static int f(realtype t, N_Vector u, N_Vector udot, void *user_data);
+
+/* Private function to check function return values */
+
+static int check_retval(void *returnvalue, const char *funcname, int opt);
+
+/***************************** Main Program ******************************/
+
+int main(int argc, char *argv[])
+{
+  realtype dx, reltol, abstol, t, tout, umax;
+  N_Vector u;
+  UserData data;
+  void *cvode_mem;
+  int iout, retval, toltype, usefused;
+  long int nst;
+
+  u = NULL;
+  data = NULL;
+  cvode_mem = NULL;
+  toltype = 0;
+  usefused = 0;
+
+  if (argc >= 2) {
+    /* use vector or scalar atol? */
+    toltype = atoi(argv[1]);
+    /* use fused operations? */
+    if (argc == 3)
+      usefused = atoi(argv[2]);
+  }
+
+  data = (UserData) malloc(sizeof *data);  /* Allocate data memory */
+  if(check_retval((void *)data, "malloc", 2)) return 1;
+
+  u = N_VNew_Cuda(NEQ);  /* Allocate u vector */
+  if(check_retval((void *)u, "N_VNew", 0)) return 1;
+
+  reltol = ZERO;  /* Set the tolerances */
+  abstol = ATOL;
+
+  dx = data->dx = XMAX/((realtype)(MX+1));  /* Set grid coefficients in data */
+  data->hdcoef = RCONST(1.0)/(dx*dx);
+  data->hacoef = RCONST(0.5)/(RCONST(2.0)*dx);
+
+  SetIC(u, dx);  /* Initialize u vector */
+
+  /* Call CVodeCreate to create the solver memory and specify the
+   * Adams-Moulton LMM */
+  cvode_mem = CVodeCreate(CV_ADAMS);
+  if(check_retval((void *)cvode_mem, "CVodeCreate", 0)) return 1;
+
+  retval = CVodeSetUserData(cvode_mem, data);
+  if(check_retval(&retval, "CVodeSetUserData", 1)) return 1;
+
+  /* Call CVodeInit to initialize the integrator memory and specify the
+   * user's right hand side function in u'=f(t,u), the initial time T0, and
+   * the initial dependent variable vector u. */
+  retval = CVodeInit(cvode_mem, f, T0, u);
+  if(check_retval(&retval, "CVodeInit", 1)) return(1);
+
+  /* Specify the scalar relative tolerance together with either a scalar
+   * (toltype == 0) or a vector (any other value) absolute tolerance */
+
+  if (toltype == 0) {
+    retval = CVodeSStolerances(cvode_mem, reltol, abstol);
+    if (check_retval(&retval, "CVodeSStolerances", 1)) return(1);
+  } else {
+    N_Vector vabstol = N_VClone_Cuda(u);
+    if (check_retval((void *)vabstol, "N_VClone_Cuda", 0)) return(1);  /* test the pointer, not its address */
+    N_VConst(abstol, vabstol);
+    retval = CVodeSVtolerances(cvode_mem, reltol, vabstol);
+    if (check_retval(&retval, "CVodeSVtolerances", 1)) return(1);
+    N_VDestroy(vabstol);
+  }
+
+  /* Call CVDiag to create and attach CVODE-specific diagonal linear solver */
+  retval = CVDiag(cvode_mem);
+  if(check_retval(&retval, "CVDiag", 1)) return(1);
+
+  /* Tell CVode to use fused kernels if they are available. */
+  retval = CVodeSetUseIntegratorFusedKernels(cvode_mem, usefused);
+  check_retval(&retval, "CVodeSetUseIntegratorFusedKernels", 1);
+
+  PrintIntro(toltype, usefused);
+
+  umax = N_VMaxNorm(u);
+
+  t = T0;
+  PrintData(t, umax, 0);
+
+  /* In loop over output points, call CVode, print results, test for error */
+
+  for (iout=1, tout=T1; iout <= NOUT; iout++, tout += DTOUT) {
+    retval = CVode(cvode_mem, tout, u, &t, CV_NORMAL);
+    if(check_retval(&retval, "CVode", 1)) break;
+    umax = N_VMaxNorm(u);
+    retval = CVodeGetNumSteps(cvode_mem, &nst);
+    check_retval(&retval, "CVodeGetNumSteps", 1);
+    PrintData(t, umax, nst);
+  }
+
+  PrintFinalStats(cvode_mem);  /* Print some final statistics */
+
+  N_VDestroy(u);                 /* Free the u vector */
+  CVodeFree(&cvode_mem);         /* Free the integrator memory */
+  free(data);                    /* Free user data */
+
+  return(0);
+}
+
+/************************ Private Helper Functions ***********************/
+
+/* Set initial conditions in u vector: u_i = x_i (XMAX - x_i) exp(2 x_i), x_i = i*dx */
+
+static void SetIC(N_Vector u, realtype dx)
+{
+  sunindextype i;   /* same type as N, so the loop bound comparison is not mixed-width */
+  sunindextype N;
+  realtype x;
+  realtype *udata;
+
+  /* Set pointer to the host data array and get the length of u. */
+  udata = N_VGetHostArrayPointer_Cuda(u);
+  N = N_VGetLength(u);
+
+  /* Load initial profile into the host array (entry i-1 holds grid point i) */
+  for (i=1; i<=N; i++) {
+    x = i*dx;
+    udata[i-1] = x*(XMAX - x)*exp(RCONST(2.0)*x);
+  }
+  N_VCopyToDevice_Cuda(u);   /* copy the host values to the device */
+}
+
+/* Print problem introduction */
+
+static void PrintIntro(int toltype, int usefused)
+{
+  printf("\n 1-D advection-diffusion equation, mesh size =%3d \n", MX);
+  printf("\n Diagonal linear solver CVDiag \n");
+  if (usefused)
+    printf(" Using fused CVODE kernels \n");
+  if (toltype == 0)
+    printf(" Using scalar ATOL\n");
+  else
+    printf(" Using vector ATOL\n");
+  printf("\n");
+
+  return;
+}
+
+/* Print data */
+
+static void PrintData(realtype t, realtype umax, long int nst)
+{
+
+#if defined(SUNDIALS_EXTENDED_PRECISION)
+  printf("At t = %4.2Lf  max.norm(u) =%14.6Le  nst =%4ld \n", t, umax, nst);
+#elif defined(SUNDIALS_DOUBLE_PRECISION)
+  printf("At t = %4.2f  max.norm(u) =%14.6e  nst =%4ld \n", t, umax, nst);
+#else
+  printf("At t = %4.2f  max.norm(u) =%14.6e  nst =%4ld \n", t, umax, nst);
+#endif
+
+  return;
+}
+
+/* Print some final statistics retrieved with the CVODE optional output functions */
+
+static void PrintFinalStats(void *cvode_mem)
+{
+  long int nst, nfe, nni, ncfn, netf;
+  int retval;
+
+  retval = CVodeGetNumSteps(cvode_mem, &nst);
+  check_retval(&retval, "CVodeGetNumSteps", 1);
+  retval = CVodeGetNumRhsEvals(cvode_mem, &nfe);
+  check_retval(&retval, "CVodeGetNumRhsEvals", 1);
+  retval = CVodeGetNumErrTestFails(cvode_mem, &netf);
+  check_retval(&retval, "CVodeGetNumErrTestFails", 1);
+  retval = CVodeGetNumNonlinSolvIters(cvode_mem, &nni);
+  check_retval(&retval, "CVodeGetNumNonlinSolvIters", 1);
+  retval = CVodeGetNumNonlinSolvConvFails(cvode_mem, &ncfn);
+  check_retval(&retval, "CVodeGetNumNonlinSolvConvFails", 1);
+
+  printf("\nFinal Statistics: \n\n");
+  printf("nst = %-6ld  nfe  = %-6ld  ", nst, nfe);
+  printf("nni = %-6ld  ncfn = %-6ld  netf = %ld\n \n", nni, ncfn, netf);
+}
+
+ /***************** Function Called by the Solver ***********************/
+
+ /* f routine. Compute f(t,u). */
+
+__global__
+static void f_kernel(sunindextype N,
+                     realtype hordc, realtype horac,
+                     const realtype* u, realtype* udot)
+{
+  sunindextype i = blockDim.x*blockIdx.x + threadIdx.x;
+  realtype ui, ult, urt, hdiff, hadv;
+
+  if (i < N) {
+    /* Extract u at x_i and two neighboring points */
+    ui = u[i];
+    ult = (i == 0) ? ZERO : u[i-1];
+    urt = (i == N-1) ? ZERO : u[i+1];
+
+    /* Set diffusion and advection terms and load into udot */
+    hdiff = hordc*(ult - RCONST(2.0)*ui + urt);
+    hadv = horac*(urt - ult);
+    udot[i] = hdiff + hadv;
+  }
+}
+
+static int f(realtype t, N_Vector u, N_Vector udot, void *user_data)
+{
+  realtype hordc, horac;
+  realtype *udata, *dudata;
+  sunindextype N;
+  size_t grid, block;
+  UserData data;
+  cudaError_t cuerr;
+
+  udata = N_VGetDeviceArrayPointer_Cuda(u);
+  dudata = N_VGetDeviceArrayPointer_Cuda(udot);
+
+  /* Extract needed problem constants from data */
+  data = (UserData) user_data;
+  hordc = data->hdcoef;
+  horac = data->hacoef;
+
+  /* Extract parameters for parallel computation. */
+  N = N_VGetLength(u); /* Number of elements of u. */
+
+  block = 64;
+  grid  = (block + N - 1)/block;
+  f_kernel<<<grid, block>>>(N, hordc, horac, udata, dudata);
+
+  cudaDeviceSynchronize();
+  cuerr = cudaGetLastError();
+  if (cuerr != cudaSuccess) {
+    fprintf(stderr, "ERROR in f: f_kernel --> %s\n", cudaGetErrorString(cuerr));
+    return(-1);
+  }
+
+  return(0);
+}
+
+ /* Check function return value...
+      opt == 0 means SUNDIALS function allocates memory so check if
+               returned NULL pointer
+      opt == 1 means SUNDIALS function returns an integer value so check if
+               retval < 0
+      opt == 2 means function allocates memory so check if returned
+               NULL pointer */
+
+static int check_retval(void *returnvalue, const char *funcname, int opt)
+{
+  int *retval;
+
+  /* Check if SUNDIALS function returned NULL pointer - no memory allocated */
+  if (opt == 0 && returnvalue == NULL) {
+    fprintf(stderr, "\nSUNDIALS_ERROR: %s() failed - returned NULL pointer\n\n", funcname);
+    return(1); }
+
+  /* Check if retval < 0 */
+  else if (opt == 1) {
+    retval = (int *) returnvalue;
+    if (*retval < 0) {
+      fprintf(stderr, "\nSUNDIALS_ERROR: %s() failed with retval = %d\n\n", funcname, *retval);
+      return(1); }}
+
+  /* Check if function returned NULL pointer - no memory allocated */
+  else if (opt == 2 && returnvalue == NULL) {
+    fprintf(stderr, "\nMEMORY_ERROR: %s() failed - returned NULL pointer\n\n", funcname);
+    return(1); }
+
+  return(0);
+}
\ No newline at end of file
diff --git a/examples/cvode/cuda/cvAdvDiff_diag_cuda_0_0.out b/examples/cvode/cuda/cvAdvDiff_diag_cuda_0_0.out
new file mode 100644
index 0000000000..211ab1796e
--- /dev/null
+++ b/examples/cvode/cuda/cvAdvDiff_diag_cuda_0_0.out
@@ -0,0 +1,22 @@
+
+ 1-D advection-diffusion equation, mesh size = 10 
+
+ Diagonal linear solver CVDiag 
+ Using scalar ATOL
+
+At t = 0.00  max.norm(u) =  1.569909e+01  nst =   0 
+At t = 0.50  max.norm(u) =  3.052879e+00  nst = 444 
+At t = 1.00  max.norm(u) =  8.753297e-01  nst = 632 
+At t = 1.50  max.norm(u) =  2.494935e-01  nst = 787 
+At t = 2.00  max.norm(u) =  7.110094e-02  nst = 883 
+At t = 2.50  max.norm(u) =  2.026233e-02  nst = 970 
+At t = 3.00  max.norm(u) =  5.774352e-03  nst =1093 
+At t = 3.50  max.norm(u) =  1.645572e-03  nst =1179 
+At t = 4.00  max.norm(u) =  4.689201e-04  nst =1304 
+At t = 4.50  max.norm(u) =  1.336304e-04  nst =1385 
+At t = 5.00  max.norm(u) =  3.808141e-05  nst =1448 
+
+Final Statistics: 
+
+nst = 1448    nfe  = 2264    nni = 2261    ncfn = 34      netf = 56
+ 
diff --git a/examples/cvode/cuda/cvAdvDiff_diag_cuda_0_1.out b/examples/cvode/cuda/cvAdvDiff_diag_cuda_0_1.out
new file mode 100644
index 0000000000..1d8f94eeff
--- /dev/null
+++ b/examples/cvode/cuda/cvAdvDiff_diag_cuda_0_1.out
@@ -0,0 +1,23 @@
+
+ 1-D advection-diffusion equation, mesh size = 10 
+
+ Diagonal linear solver CVDiag 
+ Using fused CVODE kernels 
+ Using scalar ATOL
+
+At t = 0.00  max.norm(u) =  1.569909e+01  nst =   0 
+At t = 0.50  max.norm(u) =  3.052879e+00  nst = 460 
+At t = 1.00  max.norm(u) =  8.753297e-01  nst = 638 
+At t = 1.50  max.norm(u) =  2.494935e-01  nst = 799 
+At t = 2.00  max.norm(u) =  7.110094e-02  nst = 956 
+At t = 2.50  max.norm(u) =  2.026233e-02  nst =1072 
+At t = 3.00  max.norm(u) =  5.774352e-03  nst =1160 
+At t = 3.50  max.norm(u) =  1.645568e-03  nst =1255 
+At t = 4.00  max.norm(u) =  4.689520e-04  nst =1339 
+At t = 4.50  max.norm(u) =  1.336373e-04  nst =1399 
+At t = 5.00  max.norm(u) =  3.808158e-05  nst =1465 
+
+Final Statistics: 
+
+nst = 1465    nfe  = 2345    nni = 2342    ncfn = 21      netf = 70
+ 
diff --git a/examples/cvode/cuda/cvAdvDiff_diag_cuda_1_1.out b/examples/cvode/cuda/cvAdvDiff_diag_cuda_1_1.out
new file mode 100644
index 0000000000..9d602136cb
--- /dev/null
+++ b/examples/cvode/cuda/cvAdvDiff_diag_cuda_1_1.out
@@ -0,0 +1,23 @@
+
+ 1-D advection-diffusion equation, mesh size = 10 
+
+ Diagonal linear solver CVDiag 
+ Using fused CVODE kernels 
+ Using vector ATOL
+
+At t = 0.00  max.norm(u) =  1.569909e+01  nst =   0 
+At t = 0.50  max.norm(u) =  3.052879e+00  nst = 460 
+At t = 1.00  max.norm(u) =  8.753297e-01  nst = 638 
+At t = 1.50  max.norm(u) =  2.494935e-01  nst = 799 
+At t = 2.00  max.norm(u) =  7.110094e-02  nst = 956 
+At t = 2.50  max.norm(u) =  2.026233e-02  nst =1072 
+At t = 3.00  max.norm(u) =  5.774352e-03  nst =1160 
+At t = 3.50  max.norm(u) =  1.645568e-03  nst =1255 
+At t = 4.00  max.norm(u) =  4.689520e-04  nst =1339 
+At t = 4.50  max.norm(u) =  1.336373e-04  nst =1399 
+At t = 5.00  max.norm(u) =  3.808158e-05  nst =1465 
+
+Final Statistics: 
+
+nst = 1465    nfe  = 2345    nni = 2342    ncfn = 21      netf = 70
+ 
diff --git a/examples/cvode/cuda/cvRoberts_block_cusolversp_batchqr.out b/examples/cvode/cuda/cvRoberts_block_cusolversp_batchqr.out
index 45522fcff6..0f6d30f883 100644
--- a/examples/cvode/cuda/cvRoberts_block_cusolversp_batchqr.out
+++ b/examples/cvode/cuda/cvRoberts_block_cusolversp_batchqr.out
@@ -43,90 +43,90 @@ group 60: At t = 4.0000e+02      y =  4.505332e-01    3.223118e-06    5.494636e-
 group 70: At t = 4.0000e+02      y =  4.505332e-01    3.223118e-06    5.494636e-01
 group 80: At t = 4.0000e+02      y =  4.505332e-01    3.223118e-06    5.494636e-01
 group 90: At t = 4.0000e+02      y =  4.505332e-01    3.223118e-06    5.494636e-01
-group 0: At t = 4.0000e+03      y =  1.831798e-01    8.940768e-07    8.168193e-01
-group 10: At t = 4.0000e+03      y =  1.831798e-01    8.940768e-07    8.168193e-01
-group 20: At t = 4.0000e+03      y =  1.831798e-01    8.940768e-07    8.168193e-01
-group 30: At t = 4.0000e+03      y =  1.831798e-01    8.940768e-07    8.168193e-01
-group 40: At t = 4.0000e+03      y =  1.831798e-01    8.940768e-07    8.168193e-01
-group 50: At t = 4.0000e+03      y =  1.831798e-01    8.940768e-07    8.168193e-01
-group 60: At t = 4.0000e+03      y =  1.831798e-01    8.940768e-07    8.168193e-01
-group 70: At t = 4.0000e+03      y =  1.831798e-01    8.940768e-07    8.168193e-01
-group 80: At t = 4.0000e+03      y =  1.831798e-01    8.940768e-07    8.168193e-01
-group 90: At t = 4.0000e+03      y =  1.831798e-01    8.940768e-07    8.168193e-01
-group 0: At t = 4.0000e+04      y =  3.897912e-02    1.621575e-07    9.610207e-01
-group 10: At t = 4.0000e+04      y =  3.897912e-02    1.621575e-07    9.610207e-01
-group 20: At t = 4.0000e+04      y =  3.897912e-02    1.621575e-07    9.610207e-01
-group 30: At t = 4.0000e+04      y =  3.897912e-02    1.621575e-07    9.610207e-01
-group 40: At t = 4.0000e+04      y =  3.897912e-02    1.621575e-07    9.610207e-01
-group 50: At t = 4.0000e+04      y =  3.897912e-02    1.621575e-07    9.610207e-01
-group 60: At t = 4.0000e+04      y =  3.897912e-02    1.621575e-07    9.610207e-01
-group 70: At t = 4.0000e+04      y =  3.897912e-02    1.621575e-07    9.610207e-01
-group 80: At t = 4.0000e+04      y =  3.897912e-02    1.621575e-07    9.610207e-01
-group 90: At t = 4.0000e+04      y =  3.897912e-02    1.621575e-07    9.610207e-01
-group 0: At t = 4.0000e+05      y =  4.938822e-03    1.985206e-08    9.950612e-01
-group 10: At t = 4.0000e+05      y =  4.938822e-03    1.985206e-08    9.950612e-01
-group 20: At t = 4.0000e+05      y =  4.938822e-03    1.985206e-08    9.950612e-01
-group 30: At t = 4.0000e+05      y =  4.938822e-03    1.985206e-08    9.950612e-01
-group 40: At t = 4.0000e+05      y =  4.938822e-03    1.985206e-08    9.950612e-01
-group 50: At t = 4.0000e+05      y =  4.938822e-03    1.985206e-08    9.950612e-01
-group 60: At t = 4.0000e+05      y =  4.938822e-03    1.985206e-08    9.950612e-01
-group 70: At t = 4.0000e+05      y =  4.938822e-03    1.985206e-08    9.950612e-01
-group 80: At t = 4.0000e+05      y =  4.938822e-03    1.985206e-08    9.950612e-01
-group 90: At t = 4.0000e+05      y =  4.938822e-03    1.985206e-08    9.950612e-01
-group 0: At t = 4.0000e+06      y =  5.168467e-04    2.068440e-09    9.994832e-01
-group 10: At t = 4.0000e+06      y =  5.168467e-04    2.068440e-09    9.994832e-01
-group 20: At t = 4.0000e+06      y =  5.168467e-04    2.068440e-09    9.994832e-01
-group 30: At t = 4.0000e+06      y =  5.168467e-04    2.068440e-09    9.994832e-01
-group 40: At t = 4.0000e+06      y =  5.168467e-04    2.068440e-09    9.994832e-01
-group 50: At t = 4.0000e+06      y =  5.168467e-04    2.068440e-09    9.994832e-01
-group 60: At t = 4.0000e+06      y =  5.168467e-04    2.068440e-09    9.994832e-01
-group 70: At t = 4.0000e+06      y =  5.168467e-04    2.068440e-09    9.994832e-01
-group 80: At t = 4.0000e+06      y =  5.168467e-04    2.068440e-09    9.994832e-01
-group 90: At t = 4.0000e+06      y =  5.168467e-04    2.068440e-09    9.994832e-01
-group 0: At t = 4.0000e+07      y =  5.202426e-05    2.081078e-10    9.999480e-01
-group 10: At t = 4.0000e+07      y =  5.202426e-05    2.081078e-10    9.999480e-01
-group 20: At t = 4.0000e+07      y =  5.202426e-05    2.081078e-10    9.999480e-01
-group 30: At t = 4.0000e+07      y =  5.202426e-05    2.081078e-10    9.999480e-01
-group 40: At t = 4.0000e+07      y =  5.202426e-05    2.081078e-10    9.999480e-01
-group 50: At t = 4.0000e+07      y =  5.202426e-05    2.081078e-10    9.999480e-01
-group 60: At t = 4.0000e+07      y =  5.202426e-05    2.081078e-10    9.999480e-01
-group 70: At t = 4.0000e+07      y =  5.202426e-05    2.081078e-10    9.999480e-01
-group 80: At t = 4.0000e+07      y =  5.202426e-05    2.081078e-10    9.999480e-01
-group 90: At t = 4.0000e+07      y =  5.202426e-05    2.081078e-10    9.999480e-01
-group 0: At t = 4.0000e+08      y =  5.212615e-06    2.085057e-11    9.999948e-01
-group 10: At t = 4.0000e+08      y =  5.212615e-06    2.085057e-11    9.999948e-01
-group 20: At t = 4.0000e+08      y =  5.212615e-06    2.085057e-11    9.999948e-01
-group 30: At t = 4.0000e+08      y =  5.212615e-06    2.085057e-11    9.999948e-01
-group 40: At t = 4.0000e+08      y =  5.212615e-06    2.085057e-11    9.999948e-01
-group 50: At t = 4.0000e+08      y =  5.212615e-06    2.085057e-11    9.999948e-01
-group 60: At t = 4.0000e+08      y =  5.212615e-06    2.085057e-11    9.999948e-01
-group 70: At t = 4.0000e+08      y =  5.212615e-06    2.085057e-11    9.999948e-01
-group 80: At t = 4.0000e+08      y =  5.212615e-06    2.085057e-11    9.999948e-01
-group 90: At t = 4.0000e+08      y =  5.212615e-06    2.085057e-11    9.999948e-01
-group 0: At t = 4.0000e+09      y =  5.146550e-07    2.058622e-12    9.999995e-01
-group 10: At t = 4.0000e+09      y =  5.146550e-07    2.058622e-12    9.999995e-01
-group 20: At t = 4.0000e+09      y =  5.146550e-07    2.058622e-12    9.999995e-01
-group 30: At t = 4.0000e+09      y =  5.146550e-07    2.058622e-12    9.999995e-01
-group 40: At t = 4.0000e+09      y =  5.146550e-07    2.058622e-12    9.999995e-01
-group 50: At t = 4.0000e+09      y =  5.146550e-07    2.058622e-12    9.999995e-01
-group 60: At t = 4.0000e+09      y =  5.146550e-07    2.058622e-12    9.999995e-01
-group 70: At t = 4.0000e+09      y =  5.146550e-07    2.058622e-12    9.999995e-01
-group 80: At t = 4.0000e+09      y =  5.146550e-07    2.058622e-12    9.999995e-01
-group 90: At t = 4.0000e+09      y =  5.146550e-07    2.058622e-12    9.999995e-01
-group 0: At t = 4.0000e+10      y =  5.147690e-08    2.059076e-13    9.999999e-01
-group 10: At t = 4.0000e+10      y =  5.147690e-08    2.059076e-13    9.999999e-01
-group 20: At t = 4.0000e+10      y =  5.147690e-08    2.059076e-13    9.999999e-01
-group 30: At t = 4.0000e+10      y =  5.147690e-08    2.059076e-13    9.999999e-01
-group 40: At t = 4.0000e+10      y =  5.147690e-08    2.059076e-13    9.999999e-01
-group 50: At t = 4.0000e+10      y =  5.147690e-08    2.059076e-13    9.999999e-01
-group 60: At t = 4.0000e+10      y =  5.147690e-08    2.059076e-13    9.999999e-01
-group 70: At t = 4.0000e+10      y =  5.147690e-08    2.059076e-13    9.999999e-01
-group 80: At t = 4.0000e+10      y =  5.147690e-08    2.059076e-13    9.999999e-01
-group 90: At t = 4.0000e+10      y =  5.147690e-08    2.059076e-13    9.999999e-01
+group 0: At t = 4.0000e+03      y =  1.831797e-01    8.940765e-07    8.168194e-01
+group 10: At t = 4.0000e+03      y =  1.831797e-01    8.940765e-07    8.168194e-01
+group 20: At t = 4.0000e+03      y =  1.831797e-01    8.940765e-07    8.168194e-01
+group 30: At t = 4.0000e+03      y =  1.831797e-01    8.940765e-07    8.168194e-01
+group 40: At t = 4.0000e+03      y =  1.831797e-01    8.940765e-07    8.168194e-01
+group 50: At t = 4.0000e+03      y =  1.831797e-01    8.940765e-07    8.168194e-01
+group 60: At t = 4.0000e+03      y =  1.831797e-01    8.940765e-07    8.168194e-01
+group 70: At t = 4.0000e+03      y =  1.831797e-01    8.940765e-07    8.168194e-01
+group 80: At t = 4.0000e+03      y =  1.831797e-01    8.940765e-07    8.168194e-01
+group 90: At t = 4.0000e+03      y =  1.831797e-01    8.940765e-07    8.168194e-01
+group 0: At t = 4.0000e+04      y =  3.897911e-02    1.621575e-07    9.610207e-01
+group 10: At t = 4.0000e+04      y =  3.897911e-02    1.621575e-07    9.610207e-01
+group 20: At t = 4.0000e+04      y =  3.897911e-02    1.621575e-07    9.610207e-01
+group 30: At t = 4.0000e+04      y =  3.897911e-02    1.621575e-07    9.610207e-01
+group 40: At t = 4.0000e+04      y =  3.897911e-02    1.621575e-07    9.610207e-01
+group 50: At t = 4.0000e+04      y =  3.897911e-02    1.621575e-07    9.610207e-01
+group 60: At t = 4.0000e+04      y =  3.897911e-02    1.621575e-07    9.610207e-01
+group 70: At t = 4.0000e+04      y =  3.897911e-02    1.621575e-07    9.610207e-01
+group 80: At t = 4.0000e+04      y =  3.897911e-02    1.621575e-07    9.610207e-01
+group 90: At t = 4.0000e+04      y =  3.897911e-02    1.621575e-07    9.610207e-01
+group 0: At t = 4.0000e+05      y =  4.938738e-03    1.985178e-08    9.950612e-01
+group 10: At t = 4.0000e+05      y =  4.938738e-03    1.985178e-08    9.950612e-01
+group 20: At t = 4.0000e+05      y =  4.938738e-03    1.985178e-08    9.950612e-01
+group 30: At t = 4.0000e+05      y =  4.938738e-03    1.985178e-08    9.950612e-01
+group 40: At t = 4.0000e+05      y =  4.938738e-03    1.985178e-08    9.950612e-01
+group 50: At t = 4.0000e+05      y =  4.938738e-03    1.985178e-08    9.950612e-01
+group 60: At t = 4.0000e+05      y =  4.938738e-03    1.985178e-08    9.950612e-01
+group 70: At t = 4.0000e+05      y =  4.938738e-03    1.985178e-08    9.950612e-01
+group 80: At t = 4.0000e+05      y =  4.938738e-03    1.985178e-08    9.950612e-01
+group 90: At t = 4.0000e+05      y =  4.938738e-03    1.985178e-08    9.950612e-01
+group 0: At t = 4.0000e+06      y =  5.166270e-04    2.067563e-09    9.994834e-01
+group 10: At t = 4.0000e+06      y =  5.166270e-04    2.067563e-09    9.994834e-01
+group 20: At t = 4.0000e+06      y =  5.166270e-04    2.067563e-09    9.994834e-01
+group 30: At t = 4.0000e+06      y =  5.166270e-04    2.067563e-09    9.994834e-01
+group 40: At t = 4.0000e+06      y =  5.166270e-04    2.067563e-09    9.994834e-01
+group 50: At t = 4.0000e+06      y =  5.166270e-04    2.067563e-09    9.994834e-01
+group 60: At t = 4.0000e+06      y =  5.166270e-04    2.067563e-09    9.994834e-01
+group 70: At t = 4.0000e+06      y =  5.166270e-04    2.067563e-09    9.994834e-01
+group 80: At t = 4.0000e+06      y =  5.166270e-04    2.067563e-09    9.994834e-01
+group 90: At t = 4.0000e+06      y =  5.166270e-04    2.067563e-09    9.994834e-01
+group 0: At t = 4.0000e+07      y =  5.202720e-05    2.081194e-10    9.999480e-01
+group 10: At t = 4.0000e+07      y =  5.202720e-05    2.081194e-10    9.999480e-01
+group 20: At t = 4.0000e+07      y =  5.202720e-05    2.081194e-10    9.999480e-01
+group 30: At t = 4.0000e+07      y =  5.202720e-05    2.081194e-10    9.999480e-01
+group 40: At t = 4.0000e+07      y =  5.202720e-05    2.081194e-10    9.999480e-01
+group 50: At t = 4.0000e+07      y =  5.202720e-05    2.081194e-10    9.999480e-01
+group 60: At t = 4.0000e+07      y =  5.202720e-05    2.081194e-10    9.999480e-01
+group 70: At t = 4.0000e+07      y =  5.202720e-05    2.081194e-10    9.999480e-01
+group 80: At t = 4.0000e+07      y =  5.202720e-05    2.081194e-10    9.999480e-01
+group 90: At t = 4.0000e+07      y =  5.202720e-05    2.081194e-10    9.999480e-01
+group 0: At t = 4.0000e+08      y =  5.214602e-06    2.085852e-11    9.999948e-01
+group 10: At t = 4.0000e+08      y =  5.214602e-06    2.085852e-11    9.999948e-01
+group 20: At t = 4.0000e+08      y =  5.214602e-06    2.085852e-11    9.999948e-01
+group 30: At t = 4.0000e+08      y =  5.214602e-06    2.085852e-11    9.999948e-01
+group 40: At t = 4.0000e+08      y =  5.214602e-06    2.085852e-11    9.999948e-01
+group 50: At t = 4.0000e+08      y =  5.214602e-06    2.085852e-11    9.999948e-01
+group 60: At t = 4.0000e+08      y =  5.214602e-06    2.085852e-11    9.999948e-01
+group 70: At t = 4.0000e+08      y =  5.214602e-06    2.085852e-11    9.999948e-01
+group 80: At t = 4.0000e+08      y =  5.214602e-06    2.085852e-11    9.999948e-01
+group 90: At t = 4.0000e+08      y =  5.214602e-06    2.085852e-11    9.999948e-01
+group 0: At t = 4.0000e+09      y =  5.160314e-07    2.064127e-12    9.999995e-01
+group 10: At t = 4.0000e+09      y =  5.160314e-07    2.064127e-12    9.999995e-01
+group 20: At t = 4.0000e+09      y =  5.160314e-07    2.064127e-12    9.999995e-01
+group 30: At t = 4.0000e+09      y =  5.160314e-07    2.064127e-12    9.999995e-01
+group 40: At t = 4.0000e+09      y =  5.160314e-07    2.064127e-12    9.999995e-01
+group 50: At t = 4.0000e+09      y =  5.160314e-07    2.064127e-12    9.999995e-01
+group 60: At t = 4.0000e+09      y =  5.160314e-07    2.064127e-12    9.999995e-01
+group 70: At t = 4.0000e+09      y =  5.160314e-07    2.064127e-12    9.999995e-01
+group 80: At t = 4.0000e+09      y =  5.160314e-07    2.064127e-12    9.999995e-01
+group 90: At t = 4.0000e+09      y =  5.160314e-07    2.064127e-12    9.999995e-01
+group 0: At t = 4.0000e+10      y =  5.791263e-08    2.316505e-13    9.999999e-01
+group 10: At t = 4.0000e+10      y =  5.791263e-08    2.316505e-13    9.999999e-01
+group 20: At t = 4.0000e+10      y =  5.791263e-08    2.316505e-13    9.999999e-01
+group 30: At t = 4.0000e+10      y =  5.791263e-08    2.316505e-13    9.999999e-01
+group 40: At t = 4.0000e+10      y =  5.791263e-08    2.316505e-13    9.999999e-01
+group 50: At t = 4.0000e+10      y =  5.791263e-08    2.316505e-13    9.999999e-01
+group 60: At t = 4.0000e+10      y =  5.791263e-08    2.316505e-13    9.999999e-01
+group 70: At t = 4.0000e+10      y =  5.791263e-08    2.316505e-13    9.999999e-01
+group 80: At t = 4.0000e+10      y =  5.791263e-08    2.316505e-13    9.999999e-01
+group 90: At t = 4.0000e+10      y =  5.791263e-08    2.316505e-13    9.999999e-01
 
 Final Statistics:
-nst = 604    nfe  = 859    nsetups = 121    nje = 12
-nni = 856    ncfn = 0      netf = 30        nge = 0
+nst = 547    nfe  = 812    nsetups = 117    nje = 13
+nni = 809    ncfn = 0      netf = 29        nge = 0
  
 cuSolverSp numerical factorization workspace size (in bytes) = 73472
-cuSolverSp internal Q, R buffer size (in bytes) = 9600
+cuSolverSp internal Q, R buffer size (in bytes) = 9600
\ No newline at end of file
diff --git a/examples/cvode/fcmix_parallel/CMakeLists.txt b/examples/cvode/fcmix_parallel/CMakeLists.txt
index 5bfa7e0bae..61a692396d 100644
--- a/examples/cvode/fcmix_parallel/CMakeLists.txt
+++ b/examples/cvode/fcmix_parallel/CMakeLists.txt
@@ -44,6 +44,11 @@ else()
   set(FNVECP_LIB sundials_fnvecparallel_shared)
 endif()
 
+if(SUNDIALS_BUILD_PACKAGE_FUSED_KERNELS)
+  list(APPEND CVODE_LIB
+       sundials_cvode_fused_stubs_${LINK_LIBRARY_TYPE})
+endif()
+
 # Only static FCMIX libraries are available
 set(FCVODE_LIB sundials_fcvode_static)
 
@@ -98,6 +103,9 @@ if(EXAMPLES_INSTALL)
   set(SOLVER "CVODE")
   set(SOLVER_LIB "sundials_cvode")
   set(SOLVER_FLIB "sundials_fcvode")
+  if(SUNDIALS_BUILD_PACKAGE_FUSED_KERNELS)
+    set(LIBS "-lsundials_cvode_fused_stubs ${LIBS}")
+  endif()
 
   examples2string(FCVODE_examples EXAMPLES)
 
diff --git a/examples/cvode/fcmix_serial/CMakeLists.txt b/examples/cvode/fcmix_serial/CMakeLists.txt
index d0b0a9d5d7..04d7cb9314 100644
--- a/examples/cvode/fcmix_serial/CMakeLists.txt
+++ b/examples/cvode/fcmix_serial/CMakeLists.txt
@@ -60,6 +60,11 @@ else()
   set(FNVECS_LIB sundials_fnvecserial_shared)
 endif()
 
+if(SUNDIALS_BUILD_PACKAGE_FUSED_KERNELS)
+  list(APPEND CVODE_LIB
+       sundials_cvode_fused_stubs_${LINK_LIBRARY_TYPE})
+endif()
+
 # Only static FCMIX libraries are available
 set(FCVODE_LIB sundials_fcvode_static)
 
@@ -264,6 +269,9 @@ if(EXAMPLES_INSTALL)
   set(SOLVER "CVODE")
   set(SOLVER_LIB "sundials_cvode")
   set(SOLVER_FLIB "sundials_fcvode")
+  if(SUNDIALS_BUILD_PACKAGE_FUSED_KERNELS)
+    set(LIBS "-lsundials_cvode_fused_stubs ${LIBS}")
+  endif()
 
   examples2string(FCVODE_examples EXAMPLES)
 
diff --git a/examples/cvode/parallel/CMakeLists.txt b/examples/cvode/parallel/CMakeLists.txt
index c4b4223c55..7cd7aad65a 100644
--- a/examples/cvode/parallel/CMakeLists.txt
+++ b/examples/cvode/parallel/CMakeLists.txt
@@ -41,6 +41,11 @@ else()
   set(NVECP_LIB sundials_nvecparallel_shared)
 endif()
 
+if(SUNDIALS_BUILD_PACKAGE_FUSED_KERNELS)
+  list(APPEND CVODE_LIB
+       sundials_cvode_fused_stubs_${LINK_LIBRARY_TYPE})
+endif()
+
 # Set-up linker flags and link libraries
 set(SUNDIALS_LIBS ${CVODE_LIB} ${NVECP_LIB} ${EXTRA_LINK_LIBS})
 
@@ -90,6 +95,9 @@ if(EXAMPLES_INSTALL)
   # Prepare substitution variables for Makefile and/or CMakeLists templates
   set(SOLVER "CVODE")
   set(SOLVER_LIB "sundials_cvode")
+  if(SUNDIALS_BUILD_PACKAGE_FUSED_KERNELS)
+    set(LIBS "-lsundials_cvode_fused_stubs ${LIBS}")
+  endif()
 
   examples2string(CVODE_examples EXAMPLES)
 
diff --git a/examples/cvode/parhyp/CMakeLists.txt b/examples/cvode/parhyp/CMakeLists.txt
index 61f7b62882..dea5f716ba 100644
--- a/examples/cvode/parhyp/CMakeLists.txt
+++ b/examples/cvode/parhyp/CMakeLists.txt
@@ -40,10 +40,14 @@ else()
   set(NVECP_LIB sundials_nvecparhyp_shared)
 endif()
 
+if(SUNDIALS_BUILD_PACKAGE_FUSED_KERNELS)
+  list(APPEND CVODE_LIB
+       sundials_cvode_fused_stubs_${LINK_LIBRARY_TYPE})
+endif()
+
 # Set-up linker flags and link libraries
 set(SUNDIALS_LIBS ${CVODE_LIB} ${NVECP_LIB} ${EXTRA_LINK_LIBS})
 
-
 # Add the build and install targets for each example
 foreach(example_tuple ${CVODE_examples})
 
@@ -90,6 +94,9 @@ if(EXAMPLES_INSTALL)
   # Prepare substitution variables for Makefile and/or CMakeLists templates
   set(SOLVER "CVODE")
   set(SOLVER_LIB "sundials_cvode")
+  if(SUNDIALS_BUILD_PACKAGE_FUSED_KERNELS)
+    set(LIBS "-lsundials_cvode_fused_stubs ${LIBS}")
+  endif()
 
   examples2string(CVODE_examples EXAMPLES)
 
diff --git a/examples/cvode/petsc/CMakeLists.txt b/examples/cvode/petsc/CMakeLists.txt
index 621d3d57da..3f368319ae 100644
--- a/examples/cvode/petsc/CMakeLists.txt
+++ b/examples/cvode/petsc/CMakeLists.txt
@@ -43,6 +43,11 @@ else()
   set(SUNNLS_LIB sundials_sunnonlinsolpetscsnes_shared)
 endif()
 
+if(SUNDIALS_BUILD_PACKAGE_FUSED_KERNELS)
+  list(APPEND CVODE_LIB
+       sundials_cvode_fused_stubs_${LINK_LIBRARY_TYPE})
+endif()
+
 # Set-up linker flags and link libraries
 set(SUNDIALS_LIBS ${CVODE_LIB} ${NVECP_LIB} ${SUNNLS_LIB} ${EXTRA_LINK_LIBS})
 
@@ -90,6 +95,9 @@ if(EXAMPLES_INSTALL)
   # Prepare substitution variables for Makefile and/or CMakeLists templates
   set(SOLVER "CVODE")
   set(SOLVER_LIB "sundials_cvode")
+  if(SUNDIALS_BUILD_PACKAGE_FUSED_KERNELS)
+    set(LIBS "-lsundials_cvode_fused_stubs ${LIBS}")
+  endif()
 
   examples2string(CVODE_examples EXAMPLES)
 
diff --git a/examples/cvode/raja/CMakeLists.txt b/examples/cvode/raja/CMakeLists.txt
index 32f49729fc..9c759e3f3c 100644
--- a/examples/cvode/raja/CMakeLists.txt
+++ b/examples/cvode/raja/CMakeLists.txt
@@ -38,6 +38,11 @@ else()
   set(NVECS_LIB sundials_nveccudaraja_shared)
 endif()
 
+if(SUNDIALS_BUILD_PACKAGE_FUSED_KERNELS)
+  list(APPEND CVODE_LIB
+       sundials_cvode_fused_stubs_${LINK_LIBRARY_TYPE})
+endif()
+
 set(SUNDIALS_LIBS ${CVODE_LIB} ${NVECS_LIB} ${EXTRA_LINK_LIBS})
 
 # Add the build and install targets for each CVODE example
@@ -80,6 +85,9 @@ if(EXAMPLES_INSTALL)
   set(SOLVER "CVODE")
   set(SOLVER_LIB "sundials_cvode")
   set(NVECTOR_LIB "sundials_nveccudaraja")
+  if(SUNDIALS_BUILD_PACKAGE_FUSED_KERNELS)
+    set(LIBS "-lsundials_cvode_fused_stubs ${LIBS}")
+  endif()
 
   examples2string(CVODE_examples EXAMPLES)
 
diff --git a/examples/cvode/serial/CMakeLists.txt b/examples/cvode/serial/CMakeLists.txt
index ab17d6a821..aae4fbbff8 100644
--- a/examples/cvode/serial/CMakeLists.txt
+++ b/examples/cvode/serial/CMakeLists.txt
@@ -15,40 +15,52 @@
 # CMakeLists.txt file for CVODE serial examples
 # ---------------------------------------------------------------
 
-# Example lists are tuples "name\;type" where the type is
+# Example lists are tuples "name\;args\;type" where the type is
 # 'develop' for examples excluded from 'make test' in releases
 
 # Examples using SUNDIALS linear solvers
 set(CVODE_examples
-  "cvRoberts_dns\;"
-  "cvRoberts_dns_uw\;develop"
-  "cvRoberts_dns_negsol\;develop"
-  "cvRoberts_dns_constraints\;develop"
-  "cvAdvDiff_bnd\;develop"
-  "cvDirectDemo_ls\;develop"
-  "cvDiurnal_kry_bp\;develop"
-  "cvDiurnal_kry\;develop"
-  "cvDisc_dns\;develop"
-  "cvKrylovDemo_ls\;develop"
-  "cvKrylovDemo_prec\;develop"
+  "cvAdvDiff_bnd\;\;develop"
+  "cvDirectDemo_ls\;\;develop"
+  "cvDiurnal_kry_bp\;\;develop"
+  "cvDiurnal_kry\;\;develop"
+  "cvDisc_dns\;\;develop"
+  "cvKrylovDemo_ls\;\;develop"
+  "cvKrylovDemo_prec\;\;develop"
+  "cvParticle_dns\;\;develop"
+  "cvPendulum_dns\;\;develop"
+  "cvRoberts_dns\;\;"
+  "cvRoberts_dns_uw\;\;develop"
+  "cvRoberts_dns_negsol\;\;develop"
+  "cvRoberts_dns_constraints\;\;develop"
   #cvAdvDiffReac_kry\;develop" # not released
   )
 
+if(SUNDIALS_BUILD_WITH_MONITORING)
+  list(APPEND CVODE_examples "cvKrylovDemo_ls\;1\;develop")
+endif()
+
 # Examples using LAPACK linear solvers
 set(CVODE_examples_BL
-  "cvAdvDiff_bndL\;develop"
-  "cvRoberts_dnsL\;develop"
+  "cvAdvDiff_bndL\;\;develop"
+  "cvRoberts_dnsL\;\;develop"
   )
 
 # Examples using KLU linear solver
 set(CVODE_examples_KLU
-  "cvRoberts_klu\;develop"
-  "cvRoberts_block_klu\;develop"
+  "cvRoberts_klu\;\;develop"
+  "cvRoberts_block_klu\;\;develop"
   )
 
 # Examples using SuperLU_MT linear solver
 set(CVODE_examples_SUPERLUMT
-  "cvRoberts_sps\;develop"
+  "cvRoberts_sps\;\;develop"
+  )
+
+# Auxiliary files to install
+set(CVODE_extras
+  plot_cvParticle.py
+  plot_cvPendulum.py
   )
 
 # Specify libraries to link against (through the target that was used to
@@ -61,34 +73,55 @@ else()
   set(NVECS_LIB sundials_nvecserial_shared)
 endif()
 
+if(SUNDIALS_BUILD_PACKAGE_FUSED_KERNELS)
+  list(APPEND CVODE_LIB
+       sundials_cvode_fused_stubs_${LINK_LIBRARY_TYPE})
+endif()
+
 # Set-up linker flags and link libraries
 set(SUNDIALS_LIBS ${CVODE_LIB} ${NVECS_LIB} ${EXTRA_LINK_LIBS})
 
-
 # Add the build and install targets for each example
 foreach(example_tuple ${CVODE_examples})
 
   # parse the example tuple
   list(GET example_tuple 0 example)
-  list(GET example_tuple 1 example_type)
+  list(GET example_tuple 1 example_args)
+  list(GET example_tuple 2 example_type)
+
+  # check if this example has already been added, only need to add
+  # example source files once for testing with different inputs
+  if(NOT TARGET ${example})
+    # example source files
+    add_executable(${example} ${example}.c)
+
+    # folder to organize targets in an IDE
+    set_target_properties(${example} PROPERTIES FOLDER "Examples")
 
-  # example source files
-  add_executable(${example} ${example}.c)
+    # libraries to link against
+    target_link_libraries(${example} ${SUNDIALS_LIBS})
+  endif()
 
-  set_target_properties(${example} PROPERTIES FOLDER "Examples")
+  # check if example args are provided and set the test name
+  if("${example_args}" STREQUAL "")
+    set(test_name ${example})
+  else()
+    string(REGEX REPLACE " " "_" test_name ${example}_${example_args})
+  endif()
 
   # add example to regression tests
-  sundials_add_test(${example} ${example}
+  sundials_add_test(${test_name} ${example}
+    TEST_ARGS ${example_args}
     ANSWER_DIR ${CMAKE_CURRENT_SOURCE_DIR}
-    ANSWER_FILE ${example}.out
+    ANSWER_FILE ${test_name}.out
     EXAMPLE_TYPE ${example_type})
 
-  # libraries to link against
-  target_link_libraries(${example} ${SUNDIALS_LIBS})
+  # find all .out files for this example
+  file(GLOB example_out ${example}*.out)
 
-  # install example source and out files
+  # install example source and .out files
   if(EXAMPLES_INSTALL)
-    install(FILES ${example}.c ${example}.out
+    install(FILES ${example}.c ${example_out}
       DESTINATION ${EXAMPLES_INSTALL_PATH}/cvode/serial)
   endif()
 
@@ -121,25 +154,42 @@ if(LAPACK_FOUND)
 
     # parse the example tuple
     list(GET example_tuple 0 example)
-    list(GET example_tuple 1 example_type)
+    list(GET example_tuple 1 example_args)
+    list(GET example_tuple 2 example_type)
 
-    # example source files
-    add_executable(${example} ${example}.c)
+    # check if this example has already been added, only need to add
+    # example source files once for testing with different inputs
+    if(NOT TARGET ${example})
+      # example source files
+      add_executable(${example} ${example}.c)
 
-    set_target_properties(${example} PROPERTIES FOLDER "Examples")
+      # folder to organize targets in an IDE
+      set_target_properties(${example} PROPERTIES FOLDER "Examples")
+
+      # libraries to link against
+      target_link_libraries(${example} ${SUNDIALS_LIBS} ${SUNLINSOLLAPACK_LIBS})
+    endif()
+
+    # check if example args are provided and set the test name
+    if("${example_args}" STREQUAL "")
+      set(test_name ${example})
+    else()
+      string(REGEX REPLACE " " "_" test_name ${example}_${example_args})
+    endif()
 
     # add example to regression tests
-    sundials_add_test(${example} ${example}
+    sundials_add_test(${test_name} ${example}
+      TEST_ARGS ${example_args}
       ANSWER_DIR ${CMAKE_CURRENT_SOURCE_DIR}
-      ANSWER_FILE ${example}.out
+      ANSWER_FILE ${test_name}.out
       EXAMPLE_TYPE ${example_type})
 
-    # libraries to link against
-    target_link_libraries(${example} ${SUNDIALS_LIBS} ${SUNLINSOLLAPACK_LIBS})
+    # find all .out files for this example
+    file(GLOB example_out ${example}*.out)
 
-    # install example source and out files
+    # install example source and .out files
     if(EXAMPLES_INSTALL)
-      install(FILES ${example}.c ${example}.out
+      install(FILES ${example}.c ${example_out}
         DESTINATION ${EXAMPLES_INSTALL_PATH}/cvode/serial)
     endif()
 
@@ -165,25 +215,42 @@ if(KLU_FOUND)
 
     # parse the example tuple
     list(GET example_tuple 0 example)
-    list(GET example_tuple 1 example_type)
+    list(GET example_tuple 1 example_args)
+    list(GET example_tuple 2 example_type)
 
-    # example source files
-    add_executable(${example} ${example}.c)
+    # check if this example has already been added, only need to add
+    # example source files once for testing with different inputs
+    if(NOT TARGET ${example})
+      # add example source files
+      add_executable(${example} ${example}.c)
 
-    set_target_properties(${example} PROPERTIES FOLDER "Examples")
+      # folder to organize targets in an IDE
+      set_target_properties(${example} PROPERTIES FOLDER "Examples")
+
+      # libraries to link against
+      target_link_libraries(${example} ${SUNDIALS_LIBS} ${SUNLINSOLKLU_LIBS})
+    endif()
+
+    # check if example args are provided and set the test name
+    if("${example_args}" STREQUAL "")
+      set(test_name ${example})
+    else()
+      string(REGEX REPLACE " " "_" test_name ${example}_${example_args})
+    endif()
 
     # add example to regression tests
-    sundials_add_test(${example} ${example}
+    sundials_add_test(${test_name} ${example}
+      TEST_ARGS ${example_args}
       ANSWER_DIR ${CMAKE_CURRENT_SOURCE_DIR}
-      ANSWER_FILE ${example}.out
+      ANSWER_FILE ${test_name}.out
       EXAMPLE_TYPE ${example_type})
 
-    # libraries to link against
-    target_link_libraries(${example} ${SUNDIALS_LIBS} ${SUNLINSOLKLU_LIBS})
+    # find all .out files for this example
+    file(GLOB example_out ${example}*.out)
 
-    # install example source and out files
+    # install example source and .out files
     if(EXAMPLES_INSTALL)
-      install(FILES ${example}.c ${example}.out
+      install(FILES ${example}.c ${example_out}
         DESTINATION ${EXAMPLES_INSTALL_PATH}/cvode/serial)
     endif()
 
@@ -214,25 +281,35 @@ if(SUPERLUMT_FOUND)
 
     # parse the example tuple
     list(GET example_tuple 0 example)
-    list(GET example_tuple 1 example_type)
+    list(GET example_tuple 1 example_args)
+    list(GET example_tuple 2 example_type)
 
-    # example source files
-    add_executable(${example} ${example}.c)
+    # check if this example has already been added, only need to add
+    # example source files once for testing with different inputs
+    if(NOT TARGET ${example})
+      # add example source files
+      add_executable(${example} ${example}.c)
 
-    set_target_properties(${example} PROPERTIES FOLDER "Examples")
+      # folder to organize targets in an IDE
+      set_target_properties(${example} PROPERTIES FOLDER "Examples")
 
-    # add example to regression tests
-    sundials_add_test(${example} ${example}
-      ANSWER_DIR ${CMAKE_CURRENT_SOURCE_DIR}
-      ANSWER_FILE ${example}.out
-      EXAMPLE_TYPE ${example_type})
+      # libraries to link against
+      target_link_libraries(${example} ${SUNDIALS_LIBS} ${SUNLINSOLSLUMT_LIBS})
+    endif()
 
-    # libraries to link against
-    target_link_libraries(${example} ${SUNDIALS_LIBS} ${SUNLINSOLSLUMT_LIBS})
+    # check if example args are provided and set the test name
+    if("${example_args}" STREQUAL "")
+      set(test_name ${example})
+    else()
+      string(REGEX REPLACE " " "_" test_name ${example}_${example_args})
+    endif()
+
+    # find all .out files for this example
+    file(GLOB example_out ${example}*.out)
 
-    # install example source and out files
+    # install example source and .out files
     if(EXAMPLES_INSTALL)
-      install(FILES ${example}.c ${example}.out
+      install(FILES ${example}.c ${example_out}
         DESTINATION ${EXAMPLES_INSTALL_PATH}/cvode/serial)
     endif()
 
@@ -247,9 +324,17 @@ if(EXAMPLES_INSTALL)
   # Install the README file
   install(FILES README DESTINATION ${EXAMPLES_INSTALL_PATH}/cvode/serial)
 
+  # Install the extra files
+  foreach(extrafile ${CVODE_extras})
+    install(FILES ${extrafile} DESTINATION ${EXAMPLES_INSTALL_PATH}/cvode/serial)
+  endforeach()
+
   # Prepare substitution variables for Makefile and/or CMakeLists templates
   set(SOLVER "CVODE")
   set(SOLVER_LIB "sundials_cvode")
+  if(SUNDIALS_BUILD_PACKAGE_FUSED_KERNELS)
+    set(LIBS "-lsundials_cvode_fused_stubs ${LIBS}")
+  endif()
 
   examples2string(CVODE_examples EXAMPLES)
 
diff --git a/examples/cvode/serial/cvKrylovDemo_ls.c b/examples/cvode/serial/cvKrylovDemo_ls.c
index cfa96d388e..8cf582eee1 100644
--- a/examples/cvode/serial/cvKrylovDemo_ls.c
+++ b/examples/cvode/serial/cvKrylovDemo_ls.c
@@ -1,6 +1,6 @@
 /* -----------------------------------------------------------------
  * Programmer(s): Scott D. Cohen, Alan C. Hindmarsh and
- *                Radu Serban @ LLNL
+ *                Radu Serban, Cody J. Balos @ LLNL
  * -----------------------------------------------------------------
  * SUNDIALS Copyright Start
  * Copyright (c) 2002-2020, Lawrence Livermore National Security
@@ -13,7 +13,7 @@
  * SUNDIALS Copyright End
  * -----------------------------------------------------------------
  * This example loops through the available iterative linear solvers:
- * SPGMR, SPBCG and SPTFQMR.
+ * SPGMR, SPFGMR, SPBCG and SPTFQMR.
  *
  * Example problem:
  *
@@ -32,10 +32,10 @@
  *   0 <= t <= 86400 sec (1 day).
  * The PDE system is treated by central differences on a uniform
  * 10 x 10 mesh, with simple polynomial initial profiles.
- * The problem is solved with CVODE, with the BDF/GMRES,
+ * The problem is solved with CVODE, with the BDF/GMRES, BDF/FGMRES
  * BDF/Bi-CGStab, and BDF/TFQMR methods (i.e. using the SUNLinSol_SPGMR,
- * SUNLinSol_SPBCGS and SUNLinSol_SPTFQMR linear solvers) and the
- * block-diagonal part of the Newton matrix as a left preconditioner.
+ * SUNLinSol_SPFGMR, SUNLinSol_SPBCGS, and SUNLinSol_SPTFQMR linear solvers)
+ * and the block-diagonal part of the Newton matrix as a left preconditioner.
  * A copy of the block-diagonal part of the Jacobian is saved and
  * conditionally reused within the Precond routine.
  * -----------------------------------------------------------------*/
@@ -44,13 +44,15 @@
 #include <stdlib.h>
 #include <math.h>
 
-#include <cvode/cvode.h>                 /* main integrator header file       */
-#include <sunlinsol/sunlinsol_spgmr.h>   /* access to SPGMR SUNLinearSolver   */
-#include <sunlinsol/sunlinsol_spbcgs.h>  /* access to SPBCGS SUNLinearSolver  */
-#include <sunlinsol/sunlinsol_sptfqmr.h> /* access to SPTFQMR SUNLinearSolver */
-#include <nvector/nvector_serial.h>      /* serial N_Vector types, fct. and macros */
-#include <sundials/sundials_dense.h>     /* use generic DENSE solver in preconditioning */
-#include <sundials/sundials_types.h>     /* definition of realtype */
+#include <cvode/cvode.h>                      /* main integrator header file                 */
+#include <sunlinsol/sunlinsol_spgmr.h>        /* access to SPGMR SUNLinearSolver             */
+#include <sunlinsol/sunlinsol_spfgmr.h>       /* access to SPFGMR SUNLinearSolver            */
+#include <sunlinsol/sunlinsol_spbcgs.h>       /* access to SPBCGS SUNLinearSolver            */
+#include <sunlinsol/sunlinsol_sptfqmr.h>      /* access to SPTFQMR SUNLinearSolver           */
+#include <sunnonlinsol/sunnonlinsol_newton.h> /* access to Newton SUNNonlinearSolver         */
+#include <nvector/nvector_serial.h>           /* serial N_Vector types, fct. and macros      */
+#include <sundials/sundials_dense.h>          /* use generic DENSE solver in preconditioning */
+#include <sundials/sundials_types.h>          /* definition of realtype                      */
 
 /* helpful macros */
 
@@ -105,8 +107,9 @@
 /* Linear Solver Loop Constants */
 
 #define USE_SPGMR   0
-#define USE_SPBCG   1
-#define USE_SPTFQMR 2
+#define USE_SPFGMR  1
+#define USE_SPBCG   2
+#define USE_SPTFQMR 3
 
 /* User-defined vector and matrix accessor macros: IJKth, IJth */
 
@@ -132,22 +135,25 @@
 #define IJth(a,i,j)        (a[j-1][i-1])
 
 /* Type : UserData
-   contains preconditioner blocks, pivot arrays, and problem constants */
+   contains preconditioner blocks, pivot arrays, problem constants,
+   the solution vector, and the linear solver type */
 
 typedef struct {
   realtype **P[MX][MY], **Jbd[MX][MY];
   sunindextype *pivot[MX][MY];
   realtype q4, om, dx, dy, hdco, haco, vdco;
+  N_Vector u;
+  int linsolver;
 } *UserData;
 
 /* Private Helper Functions */
 
 static UserData AllocUserData(void);
-static void InitUserData(UserData data);
+static void InitUserData(UserData data, N_Vector u);
 static void FreeUserData(UserData data);
 static void SetInitialProfiles(N_Vector u, realtype dx, realtype dy);
 static void PrintOutput(void *cvode_mem, N_Vector u, realtype t);
-static void PrintFinalStats(void *cvode_mem, int linsolver);
+static void PrintStats(void *cvode_mem, int linsolver, int final);
 static int check_retval(void *returnvalue, const char *funcname, int opt);
 
 /* Functions Called by the Solver */
@@ -162,6 +168,7 @@ static int PSolve(realtype tn, N_Vector u, N_Vector fu,
                   realtype gamma, realtype delta,
                   int lr, void *user_data);
 
+static int myMonitorFunction(void *cvode_mem, void *user_data);
 
 /*
  *-------------------------------
@@ -169,26 +176,40 @@ static int PSolve(realtype tn, N_Vector u, N_Vector fu,
  *-------------------------------
  */
 
-int main(void)
+int main(int argc, char* argv[])
 {
   realtype abstol, reltol, t, tout;
   N_Vector u;
   UserData data;
   SUNLinearSolver LS;
+  SUNNonlinearSolver NLS;
   void *cvode_mem;
   int linsolver, iout, retval;
+  FILE* infofp;
+  int monitor;
 
   u = NULL;
   data = NULL;
   LS = NULL;
   cvode_mem = NULL;
+  monitor = 0;
+
+  if (argc == 2) {
+    monitor = atoi(argv[1]);
+  }
+
+  /* Open info file if monitoring is turned on */
+  if (monitor) {
+    infofp = fopen("cvKrylovDemo_ls-info.txt", "w+");
+    if (check_retval((void *)infofp, "fopen", 0)) return(1);
+  }
 
   /* Allocate memory, and set problem data, initial values, tolerances */
   u = N_VNew_Serial(NEQ);
-  if(check_retval((void *)u, "N_VNew_Serial", 0)) return(1);
+  if (check_retval((void *)u, "N_VNew_Serial", 0)) return(1);
   data = AllocUserData();
-  if(check_retval((void *)data, "AllocUserData", 2)) return(1);
-  InitUserData(data);
+  if (check_retval((void *)data, "AllocUserData", 2)) return(1);
+  InitUserData(data, u);
   SetInitialProfiles(u, data->dx, data->dy);
   abstol=ATOL;
   reltol=RTOL;
@@ -196,30 +217,55 @@ int main(void)
   /* Call CVodeCreate to create the solver memory and specify the
    * Backward Differentiation Formula */
   cvode_mem = CVodeCreate(CV_BDF);
-  if(check_retval((void *)cvode_mem, "CVodeCreate", 0)) return(1);
+  if (check_retval((void *)cvode_mem, "CVodeCreate", 0)) return(1);
 
   /* Set the pointer to user-defined data */
   retval = CVodeSetUserData(cvode_mem, data);
-  if(check_retval(&retval, "CVodeSetUserData", 1)) return(1);
+  if (check_retval(&retval, "CVodeSetUserData", 1)) return(1);
 
   /* Call CVodeInit to initialize the integrator memory and specify the
    * user's right hand side function in u'=f(t,u), the inital time T0, and
    * the initial dependent variable vector u. */
   retval = CVodeInit(cvode_mem, f, T0, u);
-  if(check_retval(&retval, "CVodeInit", 1)) return(1);
+  if (check_retval(&retval, "CVodeInit", 1)) return(1);
 
   /* Call CVodeSStolerances to specify the scalar relative tolerance
    * and scalar absolute tolerances */
   retval = CVodeSStolerances(cvode_mem, reltol, abstol);
   if (check_retval(&retval, "CVodeSStolerances", 1)) return(1);
 
-  /* START: Loop through SPGMR, SPBCG and SPTFQMR linear solver modules */
-  for (linsolver = 0; linsolver < 3; ++linsolver) {
+  /* Set a function that CVode will call every 50 successful time steps.
+   * This will be used to monitor the solution and integrator statistics. */
+  if (monitor) {
+    retval = CVodeSetMonitorFn(cvode_mem, myMonitorFunction);
+    if (check_retval(&retval, "CVodeSetMonitorFn", 1)) return(1);
+    retval = CVodeSetMonitorFrequency(cvode_mem, 50);
+    if (check_retval(&retval, "CVodeSetMonitorFrequency", 1)) return(1);
+  }
+
+  /* Create the SUNNonlinearSolver (check the returned handle, not &retval) */
+  NLS = SUNNonlinSol_Newton(u);
+  if (check_retval((void *)NLS, "SUNNonlinSol_Newton", 0)) return(1);
+  if (monitor) {
+    /* Set the print level to 1, so that the nonlinear residual
+       is printed every Newton iteration. */
+    retval = SUNNonlinSolSetPrintLevel_Newton(NLS, 1);
+    if (check_retval(&retval, "SUNNonlinSolSetPrintLevel_Newton", 1)) return(1);
+    retval = SUNNonlinSolSetInfoFile_Newton(NLS, infofp);
+    if (check_retval(&retval, "SUNNonlinSolSetInfoFile_Newton", 1)) return(1);
+  }
+
+  /* Call CVodeSetNonlinearSolver to attach the nonlinear solver to CVode */
+  retval = CVodeSetNonlinearSolver(cvode_mem, NLS);
+  if (check_retval(&retval, "CVodeSetNonlinearSolver", 1)) return(1);
+
+  /* START: Loop through SPGMR, SPFGMR, SPBCG and SPTFQMR linear solver modules */
+  for (linsolver = 0; linsolver < 4; ++linsolver) {
 
     if (linsolver != 0) {
 
       /* Re-initialize user data */
-      InitUserData(data);
+      InitUserData(data, u);
       SetInitialProfiles(u, data->dx, data->dy);
 
     /* Re-initialize CVode for the solution of the same problem, but
@@ -232,6 +278,9 @@ int main(void)
     /* Free previous linear solver and attach a new linear solver module */
     SUNLinSolFree(LS);
 
+    /* Set the linear solver type in user data */
+    data->linsolver = linsolver;
+
     switch(linsolver) {
 
     /* (a) SPGMR */
@@ -241,74 +290,132 @@ int main(void)
       printf(" -------");
       printf(" \n| SPGMR |\n");
       printf(" -------\n");
+      if (monitor) {
+        fprintf(infofp, " ---------");
+        fprintf(infofp, " \n| SPGMR |\n");
+        fprintf(infofp, " ---------\n");
+      }
 
       /* Call SUNLinSol_SPGMR to specify the linear solver SPGMR with
          left preconditioning and the default maximum Krylov dimension */
       LS = SUNLinSol_SPGMR(u, PREC_LEFT, 0);
-      if(check_retval((void *)LS, "SUNLinSol_SPGMR", 0)) return(1);
+      if (check_retval((void *)LS, "SUNLinSol_SPGMR", 0)) return(1);
+      if (monitor) {
+        retval = SUNLinSolSetPrintLevel_SPGMR(LS, 1);
+        if (check_retval(&retval, "SUNLinSolSetPrintLevel_SPGMR", 1)) return(1);
+        retval = SUNLinSolSetInfoFile_SPGMR(LS, infofp);
+        if (check_retval(&retval, "SUNLinSolSetInfoFile_SPGMR", 1)) return(1);
+      }
+      retval = CVodeSetLinearSolver(cvode_mem, LS, NULL);
+      if (check_retval(&retval, "CVodeSetLinearSolver", 1)) return 1;
+
+      break;
+
+    /* (b) SPFGMR */
+    case(USE_SPFGMR):
+
+      /* Print header */
+      printf(" ---------");
+      printf(" \n| SPFGMR |\n");
+      printf(" ---------\n");
+      if (monitor) {
+        fprintf(infofp, " ---------");
+        fprintf(infofp, " \n| SPFGMR |\n");
+        fprintf(infofp, " ---------\n");
+      }
 
+      /* Call SUNLinSol_SPFGMR to specify the linear solver SPFGMR with
+         left preconditioning and the default maximum Krylov dimension */
+      LS = SUNLinSol_SPFGMR(u, PREC_LEFT, 0);
+      if (check_retval((void *)LS, "SUNLinSol_SPFGMR", 0)) return(1);
+      if (monitor) {
+        retval = SUNLinSolSetPrintLevel_SPFGMR(LS, 1);
+        if (check_retval(&retval, "SUNLinSolSetPrintLevel_SPFGMR", 1)) return(1);
+        retval = SUNLinSolSetInfoFile_SPFGMR(LS, infofp);
+        if (check_retval(&retval, "SUNLinSolSetInfoFile_SPFGMR", 1)) return(1);
+      }
       retval = CVodeSetLinearSolver(cvode_mem, LS, NULL);
-      if(check_retval(&retval, "CVodeSetLinearSolver", 1)) return 1;
+      if (check_retval(&retval, "CVodeSetLinearSolver", 1)) return 1;
 
       break;
 
-    /* (b) SPBCG */
+    /* (c) SPBCG */
     case(USE_SPBCG):
 
       /* Print header */
       printf(" -------");
       printf(" \n| SPBCGS |\n");
       printf(" -------\n");
+      if (monitor) {
+        fprintf(infofp, " ---------");
+        fprintf(infofp, " \n| SPBCGS |\n");
+        fprintf(infofp, " ---------\n");
+      }
 
       /* Call SUNLinSol_SPBCGS to specify the linear solver SPBCGS with
          left preconditioning and the default maximum Krylov dimension */
       LS = SUNLinSol_SPBCGS(u, PREC_LEFT, 0);
-      if(check_retval((void *)LS, "SUNLinSol_SPBCGS", 0)) return(1);
-
+      if (check_retval((void *)LS, "SUNLinSol_SPBCGS", 0)) return(1);
+      if (monitor) {
+        retval = SUNLinSolSetPrintLevel_SPBCGS(LS, 1);
+        if (check_retval(&retval, "SUNLinSolSetPrintLevel_SPBCGS", 1)) return(1);
+        retval = SUNLinSolSetInfoFile_SPBCGS(LS, infofp);
+        if (check_retval(&retval, "SUNLinSolSetInfoFile_SPBCGS", 1)) return(1);
+      }
       retval = CVodeSetLinearSolver(cvode_mem, LS, NULL);
-      if(check_retval(&retval, "CVodeSetLinearSolver", 1)) return 1;
+      if (check_retval(&retval, "CVodeSetLinearSolver", 1)) return 1;
 
       break;
 
-    /* (c) SPTFQMR */
+    /* (d) SPTFQMR */
     case(USE_SPTFQMR):
 
       /* Print header */
       printf(" ---------");
       printf(" \n| SPTFQMR |\n");
       printf(" ---------\n");
+      if (monitor) {
+        fprintf(infofp, " ---------");
+        fprintf(infofp, " \n| SPTFQMR |\n");
+        fprintf(infofp, " ---------\n");
+      }
 
       /* Call SUNLinSol_SPTFQMR to specify the linear solver SPTFQMR with
          left preconditioning and the default maximum Krylov dimension */
       LS = SUNLinSol_SPTFQMR(u, PREC_LEFT, 0);
-      if(check_retval((void *)LS, "SUNLinSol_SPTFQMR", 0)) return(1);
-
+      if (check_retval((void *)LS, "SUNLinSol_SPTFQMR", 0)) return(1);
+      if (monitor) {
+        retval = SUNLinSolSetPrintLevel_SPTFQMR(LS, 1);
+        if (check_retval(&retval, "SUNLinSolSetPrintLevel_SPTFQMR", 1)) return(1);
+        retval = SUNLinSolSetInfoFile_SPTFQMR(LS, infofp);
+        if (check_retval(&retval, "SUNLinSolSetInfoFile_SPTFQMR", 1)) return(1);
+      }
       retval = CVodeSetLinearSolver(cvode_mem, LS, NULL);
-      if(check_retval(&retval, "CVodeSetLinearSolver", 1)) return 1;
+      if (check_retval(&retval, "CVodeSetLinearSolver", 1)) return 1;
 
       break;
-
     }
 
 
     /* Set preconditioner setup and solve routines Precond and PSolve,
        and the pointer to the user-defined block data */
     retval = CVodeSetPreconditioner(cvode_mem, Precond, PSolve);
-    if(check_retval(&retval, "CVodeSetPreconditioner", 1)) return(1);
+    if (check_retval(&retval, "CVodeSetPreconditioner", 1)) return(1);
 
-    /* In loop over output points, call CVode, print results, test for error */
+    /* In loop over output points, call CVode, print results, and test for error */
     printf(" \n2-species diurnal advection-diffusion problem\n\n");
     for (iout=1, tout = TWOHR; iout <= NOUT; iout++, tout += TWOHR) {
       retval = CVode(cvode_mem, tout, u, &t, CV_NORMAL);
-      PrintOutput(cvode_mem, u, t);
-      if(check_retval(&retval, "CVode", 1)) break;
+      if (!monitor) PrintOutput(cvode_mem, u, t);
+      if (check_retval(&retval, "CVode", 1)) break;
     }
-
-    PrintFinalStats(cvode_mem, linsolver);
+    if (monitor) PrintOutput(cvode_mem, u, t);
+    PrintStats(cvode_mem, linsolver, 1);
 
   }  /* END: Loop through SPGMR, SPBCG and SPTFQMR linear solver modules */
 
   /* Free memory */
+  if (monitor) fclose(infofp);
   N_VDestroy(u);
   FreeUserData(data);
   CVodeFree(&cvode_mem);
@@ -345,8 +452,9 @@ static UserData AllocUserData(void)
 
 /* Load problem constants in data */
 
-static void InitUserData(UserData data)
+static void InitUserData(UserData data, N_Vector u)
 {
+  data->u = u;
   data->om = PI/HALFDAY;
   data->dx = (XMAX-XMIN)/(MX-1);
   data->dy = (YMAX-YMIN)/(MY-1);
@@ -444,12 +552,12 @@ static void PrintOutput(void *cvode_mem, N_Vector u, realtype t)
 
 /* Get and print final statistics */
 
-static void PrintFinalStats(void *cvode_mem, int linsolver)
+static void PrintStats(void *cvode_mem, int linsolver, int final)
 {
   long int lenrw, leniw ;
   long int lenrwLS, leniwLS;
   long int nst, nfe, nsetups, nni, ncfn, netf;
-  long int nli, npe, nps, ncfl, nfeLS;
+  long int nje, nli, npe, nps, ncfl, nfeLS, njts, njte;
   int retval;
 
   retval = CVodeGetWorkSpace(cvode_mem, &lenrw, &leniw);
@@ -469,18 +577,12 @@ static void PrintFinalStats(void *cvode_mem, int linsolver)
 
   retval = CVodeGetLinWorkSpace(cvode_mem, &lenrwLS, &leniwLS);
   check_retval(&retval, "CVodeGetLinWorkSpace", 1);
-  retval = CVodeGetNumLinIters(cvode_mem, &nli);
-  check_retval(&retval, "CVodeGetNumLinIters", 1);
-  retval = CVodeGetNumPrecEvals(cvode_mem, &npe);
-  check_retval(&retval, "CVodeGetNumPrecEvals", 1);
-  retval = CVodeGetNumPrecSolves(cvode_mem, &nps);
-  check_retval(&retval, "CVodeGetNumPrecSolves", 1);
-  retval = CVodeGetNumLinConvFails(cvode_mem, &ncfl);
-  check_retval(&retval, "CVodeGetNumLinConvFails", 1);
-  retval = CVodeGetNumLinRhsEvals(cvode_mem, &nfeLS);
-  check_retval(&retval, "CVodeGetNumLinRhsEvals", 1);
-
-  printf("\nFinal Statistics.. \n\n");
+  retval = CVodeGetLinSolveStats(cvode_mem, &nje, &nfeLS, &nli, &ncfl, &npe,
+                                 &nps, &njts, &njte);
+  check_retval(&retval, "CVodeGetLinSolveStats", 1);
+
+  if (final) printf("\nFinal Statistics.. \n\n");
+  else       printf("\nIntermediate Statistics.. \n\n");
   printf("lenrw   = %5ld     leniw   = %5ld\n"  , lenrw, leniw);
   printf("lenrwLS = %5ld     leniwLS = %5ld\n"  , lenrwLS, leniwLS);
   printf("nst     = %5ld\n"                     , nst);
@@ -748,3 +850,18 @@ static int PSolve(realtype tn, N_Vector u, N_Vector fu,
 
   return(0);
 }
+
+
+/* Monitor function called by CVODE: prints current output and statistics */
+
+static int myMonitorFunction(void* cvode_mem, void* user_data)
+{
+  UserData data = (UserData) user_data;
+  realtype t = 0;
+  int retval = CVodeGetCurrentTime(cvode_mem, &t);
+
+  check_retval(&retval, "CVodeGetCurrentTime", 1);
+  PrintOutput(cvode_mem, data->u, t);
+  PrintStats(cvode_mem, data->linsolver, 0);
+  return(0);
+}
diff --git a/examples/cvode/serial/cvKrylovDemo_ls.out b/examples/cvode/serial/cvKrylovDemo_ls.out
index 522a87d9ad..6f1196281d 100644
--- a/examples/cvode/serial/cvKrylovDemo_ls.out
+++ b/examples/cvode/serial/cvKrylovDemo_ls.out
@@ -64,6 +64,74 @@ nsetups =    78     netf    =    27
 npe     =     8     nps     =  1176
 ncfn    =     0     ncfl    =     0
 
+======================================================================
+
+ --------- 
+| SPFGMR |
+ ---------
+ 
+2-species diurnal advection-diffusion problem
+
+t = 7.20e+03   no. steps = 192   order = 5   stepsize = 1.34e+02
+c1 (bot.left/middle/top rt.) =    1.047e+04     2.964e+04     1.119e+04
+c2 (bot.left/middle/top rt.) =    2.527e+11     7.154e+11     2.700e+11
+
+t = 1.44e+04   no. steps = 225   order = 5   stepsize = 3.12e+02
+c1 (bot.left/middle/top rt.) =    6.659e+06     5.316e+06     7.301e+06
+c2 (bot.left/middle/top rt.) =    2.582e+11     2.057e+11     2.833e+11
+
+t = 2.16e+04   no. steps = 248   order = 5   stepsize = 3.12e+02
+c1 (bot.left/middle/top rt.) =    2.665e+07     1.036e+07     2.931e+07
+c2 (bot.left/middle/top rt.) =    2.993e+11     1.028e+11     3.313e+11
+
+t = 2.88e+04   no. steps = 293   order = 3   stepsize = 9.18e+01
+c1 (bot.left/middle/top rt.) =    8.702e+06     1.292e+07     9.650e+06
+c2 (bot.left/middle/top rt.) =    3.380e+11     5.029e+11     3.751e+11
+
+t = 3.60e+04   no. steps = 324   order = 5   stepsize = 1.15e+02
+c1 (bot.left/middle/top rt.) =    1.404e+04     2.029e+04     1.561e+04
+c2 (bot.left/middle/top rt.) =    3.387e+11     4.895e+11     3.765e+11
+
+t = 4.32e+04   no. steps = 378   order = 5   stepsize = 5.86e+02
+c1 (bot.left/middle/top rt.) =    1.180e-09     7.956e-08     1.490e-09
+c2 (bot.left/middle/top rt.) =    3.382e+11     1.355e+11     3.804e+11
+
+t = 5.04e+04   no. steps = 391   order = 5   stepsize = 4.27e+02
+c1 (bot.left/middle/top rt.) =    6.580e-11     2.809e-08     1.507e-10
+c2 (bot.left/middle/top rt.) =    3.358e+11     4.930e+11     3.864e+11
+
+t = 5.76e+04   no. steps = 402   order = 5   stepsize = 4.96e+02
+c1 (bot.left/middle/top rt.) =    2.069e-10     1.559e-08     2.264e-10
+c2 (bot.left/middle/top rt.) =    3.320e+11     9.650e+11     3.909e+11
+
+t = 6.48e+04   no. steps = 412   order = 5   stepsize = 7.86e+02
+c1 (bot.left/middle/top rt.) =    3.735e-11    -1.076e-09    -3.688e-11
+c2 (bot.left/middle/top rt.) =    3.313e+11     8.922e+11     3.963e+11
+
+t = 7.20e+04   no. steps = 421   order = 5   stepsize = 7.86e+02
+c1 (bot.left/middle/top rt.) =   -1.410e-11    -1.148e-09     9.417e-11
+c2 (bot.left/middle/top rt.) =    3.330e+11     6.186e+11     4.039e+11
+
+t = 7.92e+04   no. steps = 430   order = 5   stepsize = 7.86e+02
+c1 (bot.left/middle/top rt.) =    1.671e-12    -2.171e-10    -4.471e-12
+c2 (bot.left/middle/top rt.) =    3.334e+11     6.669e+11     4.120e+11
+
+t = 8.64e+04   no. steps = 439   order = 5   stepsize = 7.86e+02
+c1 (bot.left/middle/top rt.) =    5.687e-13    -9.924e-11    -1.149e-12
+c2 (bot.left/middle/top rt.) =    3.352e+11     9.106e+11     4.162e+11
+
+
+Final Statistics.. 
+
+lenrw   =  2689     leniw   =    53
+lenrwLS =  3254     leniwLS =    46
+nst     =   439
+nfe     =   565     nfeLS   =   858
+nni     =   562     nli     =   858
+nsetups =    77     netf    =    26
+npe     =     8     nps     =   858
+ncfn    =     0     ncfl    =     0
+
 ======================================================================
 
  ------- 
@@ -132,8 +200,6 @@ nsetups =    72     netf    =    25
 npe     =     8     nps     =  1453
 ncfn    =     0     ncfl    =     0
 
-======================================================================
-
  --------- 
 | SPTFQMR |
  ---------
diff --git a/examples/cvode/serial/cvKrylovDemo_ls_1.out b/examples/cvode/serial/cvKrylovDemo_ls_1.out
new file mode 100644
index 0000000000..2a61824c44
--- /dev/null
+++ b/examples/cvode/serial/cvKrylovDemo_ls_1.out
@@ -0,0 +1,654 @@
+ ------- 
+| SPGMR |
+ -------
+ 
+2-species diurnal advection-diffusion problem
+
+t = 7.88e-01   no. steps = 50   order = 5   stepsize = 2.51e-02
+c1 (bot.left/middle/top rt.) =    2.151e+03     8.392e+03     2.151e+03
+c2 (bot.left/middle/top rt.) =    2.500e+11     9.756e+11     2.500e+11
+
+
+Intermediate Statistics.. 
+
+lenrw   =  2689     leniw   =    53
+lenrwLS =  2454     leniwLS =    42
+nst     =    50
+nfe     =    80     nfeLS   =    55
+nni     =    77     nli     =    55
+nsetups =    16     netf    =     6
+npe     =     1     nps     =   110
+ncfn    =     0     ncfl    =     0
+
+======================================================================
+
+t = 2.21e+00   no. steps = 100   order = 5   stepsize = 3.77e-02
+c1 (bot.left/middle/top rt.) =    4.149e-01     1.618e+00     4.149e-01
+c2 (bot.left/middle/top rt.) =    2.500e+11     9.757e+11     2.500e+11
+
+
+Intermediate Statistics.. 
+
+lenrw   =  2689     leniw   =    53
+lenrwLS =  2454     leniwLS =    42
+nst     =   100
+nfe     =   131     nfeLS   =   105
+nni     =   128     nli     =   105
+nsetups =    19     netf    =     6
+npe     =     2     nps     =   210
+ncfn    =     0     ncfl    =     0
+
+======================================================================
+
+t = 3.67e+03   no. steps = 150   order = 4   stepsize = 3.74e+01
+c1 (bot.left/middle/top rt.) =    1.236e-02     4.596e-02     1.288e-02
+c2 (bot.left/middle/top rt.) =    2.509e+11     9.328e+11     2.615e+11
+
+
+Intermediate Statistics.. 
+
+lenrw   =  2689     leniw   =    53
+lenrwLS =  2454     leniwLS =    42
+nst     =   150
+nfe     =   197     nfeLS   =   162
+nni     =   194     nli     =   162
+nsetups =    34     netf    =     8
+npe     =     3     nps     =   321
+ncfn    =     0     ncfl    =     0
+
+======================================================================
+
+t = 5.77e+03   no. steps = 200   order = 3   stepsize = 3.60e+01
+c1 (bot.left/middle/top rt.) =    3.285e+02     1.066e+03     3.480e+02
+c2 (bot.left/middle/top rt.) =    2.518e+11     8.171e+11     2.667e+11
+
+
+Intermediate Statistics.. 
+
+lenrw   =  2689     leniw   =    53
+lenrwLS =  2454     leniwLS =    42
+nst     =   200
+nfe     =   259     nfeLS   =   214
+nni     =   256     nli     =   214
+nsetups =    38     netf    =    10
+npe     =     4     nps     =   425
+ncfn    =     0     ncfl    =     0
+
+======================================================================
+
+t = 1.43e+04   no. steps = 250   order = 5   stepsize = 3.77e+02
+c1 (bot.left/middle/top rt.) =    6.387e+06     5.244e+06     7.000e+06
+c2 (bot.left/middle/top rt.) =    2.581e+11     2.116e+11     2.831e+11
+
+
+Intermediate Statistics.. 
+
+lenrw   =  2689     leniw   =    53
+lenrwLS =  2454     leniwLS =    42
+nst     =   250
+nfe     =   320     nfeLS   =   265
+nni     =   317     nli     =   265
+nsetups =    45     netf    =    11
+npe     =     5     nps     =   535
+ncfn    =     0     ncfl    =     0
+
+======================================================================
+
+t = 2.79e+04   no. steps = 300   order = 4   stepsize = 4.41e+01
+c1 (bot.left/middle/top rt.) =    1.189e+07     1.588e+07     1.317e+07
+c2 (bot.left/middle/top rt.) =    3.375e+11     4.520e+11     3.743e+11
+
+
+Intermediate Statistics.. 
+
+lenrw   =  2689     leniw   =    53
+lenrwLS =  2454     leniwLS =    42
+nst     =   300
+nfe     =   391     nfeLS   =   337
+nni     =   388     nli     =   337
+nsetups =    54     netf    =    17
+npe     =     6     nps     =   676
+ncfn    =     0     ncfl    =     0
+
+======================================================================
+
+t = 3.74e+04   no. steps = 350   order = 5   stepsize = 1.02e+02
+c1 (bot.left/middle/top rt.) =    4.321e+02     5.131e+02     4.808e+02
+c2 (bot.left/middle/top rt.) =    3.387e+11     4.022e+11     3.769e+11
+
+
+Intermediate Statistics.. 
+
+lenrw   =  2689     leniw   =    53
+lenrwLS =  2454     leniwLS =    42
+nst     =   350
+nfe     =   461     nfeLS   =   428
+nni     =   458     nli     =   428
+nsetups =    62     netf    =    22
+npe     =     6     nps     =   837
+ncfn    =     0     ncfl    =     0
+
+======================================================================
+
+t = 4.98e+04   no. steps = 400   order = 4   stepsize = 3.15e+02
+c1 (bot.left/middle/top rt.) =   -3.654e-08    -4.295e-06    -6.514e-08
+c2 (bot.left/middle/top rt.) =    3.361e+11     4.418e+11     3.860e+11
+
+
+Intermediate Statistics.. 
+
+lenrw   =  2689     leniw   =    53
+lenrwLS =  2454     leniwLS =    42
+nst     =   400
+nfe     =   518     nfeLS   =   521
+nni     =   515     nli     =   521
+nsetups =    68     netf    =    23
+npe     =     7     nps     =   985
+ncfn    =     0     ncfl    =     0
+
+======================================================================
+
+t = 7.88e+04   no. steps = 450   order = 5   stepsize = 7.01e+02
+c1 (bot.left/middle/top rt.) =    6.689e-13     3.345e-11     1.269e-12
+c2 (bot.left/middle/top rt.) =    3.334e+11     6.542e+11     4.117e+11
+
+
+Intermediate Statistics.. 
+
+lenrw   =  2689     leniw   =    53
+lenrwLS =  2454     leniwLS =    42
+nst     =   450
+nfe     =   586     nfeLS   =   614
+nni     =   583     nli     =   614
+nsetups =    77     netf    =    27
+npe     =     8     nps     =  1143
+ncfn    =     0     ncfl    =     0
+
+======================================================================
+
+t = 8.64e+04   no. steps = 461   order = 5   stepsize = 7.01e+02
+c1 (bot.left/middle/top rt.) =    4.144e-14     2.072e-12     7.862e-14
+c2 (bot.left/middle/top rt.) =    3.352e+11     9.107e+11     4.163e+11
+
+
+Final Statistics.. 
+
+lenrw   =  2689     leniw   =    53
+lenrwLS =  2454     leniwLS =    42
+nst     =   461
+nfe     =   597     nfeLS   =   636
+nni     =   594     nli     =   636
+nsetups =    78     netf    =    27
+npe     =     8     nps     =  1176
+ncfn    =     0     ncfl    =     0
+
+======================================================================
+
+ --------- 
+| SPFGMR |
+ ---------
+ 
+2-species diurnal advection-diffusion problem
+
+t = 7.88e-01   no. steps = 50   order = 5   stepsize = 2.51e-02
+c1 (bot.left/middle/top rt.) =    2.151e+03     8.392e+03     2.151e+03
+c2 (bot.left/middle/top rt.) =    2.500e+11     9.756e+11     2.500e+11
+
+
+Intermediate Statistics.. 
+
+lenrw   =  2689     leniw   =    53
+lenrwLS =  3254     leniwLS =    46
+nst     =    50
+nfe     =    80     nfeLS   =    55
+nni     =    77     nli     =    55
+nsetups =    16     netf    =     6
+npe     =     1     nps     =    55
+ncfn    =     0     ncfl    =     0
+
+======================================================================
+
+t = 2.21e+00   no. steps = 100   order = 5   stepsize = 3.77e-02
+c1 (bot.left/middle/top rt.) =    4.149e-01     1.618e+00     4.149e-01
+c2 (bot.left/middle/top rt.) =    2.500e+11     9.757e+11     2.500e+11
+
+
+Intermediate Statistics.. 
+
+lenrw   =  2689     leniw   =    53
+lenrwLS =  3254     leniwLS =    46
+nst     =   100
+nfe     =   131     nfeLS   =   105
+nni     =   128     nli     =   105
+nsetups =    19     netf    =     6
+npe     =     2     nps     =   105
+ncfn    =     0     ncfl    =     0
+
+======================================================================
+
+t = 3.94e+03   no. steps = 150   order = 4   stepsize = 9.97e+01
+c1 (bot.left/middle/top rt.) =    8.547e-02     3.136e-01     8.929e-02
+c2 (bot.left/middle/top rt.) =    2.510e+11     9.210e+11     2.622e+11
+
+
+Intermediate Statistics.. 
+
+lenrw   =  2689     leniw   =    53
+lenrwLS =  3254     leniwLS =    46
+nst     =   150
+nfe     =   200     nfeLS   =   171
+nni     =   197     nli     =   171
+nsetups =    37     netf    =     9
+npe     =     3     nps     =   171
+ncfn    =     0     ncfl    =     0
+
+======================================================================
+
+t = 8.36e+03   no. steps = 200   order = 5   stepsize = 1.34e+02
+c1 (bot.left/middle/top rt.) =    6.996e+04     1.728e+05     7.519e+04
+c2 (bot.left/middle/top rt.) =    2.535e+11     6.262e+11     2.724e+11
+
+
+Intermediate Statistics.. 
+
+lenrw   =  2689     leniw   =    53
+lenrwLS =  3254     leniwLS =    46
+nst     =   200
+nfe     =   261     nfeLS   =   225
+nni     =   258     nli     =   225
+nsetups =    43     netf    =    12
+npe     =     4     nps     =   225
+ncfn    =     0     ncfl    =     0
+
+======================================================================
+
+t = 2.23e+04   no. steps = 250   order = 5   stepsize = 3.12e+02
+c1 (bot.left/middle/top rt.) =    2.698e+07     1.250e+07     2.967e+07
+c2 (bot.left/middle/top rt.) =    3.075e+11     1.307e+11     3.403e+11
+
+
+Intermediate Statistics.. 
+
+lenrw   =  2689     leniw   =    53
+lenrwLS =  3254     leniwLS =    46
+nst     =   250
+nfe     =   315     nfeLS   =   359
+nni     =   312     nli     =   359
+nsetups =    46     netf    =    12
+npe     =     5     nps     =   359
+ncfn    =     0     ncfl    =     0
+
+======================================================================
+
+t = 3.01e+04   no. steps = 300   order = 4   stepsize = 1.75e+02
+c1 (bot.left/middle/top rt.) =    5.017e+06     8.298e+06     5.570e+06
+c2 (bot.left/middle/top rt.) =    3.383e+11     5.600e+11     3.757e+11
+
+
+Intermediate Statistics.. 
+
+lenrw   =  2689     leniw   =    53
+lenrwLS =  3254     leniwLS =    46
+nst     =   300
+nfe     =   396     nfeLS   =   484
+nni     =   393     nli     =   484
+nsetups =    62     netf    =    20
+npe     =     6     nps     =   484
+ncfn    =     0     ncfl    =     0
+
+======================================================================
+
+t = 3.86e+04   no. steps = 350   order = 5   stepsize = 8.38e+01
+c1 (bot.left/middle/top rt.) =    4.314e+00     4.141e+00     4.806e+00
+c2 (bot.left/middle/top rt.) =    3.387e+11     3.251e+11     3.774e+11
+
+
+Intermediate Statistics.. 
+
+lenrw   =  2689     leniw   =    53
+lenrwLS =  3254     leniwLS =    46
+nst     =   350
+nfe     =   462     nfeLS   =   656
+nni     =   459     nli     =   656
+nsetups =    67     netf    =    24
+npe     =     6     nps     =   656
+ncfn    =     0     ncfl    =     0
+
+======================================================================
+
+t = 5.68e+04   no. steps = 400   order = 5   stepsize = 7.44e+02
+c1 (bot.left/middle/top rt.) =   -1.055e-12    -8.927e-09     2.668e-11
+c2 (bot.left/middle/top rt.) =    3.324e+11     9.373e+11     3.904e+11
+
+
+Intermediate Statistics.. 
+
+lenrw   =  2689     leniw   =    53
+lenrwLS =  3254     leniwLS =    46
+nst     =   400
+nfe     =   519     nfeLS   =   758
+nni     =   516     nli     =   758
+nsetups =    74     netf    =    25
+npe     =     7     nps     =   758
+ncfn    =     0     ncfl    =     0
+
+======================================================================
+
+t = 8.64e+04   no. steps = 439   order = 5   stepsize = 7.86e+02
+c1 (bot.left/middle/top rt.) =    5.687e-13    -9.924e-11    -1.149e-12
+c2 (bot.left/middle/top rt.) =    3.352e+11     9.106e+11     4.162e+11
+
+
+Final Statistics.. 
+
+lenrw   =  2689     leniw   =    53
+lenrwLS =  3254     leniwLS =    46
+nst     =   439
+nfe     =   565     nfeLS   =   858
+nni     =   562     nli     =   858
+nsetups =    77     netf    =    26
+npe     =     8     nps     =   858
+ncfn    =     0     ncfl    =     0
+
+======================================================================
+
+ ------- 
+| SPBCGS |
+ -------
+ 
+2-species diurnal advection-diffusion problem
+
+t = 7.88e-01   no. steps = 50   order = 5   stepsize = 2.51e-02
+c1 (bot.left/middle/top rt.) =    2.151e+03     8.392e+03     2.151e+03
+c2 (bot.left/middle/top rt.) =    2.500e+11     9.756e+11     2.500e+11
+
+
+Intermediate Statistics.. 
+
+lenrw   =  2689     leniw   =    53
+lenrwLS =  2202     leniwLS =    41
+nst     =    50
+nfe     =    80     nfeLS   =   110
+nni     =    77     nli     =    55
+nsetups =    16     netf    =     6
+npe     =     1     nps     =   165
+ncfn    =     0     ncfl    =     0
+
+t = 2.21e+00   no. steps = 100   order = 5   stepsize = 3.77e-02
+c1 (bot.left/middle/top rt.) =    4.149e-01     1.618e+00     4.149e-01
+c2 (bot.left/middle/top rt.) =    2.500e+11     9.757e+11     2.500e+11
+
+
+Intermediate Statistics.. 
+
+lenrw   =  2689     leniw   =    53
+lenrwLS =  2202     leniwLS =    41
+nst     =   100
+nfe     =   131     nfeLS   =   210
+nni     =   128     nli     =   105
+nsetups =    19     netf    =     6
+npe     =     2     nps     =   315
+ncfn    =     0     ncfl    =     0
+
+t = 3.88e+03   no. steps = 150   order = 4   stepsize = 1.45e+02
+c1 (bot.left/middle/top rt.) =    5.801e-02     2.135e-01     6.057e-02
+c2 (bot.left/middle/top rt.) =    2.509e+11     9.235e+11     2.620e+11
+
+
+Intermediate Statistics.. 
+
+lenrw   =  2689     leniw   =    53
+lenrwLS =  2202     leniwLS =    41
+nst     =   150
+nfe     =   197     nfeLS   =   312
+nni     =   194     nli     =   156
+nsetups =    34     netf    =     8
+npe     =     3     nps     =   470
+ncfn    =     0     ncfl    =     0
+
+t = 8.84e+03   no. steps = 200   order = 5   stepsize = 1.58e+02
+c1 (bot.left/middle/top rt.) =    1.311e+05     3.042e+05     1.413e+05
+c2 (bot.left/middle/top rt.) =    2.538e+11     5.888e+11     2.734e+11
+
+
+Intermediate Statistics.. 
+
+lenrw   =  2689     leniw   =    53
+lenrwLS =  2202     leniwLS =    41
+nst     =   200
+nfe     =   261     nfeLS   =   418
+nni     =   258     nli     =   209
+nsetups =    41     netf    =    11
+npe     =     4     nps     =   629
+ncfn    =     0     ncfl    =     0
+
+t = 2.36e+04   no. steps = 250   order = 5   stepsize = 4.34e+02
+c1 (bot.left/middle/top rt.) =    2.596e+07     1.592e+07     2.856e+07
+c2 (bot.left/middle/top rt.) =    3.194e+11     1.885e+11     3.533e+11
+
+
+Intermediate Statistics.. 
+
+lenrw   =  2689     leniw   =    53
+lenrwLS =  2202     leniwLS =    41
+nst     =   250
+nfe     =   318     nfeLS   =   520
+nni     =   315     nli     =   260
+nsetups =    46     netf    =    12
+npe     =     5     nps     =   788
+ncfn    =     0     ncfl    =     0
+
+t = 3.34e+04   no. steps = 300   order = 5   stepsize = 2.67e+02
+c1 (bot.left/middle/top rt.) =    5.232e+05     9.140e+05     5.813e+05
+c2 (bot.left/middle/top rt.) =    3.385e+11     5.914e+11     3.761e+11
+
+
+Intermediate Statistics.. 
+
+lenrw   =  2689     leniw   =    53
+lenrwLS =  2202     leniwLS =    41
+nst     =   300
+nfe     =   395     nfeLS   =   642
+nni     =   392     nli     =   321
+nsetups =    58     netf    =    20
+npe     =     5     nps     =   975
+ncfn    =     0     ncfl    =     0
+
+t = 3.92e+04   no. steps = 350   order = 5   stepsize = 9.48e+01
+c1 (bot.left/middle/top rt.) =    1.804e-01     1.542e-01     2.011e-01
+c2 (bot.left/middle/top rt.) =    3.387e+11     2.896e+11     3.776e+11
+
+
+Intermediate Statistics.. 
+
+lenrw   =  2689     leniw   =    53
+lenrwLS =  2202     leniwLS =    41
+nst     =   350
+nfe     =   460     nfeLS   =   752
+nni     =   457     nli     =   376
+nsetups =    63     netf    =    23
+npe     =     6     nps     =  1140
+ncfn    =     0     ncfl    =     0
+
+t = 5.86e+04   no. steps = 400   order = 5   stepsize = 5.96e+02
+c1 (bot.left/middle/top rt.) =    7.894e-14    -5.680e-12     1.260e-13
+c2 (bot.left/middle/top rt.) =    3.317e+11     9.872e+11     3.915e+11
+
+
+Intermediate Statistics.. 
+
+lenrw   =  2689     leniw   =    53
+lenrwLS =  2202     leniwLS =    41
+nst     =   400
+nfe     =   522     nfeLS   =   870
+nni     =   519     nli     =   435
+nsetups =    70     netf    =    25
+npe     =     7     nps     =  1312
+ncfn    =     0     ncfl    =     0
+
+t = 8.64e+04   no. steps = 447   order = 5   stepsize = 5.96e+02
+c1 (bot.left/middle/top rt.) =   -9.999e-27    -1.866e-22    -1.581e-27
+c2 (bot.left/middle/top rt.) =    3.352e+11     9.107e+11     4.163e+11
+
+
+Final Statistics.. 
+
+lenrw   =  2689     leniw   =    53
+lenrwLS =  2202     leniwLS =    41
+nst     =   447
+nfe     =   569     nfeLS   =   964
+nni     =   566     nli     =   482
+nsetups =    72     netf    =    25
+npe     =     8     nps     =  1453
+ncfn    =     0     ncfl    =     0
+
+ --------- 
+| SPTFQMR |
+ ---------
+ 
+2-species diurnal advection-diffusion problem
+
+t = 7.88e-01   no. steps = 50   order = 5   stepsize = 2.51e-02
+c1 (bot.left/middle/top rt.) =    2.151e+03     8.392e+03     2.151e+03
+c2 (bot.left/middle/top rt.) =    2.500e+11     9.756e+11     2.500e+11
+
+
+Intermediate Statistics.. 
+
+lenrw   =  2689     leniw   =    53
+lenrwLS =  2602     leniwLS =    43
+nst     =    50
+nfe     =    80     nfeLS   =   110
+nni     =    77     nli     =    55
+nsetups =    16     netf    =     6
+npe     =     1     nps     =   165
+ncfn    =     0     ncfl    =     0
+
+t = 2.21e+00   no. steps = 100   order = 5   stepsize = 3.77e-02
+c1 (bot.left/middle/top rt.) =    4.149e-01     1.618e+00     4.149e-01
+c2 (bot.left/middle/top rt.) =    2.500e+11     9.757e+11     2.500e+11
+
+
+Intermediate Statistics.. 
+
+lenrw   =  2689     leniw   =    53
+lenrwLS =  2602     leniwLS =    43
+nst     =   100
+nfe     =   131     nfeLS   =   210
+nni     =   128     nli     =   105
+nsetups =    19     netf    =     6
+npe     =     2     nps     =   315
+ncfn    =     0     ncfl    =     0
+
+t = 3.65e+03   no. steps = 150   order = 4   stepsize = 3.82e+01
+c1 (bot.left/middle/top rt.) =    1.067e-02     3.969e-02     1.111e-02
+c2 (bot.left/middle/top rt.) =    2.509e+11     9.336e+11     2.614e+11
+
+
+Intermediate Statistics.. 
+
+lenrw   =  2689     leniw   =    53
+lenrwLS =  2602     leniwLS =    43
+nst     =   150
+nfe     =   197     nfeLS   =   318
+nni     =   194     nli     =   156
+nsetups =    34     netf    =     8
+npe     =     3     nps     =   483
+ncfn    =     0     ncfl    =     0
+
+t = 5.70e+03   no. steps = 200   order = 3   stepsize = 3.36e+01
+c1 (bot.left/middle/top rt.) =    2.634e+02     8.596e+02     2.788e+02
+c2 (bot.left/middle/top rt.) =    2.518e+11     8.218e+11     2.666e+11
+
+
+Intermediate Statistics.. 
+
+lenrw   =  2689     leniw   =    53
+lenrwLS =  2602     leniwLS =    43
+nst     =   200
+nfe     =   259     nfeLS   =   422
+nni     =   256     nli     =   208
+nsetups =    38     netf    =    10
+npe     =     4     nps     =   639
+ncfn    =     0     ncfl    =     0
+
+t = 1.45e+04   no. steps = 250   order = 5   stepsize = 3.27e+02
+c1 (bot.left/middle/top rt.) =    6.824e+06     5.357e+06     7.483e+06
+c2 (bot.left/middle/top rt.) =    2.583e+11     2.022e+11     2.834e+11
+
+
+Intermediate Statistics.. 
+
+lenrw   =  2689     leniw   =    53
+lenrwLS =  2602     leniwLS =    43
+nst     =   250
+nfe     =   318     nfeLS   =   525
+nni     =   315     nli     =   259
+nsetups =    44     netf    =    11
+npe     =     5     nps     =   798
+ncfn    =     0     ncfl    =     0
+
+t = 2.94e+04   no. steps = 300   order = 5   stepsize = 3.39e+02
+c1 (bot.left/middle/top rt.) =    6.796e+06     1.069e+07     7.541e+06
+c2 (bot.left/middle/top rt.) =    3.382e+11     5.329e+11     3.754e+11
+
+
+Intermediate Statistics.. 
+
+lenrw   =  2689     leniw   =    53
+lenrwLS =  2602     leniwLS =    43
+nst     =   300
+nfe     =   382     nfeLS   =   671
+nni     =   379     nli     =   313
+nsetups =    51     netf    =    14
+npe     =     6     nps     =  1040
+ncfn    =     0     ncfl    =     0
+
+t = 3.87e+04   no. steps = 350   order = 5   stepsize = 9.42e+01
+c1 (bot.left/middle/top rt.) =    3.613e+00     3.443e+00     4.025e+00
+c2 (bot.left/middle/top rt.) =    3.387e+11     3.228e+11     3.774e+11
+
+
+Intermediate Statistics.. 
+
+lenrw   =  2689     leniw   =    53
+lenrwLS =  2602     leniwLS =    43
+nst     =   350
+nfe     =   448     nfeLS   =   840
+nni     =   445     nli     =   370
+nsetups =    55     netf    =    18
+npe     =     6     nps     =  1324
+ncfn    =     0     ncfl    =     0
+
+t = 5.67e+04   no. steps = 400   order = 5   stepsize = 6.79e+02
+c1 (bot.left/middle/top rt.) =    4.357e-09     5.640e-07     2.278e-09
+c2 (bot.left/middle/top rt.) =    3.324e+11     9.339e+11     3.904e+11
+
+
+Intermediate Statistics.. 
+
+lenrw   =  2689     leniw   =    53
+lenrwLS =  2602     leniwLS =    43
+nst     =   400
+nfe     =   505     nfeLS   =   987
+nni     =   502     nli     =   424
+nsetups =    62     netf    =    19
+npe     =     7     nps     =  1554
+ncfn    =     0     ncfl    =     0
+
+t = 8.64e+04   no. steps = 445   order = 5   stepsize = 6.89e+02
+c1 (bot.left/middle/top rt.) =   -1.673e-20    -1.128e-15    -3.081e-21
+c2 (bot.left/middle/top rt.) =    3.352e+11     9.106e+11     4.163e+11
+
+
+Final Statistics.. 
+
+lenrw   =  2689     leniw   =    53
+lenrwLS =  2602     leniwLS =    43
+nst     =   445
+nfe     =   558     nfeLS   =  1143
+nni     =   555     nli     =   477
+nsetups =    67     netf    =    21
+npe     =     8     nps     =  1805
+ncfn    =     0     ncfl    =     0
+
diff --git a/examples/cvode/serial/cvParticle_dns.c b/examples/cvode/serial/cvParticle_dns.c
new file mode 100644
index 0000000000..a5d8a29659
--- /dev/null
+++ b/examples/cvode/serial/cvParticle_dns.c
@@ -0,0 +1,620 @@
+/* -----------------------------------------------------------------------------
+ * Programmer(s): David J. Gardner @ LLNL
+ * -----------------------------------------------------------------------------
+ * Based on an example from Jean-Luc Fattebert @ ORNL
+ * -----------------------------------------------------------------------------
+ * SUNDIALS Copyright Start
+ * Copyright (c) 2002-2020, Lawrence Livermore National Security
+ * and Southern Methodist University.
+ * All rights reserved.
+ *
+ * See the top-level LICENSE and NOTICE files for details.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ * SUNDIALS Copyright End
+ * -----------------------------------------------------------------------------
+ * This example solves the equation for a particle moving counterclockwise with
+ * velocity alpha on the unit circle in the xy-plane. The ODE system is given by
+ *
+ *   x' = -alpha * y
+ *   y' =  alpha * x
+ *
+ * where x and y are subject to the constraint
+ *
+ *   x^2 + y^2 - 1 = 0
+ *
+ * with initial condition x = 1 and y = 0 at t = 0. The system has the analytic
+ * solution
+ *
+ *  x(t) = cos(alpha * t)
+ *  y(t) = sin(alpha * t)
+ *
+ * For a description of the command line options for this example run the
+ * program with the --help flag.
+ * ---------------------------------------------------------------------------*/
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <math.h>
+
+#include <cvode/cvode.h>               /* access to CVODE                 */
+#include <nvector/nvector_serial.h>    /* access to serial N_Vector       */
+#include <sunmatrix/sunmatrix_dense.h> /* access to dense SUNMatrix       */
+#include <sunlinsol/sunlinsol_dense.h> /* access to dense SUNLinearSolver */
+
+/* Precision specific formatting macros */
+#if defined(SUNDIALS_EXTENDED_PRECISION)
+#define GSYM "Lg"
+#define ESYM "Le"
+#define FSYM "Lf"
+#else
+#define GSYM "g"
+#define ESYM "e"
+#define FSYM "f"
+#endif
+
+/* Precision specific math function macros */
+#if defined(SUNDIALS_DOUBLE_PRECISION)
+#define SIN(x)   (sin((x)))
+#define COS(x)   (cos((x)))
+#define SQRT(x)  (sqrt((x)))
+#elif defined(SUNDIALS_SINGLE_PRECISION)
+#define SIN(x)   (sinf((x)))
+#define COS(x)   (cosf((x)))
+#define SQRT(x)  (sqrtf((x)))
+#elif defined(SUNDIALS_EXTENDED_PRECISION)
+#define SIN(x)   (sinl((x)))
+#define COS(x)   (cosl((x)))
+#define SQRT(x)  (sqrtl((x)))
+#endif
+
+/* Problem Constants */
+#define PI    RCONST(3.141592653589793238462643383279502884197169)
+#define ZERO  RCONST(0.0)
+#define ONE   RCONST(1.0)
+#define TWO   RCONST(2.0)
+
+/* User-defined data structure (problem parameters and run-time options) */
+typedef struct UserData_
+{
+  realtype alpha; /* particle velocity */
+
+  int      orbits; /* number of orbits */
+  realtype torbit; /* orbit time, set to 2 * pi / alpha */
+
+  realtype rtol; /* relative integration tolerance */
+  realtype atol; /* absolute integration tolerance */
+
+  int proj;    /* enable/disable solution projection */
+  int projerr; /* enable/disable error projection */
+
+  int tstop; /* use tstop mode (stop at output times, no interpolation) */
+  int nout;  /* number of outputs per orbit (0 = final time only) */
+
+} *UserData;
+
+/* Functions provided to CVODE: right-hand side, Jacobian, and projection */
+static int f(realtype t, N_Vector y, N_Vector ydot, void *user_data);
+static int Jac(realtype t, N_Vector y, N_Vector fy, SUNMatrix J,
+               void *user_data, N_Vector tmp1, N_Vector tmp2, N_Vector tmp3);
+static int Proj(realtype t, N_Vector ycur, N_Vector corr, realtype epsProj,
+                N_Vector err, void *user_data);
+
+/* Utility functions (InputHelp uses (void) so that it is a real prototype) */
+static int InitUserData(int *argc, char ***argv, UserData udata);
+static int PrintUserData(UserData udata);
+static void InputHelp(void);
+static int ComputeSolution(realtype t, N_Vector y, UserData udata);
+static int ComputeError(realtype t, N_Vector y, N_Vector e, realtype *ec,
+                        UserData udata);
+static int WriteOutput(realtype t, N_Vector y, N_Vector e, realtype ec,
+                       int screenfile, FILE *YFID, FILE *EFID);
+static int PrintStats(void *cvode_mem);
+static int check_retval(void *returnvalue, const char *funcname, int opt);
+
+
+/* -----------------------------------------------------------------------------
+ * Main Program
+ * ---------------------------------------------------------------------------*/
+
+int main(int argc, char* argv[])
+{
+  int      retval;          /* reusable return flag       */
+  int      out      = 0;    /* output counter             */
+  int      totalout = 0;    /* total number of outputs    */
+  realtype t        = ZERO; /* current integration time   */
+  realtype dtout    = ZERO; /* output spacing             */
+  realtype tout     = ZERO; /* next output time           */
+  realtype ec       = ZERO; /* constraint error           */
+  UserData udata    = NULL; /* user data structure        */
+
+  void            *cvode_mem = NULL; /* CVODE memory         */
+  N_Vector         y         = NULL; /* solution vector      */
+  realtype        *ydata     = NULL; /* solution vector data */
+  N_Vector         e         = NULL; /* error vector         */
+  SUNMatrix        A         = NULL; /* Jacobian matrix      */
+  SUNLinearSolver  LS        = NULL; /* linear solver        */
+
+  FILE *YFID = NULL; /* solution output file */
+  FILE *EFID = NULL; /* error output file    */
+
+  /* Allocate and initialize user data structure */
+  udata = (UserData) malloc(sizeof *udata);
+  if (check_retval((void *)udata, "malloc", 0)) return(1);
+
+  retval = InitUserData(&argc, &argv, udata);
+  if (check_retval(&retval, "InitUserData", 1)) return(1);
+
+  /* Create serial vector to store the solution */
+  y = N_VNew_Serial(2);
+  if (check_retval((void *)y, "N_VNew_Serial", 0)) return(1);
+
+  /* Set initial condition */
+  ydata    = N_VGetArrayPointer(y);
+  ydata[0] = ONE;
+  ydata[1] = ZERO;
+
+  /* Create serial vector to store the solution error */
+  e = N_VClone(y);
+  if (check_retval((void *)e, "N_VClone", 0)) return(1);
+
+  /* Set initial error */
+  N_VConst(ZERO, e);
+
+  /* Create CVODE memory */
+  cvode_mem = CVodeCreate(CV_BDF);
+  if (check_retval((void *)cvode_mem, "CVodeCreate", 0)) return(1);
+
+  /* Initialize CVODE */
+  retval = CVodeInit(cvode_mem, f, t, y);
+  if (check_retval(&retval, "CVodeInit", 1)) return(1);
+
+  /* Attach user-defined data structure to CVODE */
+  retval = CVodeSetUserData(cvode_mem, udata);
+  if (check_retval(&retval, "CVodeSetUserData", 1)) return(1);
+
+  /* Set integration tolerances */
+  retval = CVodeSStolerances(cvode_mem, udata->rtol, udata->atol);
+  if (check_retval(&retval, "CVodeSStolerances", 1)) return(1);
+
+  /* Create dense SUNMatrix for use in linear solves */
+  A = SUNDenseMatrix(2, 2);
+  if (check_retval((void *)A, "SUNDenseMatrix", 0)) return(1);
+
+  /* Create dense SUNLinearSolver object */
+  LS = SUNLinSol_Dense(y, A);
+  if (check_retval((void *)LS, "SUNLinSol_Dense", 0)) return(1);
+
+  /* Attach the matrix and linear solver to CVODE */
+  retval = CVodeSetLinearSolver(cvode_mem, LS, A);
+  if (check_retval(&retval, "CVodeSetLinearSolver", 1)) return(1);
+
+  /* Set a user-supplied Jacobian function */
+  retval = CVodeSetJacFn(cvode_mem, Jac);
+  if (check_retval(&retval, "CVodeSetJacFn", 1)) return(1);
+
+  /* Set a user-supplied projection function */
+  if (udata->proj)
+  {
+    retval = CVodeSetProjFn(cvode_mem, Proj);
+    if (check_retval(&retval, "CVodeSetProjFn", 1)) return(1);
+
+    retval = CVodeSetProjErrEst(cvode_mem, udata->projerr);
+    if (check_retval(&retval, "CVodeSetProjErrEst", 1)) return(1);
+  }
+
+  /* Set max steps between outputs */
+  retval = CVodeSetMaxNumSteps(cvode_mem, 100000);
+  if (check_retval(&retval, "CVodeSetMaxNumSteps", 1)) return(1);
+
+  /* Output problem setup */
+  retval = PrintUserData(udata);
+  if (check_retval(&retval, "PrintUserData", 1)) return(1);
+
+  /* Output initial condition (to the screen and, if requested, to disk) */
+  printf("\n     t            x              y");
+  printf("             err x          err y       err constr\n");
+  WriteOutput(t, y, e, ec, 0, NULL, NULL);
+
+  if (udata->nout > 0)
+  {
+    YFID = fopen("cvParticle_solution.txt","w");
+    if (check_retval((void *)YFID, "fopen", 0)) return(1);
+    EFID = fopen("cvParticle_error.txt","w");
+    if (check_retval((void *)EFID, "fopen", 0)) return(1);
+    WriteOutput(t, y, e, ec, 1, YFID, EFID);
+  }
+
+  /* Integrate in time and periodically output the solution and error */
+  if (udata->nout > 0)
+  {
+    totalout = udata->orbits * udata->nout;
+    dtout    = udata->torbit / udata->nout;
+  }
+  else
+  {
+    totalout = 1;
+    dtout    = udata->torbit * udata->orbits;
+  }
+  tout = dtout;
+
+  for (out = 0; out < totalout; out++)
+  {
+    /* Stop at output time (do not interpolate output) */
+    if (udata->tstop || udata->nout == 0)
+    {
+      retval = CVodeSetStopTime(cvode_mem, tout);
+      if (check_retval(&retval, "CVodeSetStopTime", 1)) return(1);
+    }
+
+    /* Advance in time */
+    retval = CVode(cvode_mem, tout, y, &t, CV_NORMAL);
+    if (check_retval(&retval, "CVode", 1)) break;
+
+    /* Output solution and error */
+    if (udata->nout > 0)
+    {
+      retval = ComputeError(t, y, e, &ec, udata);
+      if (check_retval(&retval, "ComputeError", 1)) break;
+      retval = WriteOutput(t, y, e, ec, 1, YFID, EFID);
+      if (check_retval(&retval, "WriteOutput", 1)) break;
+    }
+
+    /* Update output time; the last output lands exactly on the final time */
+    if (out < totalout - 2) tout += dtout;
+    else                    tout = udata->torbit * udata->orbits;
+  }
+
+  /* Close output files */
+  if (udata->nout > 0)
+  {
+    fclose(YFID);
+    fclose(EFID);
+  }
+
+  /* Stop here if the time loop ended early (the error was already reported) */
+  if (retval < 0) return(1);
+
+  /* Output final solution and error to screen */
+  retval = ComputeError(t, y, e, &ec, udata);
+  if (check_retval(&retval, "ComputeError", 1)) return(1);
+
+  retval = WriteOutput(t, y, e, ec, 0, NULL, NULL);
+  if (check_retval(&retval, "WriteOutput", 1)) return(1);
+
+  /* Print some final statistics */
+  PrintStats(cvode_mem);
+
+  /* Free memory */
+  N_VDestroy(y);
+  N_VDestroy(e);
+  SUNMatDestroy(A);
+  SUNLinSolFree(LS);
+  CVodeFree(&cvode_mem);
+  free(udata);
+
+  return(0);
+}
+
+
+/* -----------------------------------------------------------------------------
+ * Functions provided to CVODE
+ * ---------------------------------------------------------------------------*/
+
+
+/* Right-hand side of the ODE system: ydot = f(t,y) = (-alpha * y, alpha * x) */
+static int f(realtype t, N_Vector y, N_Vector ydot, void *user_data)
+{
+  const realtype  alpha = ((UserData) user_data)->alpha;
+  const realtype *state = N_VGetArrayPointer(y);
+  realtype       *rhs   = N_VGetArrayPointer(ydot);
+
+  rhs[0] = -alpha * state[1];
+  rhs[1] =  alpha * state[0];
+
+  return(0);
+}
+
+
+/* Compute the Jacobian of the right-hand side function, J(t,y) = df/dy */
+static int Jac(realtype t, N_Vector y, N_Vector fy, SUNMatrix J,
+               void *user_data, N_Vector tmp1, N_Vector tmp2, N_Vector tmp3)
+{
+  UserData udata = (UserData) user_data;
+
+  /* SM_ELEMENT_D(J,i,j) = d f_i / d y_j (dense data is stored column-wise) */
+  SM_ELEMENT_D(J,0,0) =  ZERO;
+  SM_ELEMENT_D(J,0,1) = -(udata->alpha);
+  SM_ELEMENT_D(J,1,0) =  (udata->alpha);
+  SM_ELEMENT_D(J,1,1) =  ZERO;
+
+  return(0);
+}
+
+/* Projection callback: map ycur onto the unit circle and return the change */
+static int Proj(realtype t, N_Vector ycur, N_Vector corr, realtype epsProj,
+                N_Vector err, void *user_data)
+{
+  realtype *pos   = N_VGetArrayPointer(ycur);
+  realtype *delta = N_VGetArrayPointer(corr);
+  realtype *est   = NULL;
+  realtype  px = pos[0], py = pos[1]; /* current position           */
+  realtype  radius, ux, uy;           /* distance, projected point  */
+  realtype  est0, est1;               /* incoming error estimate    */
+
+  /* radial distance of the current state from the origin */
+  radius = SQRT(px * px + py * py);
+
+  /* closest point on the unit circle */
+  ux = px / radius;
+  uy = py / radius;
+
+  /* corr = projected state - current state */
+  delta[0] = ux - px;
+  delta[1] = uy - py;
+
+  /* nothing more to do when no error estimate was passed in */
+  if (err == NULL) return(0);
+
+  /* project the error estimate in place; both new entries are computed
+     from the old ones before either is overwritten */
+  est  = N_VGetArrayPointer(err);
+  est0 = est[0];
+  est1 = est[1];
+
+  est[0] =  est0 * uy * uy - est1 * ux * uy;
+  est[1] = -est0 * ux * uy + est1 * ux * ux;
+
+  return(0);
+}
+
+
+/* -----------------------------------------------------------------------------
+ * Private helper functions
+ * ---------------------------------------------------------------------------*/
+
+static int InitUserData(int *argc, char ***argv, UserData udata)
+{
+  /* options that must be followed by a value (order matches the switch) */
+  static const char *valopts[] = { "--alpha", "--orbits", "--rtol", "--atol",
+                                   "--proj", "--projerr", "--nout" };
+  const int   nvalopts = (int) (sizeof(valopts) / sizeof(valopts[0]));
+  const char *opt      = NULL; /* option being parsed  */
+  const char *val      = NULL; /* value of that option */
+  int         arg_idx  = 1;
+  int         k;
+
+  /* set default values */
+  udata->alpha = ONE;
+
+  udata->orbits = 100;
+
+  udata->rtol = RCONST(1.0e-4);
+  udata->atol = RCONST(1.0e-9);
+
+  udata->proj    = 1;
+  udata->projerr = 0;
+
+  udata->tstop = 0;
+  udata->nout  = 0;
+
+  /* check for input args */
+  while (arg_idx < (*argc))
+  {
+    opt = (*argv)[arg_idx++];
+
+    /* options without a value */
+    if (strcmp(opt, "--tstop") == 0)
+    {
+      udata->tstop = 1;
+      continue;
+    }
+    if (strcmp(opt, "--help") == 0)
+    {
+      InputHelp();
+      return(-1);
+    }
+
+    /* look up the option among those that take a value */
+    for (k = 0; k < nvalopts; k++)
+      if (strcmp(opt, valopts[k]) == 0) break;
+    if (k == nvalopts)
+    {
+      fprintf(stderr, "ERROR: Invalid input %s\n", opt);
+      InputHelp();
+      return(-1);
+    }
+
+    /* argv[argc] is NULL, so a missing value must not reach atof/atoi */
+    if (arg_idx >= (*argc))
+    {
+      fprintf(stderr, "ERROR: Missing value for %s\n", opt);
+      InputHelp();
+      return(-1);
+    }
+    val = (*argv)[arg_idx++];
+
+    switch (k)
+    {
+    case 0:  udata->alpha   = atof(val); break;
+    case 1:  udata->orbits  = atoi(val); break;
+    case 2:  udata->rtol    = atof(val); break;
+    case 3:  udata->atol    = atof(val); break;
+    case 4:  udata->proj    = atoi(val); break;
+    case 5:  udata->projerr = atoi(val); break;
+    default: udata->nout    = atoi(val); break;
+    }
+  }
+
+  /* The orbit time depends on the (possibly updated) particle velocity */
+  udata->torbit = (TWO * PI) / udata->alpha;
+
+  /* If projection is disabled then disable error projection */
+  if (!(udata->proj)) udata->projerr = 0;
+
+  return(0);
+}
+
+static int PrintUserData(UserData udata)
+{
+  const char *rule = "---------------------------------------------";
+  if (!udata) return(-1); /* nothing to print without user data */
+  printf("\nParticle traveling on the unit circle example\n");
+  puts(rule);
+  printf("alpha      = %0.4" ESYM"\n", udata->alpha);
+  printf("num orbits = %d\n", udata->orbits);
+  puts(rule);
+  printf("rtol       = %" GSYM"\n", udata->rtol);
+  printf("atol       = %" GSYM"\n", udata->atol);
+  printf("proj sol   = %d\n", udata->proj);
+  printf("proj err   = %d\n", udata->projerr);
+  printf("nout       = %d\n", udata->nout);
+  printf("tstop      = %d\n", udata->tstop);
+  puts(rule);
+
+  return(0);
+}
+
+
+/* Print command line options */
+static void InputHelp(void)
+{
+  printf("\nCommand line options:\n");
+  printf("  --alpha <vel>      : particle velocity\n");
+  printf("  --orbits <orbits>  : number of orbits to perform\n");
+  printf("  --rtol <rtol>      : relative tolerance\n");
+  printf("  --atol <atol>      : absolute tolerance\n");
+  printf("  --proj <1 or 0>    : enable (1) / disable (0) projection\n");
+  printf("  --projerr <1 or 0> : enable (1) / disable (0) error projection\n");
+  printf("  --nout <nout>      : outputs per period\n");
+  printf("  --tstop            : stop at output time (do not interpolate)\n");
+  printf("  --help             : print this message\n");
+}
+
+
+/* Evaluate the analytic solution (cos(alpha t), sin(alpha t)) into y */
+static int ComputeSolution(realtype t, N_Vector y, UserData udata)
+{
+  const realtype angle = (udata->alpha) * t;
+  realtype      *exact = N_VGetArrayPointer(y);
+  exact[0] = COS(angle);
+  exact[1] = SIN(angle);
+
+  return(0);
+}
+
+
+/* Compute the error in the solution and constraint */
+static int ComputeError(realtype t, N_Vector y, N_Vector e, realtype *ec,
+                        UserData udata)
+{
+  realtype *ydata = N_VGetArrayPointer(y);
+  int retval;
+
+  /* solution error */
+  retval = ComputeSolution(t, e, udata);
+  if (check_retval(&retval, "ComputeSolution", 1)) return(1);
+  N_VLinearSum(ONE, y, -ONE, e, e);
+
+  /* constraint error */
+  *ec = ydata[0] * ydata[0] + ydata[1] * ydata[1] - ONE;
+
+  return(0);
+}
+
+/* Output the solution to the screen (screenfile == 0) or to disk (otherwise) */
+static int WriteOutput(realtype t, N_Vector y, N_Vector e, realtype ec,
+                       int screenfile, FILE* YFID, FILE* EFID)
+{
+  realtype *ydata = N_VGetArrayPointer(y);
+  realtype *edata = N_VGetArrayPointer(e);
+
+  if (screenfile == 0)
+  {
+    /* output solution and error to screen */
+    printf("%0.4" ESYM" %14.6" ESYM" %14.6" ESYM" %14.6" ESYM" %14.6" ESYM" %14.6" ESYM"\n",
+           t, ydata[0], ydata[1], edata[0], edata[1], ec);
+  }
+  else
+  {
+    /* check file pointers; negative so that check_retval(opt 1) flags it */
+    if (YFID == NULL || EFID == NULL) return(-1);
+
+    /* output solution to disk */
+    fprintf(YFID, "%24.16" ESYM" %24.16" ESYM" %24.16" ESYM"\n",
+            t, ydata[0], ydata[1]);
+
+    /* output error to disk */
+    fprintf(EFID,
+            "%24.16" ESYM" %24.16" ESYM" %24.16" ESYM" %24.16" ESYM"\n",
+            t, edata[0], edata[1], ec);
+  }
+
+  return(0);
+}
+
+
+/* Print final statistics (returns -1, printing nothing, if a query fails) */
+static int PrintStats(void *cvode_mem)
+{
+  int retval;
+  long int nst, nfe, nsetups, nje, nni, ncfn, netf;
+
+  retval = CVodeGetNumSteps(cvode_mem, &nst);
+  if (check_retval(&retval, "CVodeGetNumSteps", 1)) return(-1);
+  retval = CVodeGetNumRhsEvals(cvode_mem, &nfe);
+  if (check_retval(&retval, "CVodeGetNumRhsEvals", 1)) return(-1);
+  retval = CVodeGetNumLinSolvSetups(cvode_mem, &nsetups);
+  if (check_retval(&retval, "CVodeGetNumLinSolvSetups", 1)) return(-1);
+  retval = CVodeGetNumErrTestFails(cvode_mem, &netf);
+  if (check_retval(&retval, "CVodeGetNumErrTestFails", 1)) return(-1);
+  retval = CVodeGetNumNonlinSolvIters(cvode_mem, &nni);
+  if (check_retval(&retval, "CVodeGetNumNonlinSolvIters", 1)) return(-1);
+  retval = CVodeGetNumNonlinSolvConvFails(cvode_mem, &ncfn);
+  if (check_retval(&retval, "CVodeGetNumNonlinSolvConvFails", 1)) return(-1);
+
+  retval = CVodeGetNumJacEvals(cvode_mem, &nje);
+  if (check_retval(&retval, "CVodeGetNumJacEvals", 1)) return(-1);
+
+  printf("\nIntegration Statistics:\n");
+
+  printf("Number of steps taken = %-6ld\n", nst);
+  printf("Number of function evaluations = %-6ld\n", nfe);
+
+  printf("Number of linear solver setups = %-6ld\n", nsetups);
+  printf("Number of Jacobian evaluations = %-6ld\n", nje);
+
+  printf("Number of nonlinear solver iterations = %-6ld\n", nni);
+  printf("Number of convergence failures = %-6ld\n", ncfn);
+  printf("Number of error test failures = %-6ld\n", netf);
+
+  return(0);
+}
+
+/* Check function return value: returns 1 (after printing to stderr) when the
+   value signals a failure and 0 otherwise */
+static int check_retval(void *returnvalue, const char *funcname, int opt)
+{
+  int flag;
+
+  switch (opt)
+  {
+  case 0:
+    /* pointer check: NULL means the call did not return an object */
+    if (returnvalue != NULL) break;
+    fprintf(stderr, "\nERROR: %s() returned a NULL pointer\n\n",
+            funcname);
+    return(1);
+
+  case 1:
+    /* flag check: returnvalue points to an int, negative means failure */
+    flag = *((int *) returnvalue);
+    if (flag >= 0) break;
+    fprintf(stderr, "\nERROR: %s() returned = %d\n\n",
+            funcname, flag);
+    return(1);
+  }
+
+  return(0);
+}
diff --git a/examples/cvode/serial/cvParticle_dns.out b/examples/cvode/serial/cvParticle_dns.out
new file mode 100644
index 0000000000..a2639038fa
--- /dev/null
+++ b/examples/cvode/serial/cvParticle_dns.out
@@ -0,0 +1,26 @@
+
+Particle traveling on the unit circle example
+---------------------------------------------
+alpha      = 1.0000e+00
+num orbits = 100
+---------------------------------------------
+rtol       = 0.0001
+atol       = 1e-09
+proj sol   = 1
+proj err   = 0
+nout       = 0
+tstop      = 0
+---------------------------------------------
+
+     t            x              y             err x          err y       err constr
+0.0000e+00   1.000000e+00   0.000000e+00   0.000000e+00   0.000000e+00   0.000000e+00
+6.2832e+02   9.977240e-01   6.742947e-02  -2.275957e-03   6.742947e-02   2.220446e-16
+
+Integration Statistics:
+Number of steps taken = 4044  
+Number of function evaluations = 5831  
+Number of linear solver setups = 806   
+Number of Jacobian evaluations = 103   
+Number of nonlinear solver iterations = 5828  
+Number of convergence failures = 20    
+Number of error test failures = 329   
diff --git a/examples/cvode/serial/cvPendulum_dns.c b/examples/cvode/serial/cvPendulum_dns.c
new file mode 100644
index 0000000000..800fede1b2
--- /dev/null
+++ b/examples/cvode/serial/cvPendulum_dns.c
@@ -0,0 +1,771 @@
+/* -----------------------------------------------------------------------------
+ * Programmer(s): Radu Serban and David J. Gardner @ LLNL
+ * -----------------------------------------------------------------------------
+ * SUNDIALS Copyright Start
+ * Copyright (c) 2002-2020, Lawrence Livermore National Security
+ * and Southern Methodist University.
+ * All rights reserved.
+ *
+ * See the top-level LICENSE and NOTICE files for details.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ * SUNDIALS Copyright End
+ * -----------------------------------------------------------------------------
+ * This example solves a simple pendulum equation in Cartesian coordinates where
+ * the pendulum bob has mass 1 and is suspended from the origin with a rod of
+ * length 1. The governing equations are
+ *
+ * x'  = vx
+ * y'  = vy
+ * vx' = -x * T
+ * vy' = -y * T - g
+ *
+ * with the constraints
+ *
+ * x^2 + y^2 - 1 = 0
+ * x * vx + y * vy = 0
+ *
+ * where x and y are the pendulum bob position, vx and vy are the bob velocity
+ * in the x and y directions respectively, T is the tension in the rod, and
+ * g is acceleration due to gravity chosen such that the pendulum has period 2.
+ * The initial condition at t = 0 is x = 1, y = 0, vx = 0, and vy = 0.
+ *
+ * A reference solution is computed using the pendulum equation in terms of the
+ * angle between the x-axis and the pendulum rod, i.e., theta in [-pi, 0]. The
+ * governing equations are
+ *
+ * theta'  = vtheta
+ * vtheta' = -g * cos(theta)
+ *
+ * where theta is the angle from the x-axis, vtheta is the angular velocity, and
+ * g the same acceleration due to gravity from above. The initial condition at
+ * t = 0 is theta = 0 and vtheta = 0.
+ *
+ * The Cartesian formulation is run to a final time tf (default 30) with and
+ * without projection for various integration tolerances. The error in the
+ * position and velocity at tf compared to the reference solution, the error in
+ * the position constraint equation, and various integrator statistics are
+ * printed to the screen for each run.
+ *
+ * When projection is enabled a user-supplied function is used to project the
+ * position, velocity, and error to the constraint manifold.
+ *
+ * Optional command line inputs may be used to change the final simulation time
+ * (default 30), the initial tolerance (default 1e-5), the number of outputs
+ * (default 1), or disable error projection. Use the option --help for a list
+ * of the command line flags.
+ * ---------------------------------------------------------------------------*/
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <math.h>
+
+#include <cvode/cvode.h>               /* access to CVODE                 */
+#include <nvector/nvector_serial.h>    /* access to serial N_Vector       */
+#include <sunmatrix/sunmatrix_dense.h> /* access to dense SUNmatrix       */
+#include <sunlinsol/sunlinsol_dense.h> /* access to dense SUNLinearSolver */
+
+/* printf conversion specifiers for realtype values (extended precision adds the L length modifier) */
+#if defined(SUNDIALS_EXTENDED_PRECISION)
+#define GSYM "Lg"
+#define ESYM "Le"
+#define FSYM "Lf"
+#else
+#define GSYM "g"
+#define ESYM "e"
+#define FSYM "f"
+#endif
+
+/* Math wrappers mapping each precision setting to the matching C library variant (no fallback if none is set) */
+#if defined(SUNDIALS_DOUBLE_PRECISION)
+#define SIN(x)   (sin((x)))
+#define COS(x)   (cos((x)))
+#define SQRT(x)  (sqrt((x)))
+#define ABS(x)   (fabs((x)))
+#elif defined(SUNDIALS_SINGLE_PRECISION)
+#define SIN(x)   (sinf((x)))
+#define COS(x)   (cosf((x)))
+#define SQRT(x)  (sqrtf((x)))
+#define ABS(x)   (fabsf((x)))
+#elif defined(SUNDIALS_EXTENDED_PRECISION)
+#define SIN(x)   (sinl((x)))
+#define COS(x)   (cosl((x)))
+#define SQRT(x)  (sqrtl((x)))
+#define ABS(x)   (fabsl((x)))
+#endif
+
+/* Problem constants (GRAV is the gravitational acceleration g used by both RHS functions) */
+#define ZERO  RCONST(0.0)
+#define ONE   RCONST(1.0)
+#define GRAV  RCONST(13.750371636040745654980191559621114395801712)
+
+/* Functions provided to CVODE */
+static int fref(realtype t, N_Vector yy, N_Vector fy, void *f_data);
+
+static int f(realtype t, N_Vector yy, N_Vector fy, void *f_data);
+static int proj(realtype t, N_Vector yy, N_Vector corr,
+                realtype epsProj, N_Vector err, void *pdata);
+
+/* Functions to integrate the Cartesian and reference solutions */
+int GetSol(void *cvode_mem, N_Vector yy0, realtype tol, realtype tf,
+           int nout, booleantype proj, booleantype projerr, N_Vector yref);
+
+int RefSol(realtype tf, N_Vector yref, int nout);
+
+/* Utility functions */
+static int ReadInputs(int *argc, char ***argv, realtype *tol, realtype *tf,
+                      int *nout, booleantype *projerr);
+static void InputHelp();
+static int check_retval(void *returnvalue, const char *funcname, int opt);
+
+/* -----------------------------------------------------------------------------
+ * Main Program
+ * ---------------------------------------------------------------------------*/
+
+/* Compute the reference solution, then integrate the Cartesian system with and
+ * without projection for five tolerances, each ten times smaller than the last */
+int main(int argc, char* argv[])
+{
+  int         i;
+  int         retval;                   /* reusable return flag    */
+  int         nout    = 1;              /* number of outputs       */
+  realtype    tol     = RCONST(1.0e-5); /* integration tolerance   */
+  realtype    tf      = RCONST(30.0);   /* final integration time  */
+  booleantype projerr = SUNTRUE;        /* enable error projection */
+
+  void            *cvode_mem = NULL; /* CVODE memory              */
+  N_Vector         yy0       = NULL; /* initial condition vector  */
+  realtype        *yy0data   = NULL; /* vector data               */
+  N_Vector         yref      = NULL; /* reference solution vector */
+  SUNMatrix        A         = NULL; /* Jacobian matrix           */
+  SUNLinearSolver  LS        = NULL; /* linear solver             */
+
+  /* Read command line inputs */
+  retval = ReadInputs(&argc, &argv, &tol, &tf, &nout, &projerr);
+  if (check_retval(&retval, "ReadInputs", 1)) return(1);
+
+  /* Create the reference vector (RefSol writes into it) and fill it; nonzero is failure */
+  yref = N_VNew_Serial(4);
+  if (check_retval((void *)yref, "N_VNew_Serial", 0)) return(1);
+  retval = RefSol(tf, yref, nout);
+  if (check_retval(&retval, "RefSol", 1) || retval != 0) return(1);
+
+  /* Create serial vector to store the initial condition */
+  yy0 = N_VNew_Serial(4);
+  if (check_retval((void *)yy0, "N_VNew_Serial", 0)) return(1);
+
+  /* Set the initial condition values */
+  yy0data = N_VGetArrayPointer(yy0);
+  yy0data[0] = ONE;  /* x  */
+  yy0data[1] = ZERO; /* y  */
+  yy0data[2] = ZERO; /* xd */
+  yy0data[3] = ZERO; /* yd */
+
+  /* Create CVODE memory */
+  cvode_mem = CVodeCreate(CV_BDF);
+  if (check_retval((void *)cvode_mem, "CVodeCreate", 0)) return(1);
+
+  /* Initialize CVODE */
+  retval = CVodeInit(cvode_mem, f, ZERO, yy0);
+  if (check_retval(&retval, "CVodeInit", 1)) return(1);
+
+  /* Set integration tolerances */
+  retval = CVodeSStolerances(cvode_mem, tol, tol);
+  if (check_retval(&retval, "CVodeSStolerances", 1)) return(1);
+
+  /* Create dense SUNMatrix for use in linear solves */
+  A = SUNDenseMatrix(4, 4);
+  if(check_retval((void *)A, "SUNDenseMatrix", 0)) return(1);
+
+  /* Create dense SUNLinearSolver object */
+  LS = SUNLinSol_Dense(yy0, A);
+  if(check_retval((void *)LS, "SUNLinSol_Dense", 0)) return(1);
+
+  /* Attach the matrix and linear solver to CVODE */
+  retval = CVodeSetLinearSolver(cvode_mem, LS, A);
+  if(check_retval(&retval, "CVodeSetLinearSolver", 1)) return(1);
+
+  /* Set a user-supplied projection function */
+  retval = CVodeSetProjFn(cvode_mem, proj);
+  if(check_retval(&retval, "CVodeSetProjFn", 1)) return(1);
+
+  /* Set maximum number of steps between outputs */
+  retval = CVodeSetMaxNumSteps(cvode_mem, 50000);
+  if (check_retval(&retval, "CVodeSetMaxNumSteps", 1)) return(1);
+
+  /* Compute the solution with various tolerances */
+  for (i = 0; i < 5; i++) {
+    /* Output tolerance and output header for this run */
+    printf("\n\nTol = %8.2" ESYM"\n", tol);
+    printf("Project    x         y");
+    printf("         x'        y'     |     g      |    ");
+    printf("nst     rhs eval    setups (J eval)  |   cf   ef\n");
+
+    /* Compute solution with projection (any nonzero return is a failure) */
+    retval = GetSol(cvode_mem, yy0, tol, tf, nout, SUNTRUE, projerr, yref);
+    if (check_retval(&retval, "GetSol", 1) || retval != 0) return(1);
+
+    /* Compute solution without projection (any nonzero return is a failure) */
+    retval = GetSol(cvode_mem, yy0, tol, tf, nout, SUNFALSE, SUNFALSE, yref);
+    if (check_retval(&retval, "GetSol", 1) || retval != 0) return(1);
+
+    /* Reduce tolerance for next run */
+    tol /= RCONST(10.0);
+  }
+
+  /* Free memory */
+  N_VDestroy_Serial(yref);
+  N_VDestroy_Serial(yy0);
+  SUNMatDestroy(A);
+  SUNLinSolFree(LS);
+  CVodeFree(&cvode_mem);
+
+  return(0);
+}
+
+
+/* -----------------------------------------------------------------------------
+ * Functions to integrate the Cartesian and reference systems
+ * ---------------------------------------------------------------------------*/
+
+
+/* Integrate the Cartesian system from yy0 to tf with tolerance tol, write the
+ * solution to a tolerance-specific text file, and print errors against yref
+ * and integrator statistics. Returns 0 on success, a negative value otherwise. */
+int GetSol(void *cvode_mem, N_Vector yy0, realtype tol, realtype tf, int nout,
+           booleantype proj, booleantype projerr, N_Vector yref)
+{
+  char      outname[100];  /* output file name */
+  FILE     *FID    = NULL; /* output file      */
+  N_Vector  yy     = NULL; /* solution vector  */
+  realtype *yydata = NULL; /* vector data      */
+
+  int      retval; /* reusable return flag */
+  int      out;    /* output counter       */
+  realtype dtout;  /* output frequency     */
+  realtype tout;   /* output time          */
+  realtype t;      /* return time          */
+  realtype x, y;   /* position values      */
+  realtype xd, yd; /* velocity values      */
+  realtype g;      /* constraint value     */
+
+  /* Integrator stats */
+  long int nst, nfe, nsetups, nje, nfeLS, ncfn, netf;
+
+  /* Enable or disable projection (on failure return the negative flag) */
+  if (proj)
+  {
+    printf("  YES   ");
+    retval = CVodeSetProjFrequency(cvode_mem, 1);
+    if(check_retval(&retval, "CVodeSetProjFrequency", 1)) return(retval);
+
+    /* Enable or disable error projection */
+    retval = CVodeSetProjErrEst(cvode_mem, projerr);
+    if(check_retval(&retval, "CVodeSetProjErrEst", 1)) return(retval);
+  }
+  else
+  {
+    retval = CVodeSetProjFrequency(cvode_mem, 0);
+    if(check_retval(&retval, "CVodeSetProjFrequency", 1)) return(retval);
+    printf("  NO    ");
+  }
+
+  /* Create vector to store the solution */
+  yy = N_VNew_Serial(4);
+  if (check_retval((void *)yy, "N_VNew_Serial", 0)) return(-1);
+
+  /* Copy initial condition into solution vector and get its data pointer */
+  N_VScale(ONE, yy0, yy);
+  yydata = N_VGetArrayPointer(yy);
+
+  /* Reinitialize CVODE for this run */
+  retval = CVodeReInit(cvode_mem, ZERO, yy0);
+  if (check_retval(&retval, "CVodeReInit", 1))
+  {
+    N_VDestroy_Serial(yy);
+    return(retval);
+  }
+
+  /* Set integration tolerances for this run */
+  retval = CVodeSStolerances(cvode_mem, tol, tol);
+  if (check_retval(&retval, "CVodeSStolerances", 1))
+  {
+    N_VDestroy_Serial(yy);
+    return(retval);
+  }
+
+  /* Open output file (fprintf below must not be handed a NULL stream) */
+  if (proj)
+  {
+    sprintf(outname, "cvPendulum_dns_tol_%03.2" ESYM"_proj.txt", tol);
+  }
+  else
+  {
+    sprintf(outname, "cvPendulum_dns_tol_%03.2" ESYM".txt", tol);
+  }
+  FID = fopen(outname, "w");
+  if (check_retval((void *)FID, "fopen", 0))
+  {
+    N_VDestroy_Serial(yy);
+    return(-1);
+  }
+
+  /* Output initial condition */
+  fprintf(FID,
+          "%0.4" ESYM" %14.6" ESYM" %14.6" ESYM" %14.6" ESYM" %14.6" ESYM"\n",
+          ZERO, yydata[0], yydata[1], yydata[2], yydata[3]);
+
+  /* Integrate to tf and periodically output the solution */
+  dtout = tf / nout;
+  tout  = dtout;
+
+  for (out = 0; out < nout; out++)
+  {
+    /* Set stop time (do not interpolate output) */
+    retval = CVodeSetStopTime(cvode_mem, tout);
+    if (check_retval(&retval, "CVodeSetStopTime", 1))
+    {
+      N_VDestroy_Serial(yy);
+      fclose(FID);
+      return(retval);
+    }
+
+    /* Integrate to tout */
+    retval = CVode(cvode_mem, tout, yy, &t, CV_NORMAL);
+    if (check_retval(&retval, "CVode", 1))
+    {
+      N_VDestroy_Serial(yy);
+      fclose(FID);
+      return(retval);
+    }
+
+    /* Write output */
+    fprintf(FID,
+            "%0.4" ESYM" %14.6" ESYM" %14.6" ESYM" %14.6" ESYM" %14.6" ESYM"\n",
+            t, yydata[0], yydata[1], yydata[2], yydata[3]);
+
+    /* Update output time: the final interval ends exactly at tf rather than at
+       an accumulated multiple of dtout */
+    tout = (out + 2 < nout) ? tout + dtout : tf;
+  }
+
+  /* Close output file */
+  fclose(FID);
+
+  /* Compute the constraint violation */
+  x = yydata[0];
+  y = yydata[1];
+  g = ABS(x*x + y*y - ONE);
+
+  /* Compute the absolute error compared to the reference solution */
+  N_VLinearSum(ONE, yy, -ONE, yref, yy);
+  N_VAbs(yy, yy);
+
+  x  = yydata[0];
+  y  = yydata[1];
+  xd = yydata[2];
+  yd = yydata[3];
+
+  /* Output errors */
+  printf("%8.2" ESYM"  %8.2" ESYM"  %8.2" ESYM"  %8.2" ESYM"  |  %8.2" ESYM"  |",
+         x, y, xd, yd, g);
+
+  /* Free solution vector */
+  N_VDestroy_Serial(yy);
+
+  /* Get integrator stats */
+  retval = CVodeGetNumSteps(cvode_mem, &nst);
+  if (check_retval(&retval, "CVodeGetNumSteps", 1)) return(retval);
+
+  retval = CVodeGetNumRhsEvals(cvode_mem, &nfe);
+  if (check_retval(&retval, "CVodeGetNumRhsEvals", 1)) return(retval);
+
+  retval = CVodeGetNumLinSolvSetups(cvode_mem, &nsetups);
+  if (check_retval(&retval, "CVodeGetNumLinSolvSetups", 1)) return(retval);
+
+  retval = CVodeGetNumErrTestFails(cvode_mem, &netf);
+  if (check_retval(&retval, "CVodeGetNumErrTestFails", 1)) return(retval);
+
+  retval = CVodeGetNumNonlinSolvConvFails(cvode_mem, &ncfn);
+  if (check_retval(&retval, "CVodeGetNumNonlinSolvConvFails", 1)) return(retval);
+
+  retval = CVodeGetNumJacEvals(cvode_mem, &nje);
+  if (check_retval(&retval, "CVodeGetNumJacEvals", 1)) return(retval);
+
+  retval = CVodeGetNumLinRhsEvals(cvode_mem, &nfeLS);
+  if (check_retval(&retval, "CVodeGetNumLinRhsEvals", 1)) return(retval);
+
+  /* Output stats */
+  printf(" %6ld   %6ld+%-4ld     %4ld (%3ld)     |  %3ld  %3ld\n",
+         nst, nfe, nfeLS, nsetups, nje, ncfn, netf);
+
+  return(0);
+}
+
+
+/* Compute the reference solution using the angle formulation.
+ *
+ *   tf   - final integration time
+ *   yref - vector with (at least) 4 entries that receives the Cartesian
+ *          reference state (x, y, x', y') reached at the end of the run
+ *   nout - number of outputs written to cvPendulum_dns_ref.txt
+ *
+ * The angle system is integrated with rtol = atol = 1e-14.
+ *
+ * Returns 0 on success and -1 on any failure. Everything created here is
+ * released before returning, on the success path and on every failure path. */
+int RefSol(realtype tf, N_Vector yref, int nout)
+{
+  FILE            *FID       = NULL; /* output file     */
+  void            *cvode_mem = NULL; /* CVODE memory    */
+  N_Vector         yy        = NULL; /* solution vector */
+  realtype        *yydata    = NULL; /* vector data     */
+  SUNMatrix        A         = NULL; /* Jacobian matrix */
+  SUNLinearSolver  LS        = NULL; /* linear solver   */
+
+  int      status = -1;           /* value returned (0 on success) */
+  int      retval;                /* reusable return flag  */
+  int      out;                   /* output counter        */
+  realtype dtout;                 /* output frequency      */
+  realtype tout;                  /* output time           */
+  realtype t;                     /* return time           */
+  realtype th, thd;               /* theta and theta dot   */
+  realtype tol = RCONST(1.0e-14); /* integration tolerance */
+
+  /* Every failure below jumps to the cleanup label so that whatever has been
+     created so far is released and a negative value reaches the caller, which
+     only treats a return value < 0 as an error */
+
+  /* Create the solution vector */
+  yy = N_VNew_Serial(2);
+  if (check_retval((void *)yy, "N_VNew_Serial", 0)) goto cleanup;
+
+  /* Set the initial condition */
+  yydata = N_VGetArrayPointer(yy);
+
+  yydata[0] = ZERO; /* theta  */
+  yydata[1] = ZERO; /* theta' */
+
+  /* Create CVODE memory */
+  cvode_mem = CVodeCreate(CV_BDF);
+  if (check_retval((void *)cvode_mem, "CVodeCreate", 0)) goto cleanup;
+
+  /* Initialize CVODE */
+  retval = CVodeInit(cvode_mem, fref, ZERO, yy);
+  if (check_retval(&retval, "CVodeInit", 1)) goto cleanup;
+
+  /* Set integration tolerances */
+  retval = CVodeSStolerances(cvode_mem, tol, tol);
+  if (check_retval(&retval, "CVodeSStolerances", 1)) goto cleanup;
+
+  /* Create dense SUNMatrix for use in linear solves */
+  A = SUNDenseMatrix(2, 2);
+  if (check_retval((void *)A, "SUNDenseMatrix", 0)) goto cleanup;
+
+  /* Create dense SUNLinearSolver object */
+  LS = SUNLinSol_Dense(yy, A);
+  if (check_retval((void *)LS, "SUNLinSol_Dense", 0)) goto cleanup;
+
+  /* Attach the matrix and linear solver to CVODE */
+  retval = CVodeSetLinearSolver(cvode_mem, LS, A);
+  if (check_retval(&retval, "CVodeSetLinearSolver", 1)) goto cleanup;
+
+  /* Set CVODE optional inputs */
+  retval = CVodeSetMaxNumSteps(cvode_mem, 100000);
+  if (check_retval(&retval, "CVodeSetMaxNumSteps", 1)) goto cleanup;
+
+  /* Initial stop time; the loop below resets it to each output time */
+  retval = CVodeSetStopTime(cvode_mem, tf);
+  if (check_retval(&retval, "CVodeSetStopTime", 1)) goto cleanup;
+
+  /* Open output file (fprintf below must not be handed a NULL stream) */
+  FID = fopen("cvPendulum_dns_ref.txt", "w");
+  if (check_retval((void *)FID, "fopen", 0)) goto cleanup;
+
+  /* Output initial condition */
+  th  = yydata[0];
+  thd = yydata[1];
+  fprintf(FID,
+          "%0.4" ESYM" %14.6" ESYM" %14.6" ESYM" %14.6" ESYM" %14.6" ESYM"\n",
+          ZERO, COS(th), SIN(th), -thd * SIN(th), thd * COS(th));
+
+  /* Integrate to tf and periodically output the solution */
+  dtout = tf / nout;
+  tout  = dtout;
+
+  for (out = 0; out < nout; out++)
+  {
+    /* Set stop time (do not interpolate output) */
+    retval = CVodeSetStopTime(cvode_mem, tout);
+    if (check_retval(&retval, "CVodeSetStopTime", 1)) goto cleanup;
+
+    /* Integrate to tout */
+    retval = CVode(cvode_mem, tout, yy, &t, CV_NORMAL);
+    if (check_retval(&retval, "CVode", 1)) goto cleanup;
+
+    /* Write output */
+    th  = yydata[0];
+    thd = yydata[1];
+    fprintf(FID,
+            "%0.4" ESYM" %14.6" ESYM" %14.6" ESYM" %14.6" ESYM" %14.6" ESYM"\n",
+            t, COS(th), SIN(th), -thd * SIN(th), thd * COS(th));
+
+    /* Update output time: the final interval ends exactly at tf rather than
+       at an accumulated multiple of dtout (the old test only selected tf
+       after the last integration, where it was never used) */
+    tout = (out + 2 < nout) ? tout + dtout : tf;
+  }
+
+  /* Get solution components */
+  th  = yydata[0];
+  thd = yydata[1];
+
+  /* Convert to Cartesian reference solution */
+  yydata = N_VGetArrayPointer(yref);
+
+  yydata[0] = COS(th);
+  yydata[1] = SIN(th);
+  yydata[2] = -thd * SIN(th);
+  yydata[3] =  thd * COS(th);
+
+  /* All steps succeeded */
+  status = 0;
+
+cleanup:
+
+  /* Close the output file and free memory; each object is released only if
+     it was actually created before the jump here */
+  if (FID != NULL) fclose(FID);
+  if (yy  != NULL) N_VDestroy_Serial(yy);
+  if (A   != NULL) SUNMatDestroy(A);
+  if (LS  != NULL) SUNLinSolFree(LS);
+  if (cvode_mem != NULL) CVodeFree(&cvode_mem);
+
+  return(status);
+}
+
+
+/* -----------------------------------------------------------------------------
+ * Functions provided to CVODE
+ * ---------------------------------------------------------------------------*/
+
+
+/* ODE RHS function for the reference (angle) system:
+ * theta' = vtheta and vtheta' = -g * cos(theta). Always returns 0. */
+static int fref(realtype t, N_Vector yy, N_Vector fy, void *f_data)
+{
+  /* yy holds (theta, vtheta); fy receives their time derivatives */
+  realtype *state = N_VGetArrayPointer(yy);
+  realtype *rhs   = N_VGetArrayPointer(fy);
+
+  /* theta' is the angular velocity */
+  rhs[0] = state[1];
+  /* vtheta' = -g * cos(theta) */
+  rhs[1] = -GRAV * COS(state[0]);
+  return(0);
+}
+
+
+/* ODE RHS function for the Cartesian system:
+ *   x' = vx,  y' = vy,  vx' = -x * T,  vy' = -y * T - g
+ * with the tension evaluated as T = vx^2 + vy^2 - g * y. Always returns 0. */
+static int f(realtype t, N_Vector yy, N_Vector fy, void *f_data)
+{
+  realtype *state = N_VGetArrayPointer(yy); /* yy vector data */
+  realtype *rhs   = N_VGetArrayPointer(fy); /* fy vector data */
+
+  realtype px, py;  /* positions  */
+  realtype vx, vy;  /* velocities */
+  realtype tension; /* T above    */
+
+  /* Unpack the state */
+  px = state[0];
+  py = state[1];
+  vx = state[2];
+  vy = state[3];
+
+  /* Tension in the rod */
+  tension = vx * vx + vy * vy - GRAV * py;
+
+  /* Derivatives of the positions are the velocities */
+  rhs[0] = vx;
+  rhs[1] = vy;
+
+  /* Derivatives of the velocities */
+  rhs[2] = -px * tension;
+  rhs[3] = -py * tension - GRAV;
+
+  return(0);
+}
+
+
+/* Projection function supplied to CVODE. Stores in corr the correction that
+ * moves the position in yy onto the unit circle and removes the radial part of
+ * the velocity, and, when err is not NULL, overwrites err with the same
+ * projection applied to its position and velocity parts. Always returns 0. */
+static int proj(realtype t, N_Vector yy, N_Vector corr,
+                realtype epsProj, N_Vector err, void *pdata)
+{
+  realtype *state = NULL; /* yy vector data   */
+  realtype *delta = NULL; /* corr vector data */
+  realtype *evec  = NULL; /* err vector data  */
+
+  realtype px, py, px_p, py_p; /* positions: current and projected  */
+  realtype vx, vy, vx_p, vy_p; /* velocities: current and projected */
+
+  realtype ep0, ep1, ev0, ev1;         /* error components on entry  */
+  realtype ep0_p, ep1_p, ev0_p, ev1_p; /* projected error components */
+  realtype radius;                     /* norm of the position       */
+
+  /* Access the solution and correction arrays */
+  state = N_VGetArrayPointer(yy);
+  delta = N_VGetArrayPointer(corr);
+
+  /* Current position and velocity */
+  px = state[0];
+  py = state[1];
+  vx = state[2];
+  vy = state[3];
+
+  /* Scale the position back onto the unit circle */
+  radius = SQRT(px * px + py * py);
+
+  px_p = px / radius;
+  py_p = py / radius;
+
+  /* Project the velocity using the projected position (xp, yp):
+   *
+   *        +-               -+  +-    -+
+   *        |  yp*yp   -xp*yp |  |  vx  |
+   *  P v = |                 |  |      |
+   *        | -xp*yp    xp*xp |  |  vy  |
+   *        +-               -+  +-    -+
+   */
+
+  vx_p =   vx * py_p * py_p - vy * px_p * py_p;
+  vy_p = - vx * px_p * py_p + vy * px_p * px_p;
+
+  /* Corrections are the projected values minus the current ones */
+  delta[0] = px_p - px;
+  delta[1] = py_p - py;
+  delta[2] = vx_p - vx;
+  delta[3] = vy_p - vy;
+
+  /* Nothing more to do when no error vector was supplied */
+  if (err == NULL) return(0);
+
+  /* Apply P separately to the position part (entries 0, 1) and to the
+     velocity part (entries 2, 3) of err, then write the result back */
+  evec = N_VGetArrayPointer(err);
+
+  ep0 = evec[0];
+  ep1 = evec[1];
+  ev0 = evec[2];
+  ev1 = evec[3];
+
+  ep0_p =  py_p * py_p * ep0 - px_p * py_p * ep1;
+  ep1_p = -px_p * py_p * ep0 + px_p * px_p * ep1;
+
+  ev0_p =  py_p * py_p * ev0 - px_p * py_p * ev1;
+  ev1_p = -px_p * py_p * ev0 + px_p * px_p * ev1;
+
+  evec[0] = ep0_p;
+  evec[1] = ep1_p;
+  evec[2] = ev0_p;
+  evec[3] = ev1_p;
+
+  return(0);
+}
+
+
+/* -----------------------------------------------------------------------------
+ * Private helper functions
+ * ---------------------------------------------------------------------------*/
+
+
+/* Read command line inputs. Returns 0 on success and -1 if --help was given,
+ * an input is not recognized, or a flag that takes a value is the last
+ * argument (its value is missing). */
+static int ReadInputs(int *argc, char ***argv, realtype *tol, realtype *tf,
+                      int *nout, booleantype *projerr)
+{
+  int   arg_idx = 1;
+  char *flag; /* input currently being processed */
+
+  /* check for input args */
+  while (arg_idx < (*argc))
+  {
+    flag = (*argv)[arg_idx++];
+
+    if (strcmp(flag,"--noerrproj") == 0)
+    {
+      *projerr = SUNFALSE;
+    }
+    else if (strcmp(flag,"--help") == 0)
+    {
+      InputHelp();
+      return(-1);
+    }
+    else if (strcmp(flag,"--tol") && strcmp(flag,"--tf") && strcmp(flag,"--nout"))
+    {
+      fprintf(stderr, "ERROR: Invalid input %s",flag);
+      InputHelp();
+      return(-1);
+    }
+    else if (arg_idx >= (*argc))
+    {
+      /* value-taking flag with nothing after it: do not read past argv */
+      fprintf(stderr, "ERROR: Missing value for %s\n",flag);
+      InputHelp();
+      return(-1);
+    }
+    /* the flag is --tol, --tf, or --nout and the argument after it exists */
+    else if (strcmp(flag,"--tol") == 0) *tol  = atof((*argv)[arg_idx++]);
+    else if (strcmp(flag,"--tf") == 0)  *tf   = atof((*argv)[arg_idx++]);
+    else                                *nout = atoi((*argv)[arg_idx++]);
+  }
+
+  return(0);
+}
+
+
+/* Write the list of supported command line flags to stdout */
+static void InputHelp()
+{
+  /* one write of the concatenated help text; no format arguments are needed */
+  fputs("\nCommand line options:\n"
+        "  --tol <tol>      : relative and absolute tolerance\n"
+        "  --tf <time>      : final simulation time\n"
+        "  --nout <outputs> : number of outputs\n"
+        "  --noerrproj      : disable error projection\n",
+        stdout);
+}
+
+
+/* Check the result of a call and report a failure on stderr.
+ * opt 0: returnvalue is a returned pointer, NULL is a failure;
+ * opt 1: returnvalue points to an int flag, a negative value is a failure.
+ * Returns 1 when a failure was reported and 0 otherwise. */
+static int check_retval(void *returnvalue, const char *funcname, int opt)
+{
+  int flag;
+
+  switch (opt)
+  {
+  case 0:
+    /* pointer check: only NULL is reported */
+    if (returnvalue != NULL) break;
+    fprintf(stderr, "\nERROR: %s() returned NULL pointer\n\n", funcname);
+    return(1);
+
+  case 1:
+    /* flag check: only negative values are reported */
+    flag = *((int *) returnvalue);
+    if (flag >= 0) break;
+    fprintf(stderr, "\nERROR: %s() returned = %d\n\n", funcname, flag);
+    return(1);
+  }
+
+  return(0);
+}
diff --git a/examples/cvode/serial/cvPendulum_dns.out b/examples/cvode/serial/cvPendulum_dns.out
new file mode 100644
index 0000000000..6454c0815e
--- /dev/null
+++ b/examples/cvode/serial/cvPendulum_dns.out
@@ -0,0 +1,30 @@
+
+
+Tol = 1.00e-05
+Project    x         y         x'        y'     |     g      |    nst     rhs eval    setups (J eval)  |   cf   ef
+  YES   2.53e-06  2.25e-03  1.80e-04  8.01e-02  |  0.00e+00  |   1421     1663+96         96 ( 24)     |    0   11
+  NO    5.61e-01  8.98e-01  4.26e+00  2.10e+00  |  6.19e-04  |   1549     1757+112       114 ( 28)     |    0   40
+
+
+Tol = 1.00e-06
+Project    x         y         x'        y'     |     g      |    nst     rhs eval    setups (J eval)  |   cf   ef
+  YES   1.07e-07  4.64e-04  9.25e-06  1.99e-02  |  0.00e+00  |   2089     2336+140       120 ( 35)     |    0    5
+  NO    1.68e-03  5.74e-02  5.77e-02  1.00e+00  |  7.05e-05  |   2221     2502+156       167 ( 39)     |    0   39
+
+
+Tol = 1.00e-07
+Project    x         y         x'        y'     |     g      |    nst     rhs eval    setups (J eval)  |   cf   ef
+  YES   3.04e-09  7.79e-05  2.73e-07  3.51e-03  |  2.22e-16  |   3013     3293+204       170 ( 51)     |    0    5
+  NO    2.58e-04  3.45e-02  1.61e-02  1.11e+00  |  6.77e-04  |   3236     3526+228       202 ( 57)     |    0   38
+
+
+Tol = 1.00e-08
+Project    x         y         x'        y'     |     g      |    nst     rhs eval    setups (J eval)  |   cf   ef
+  YES   9.43e-11  1.37e-05  8.22e-09  5.99e-04  |  0.00e+00  |   4295     4728+292       236 ( 73)     |    0    6
+  NO    9.55e-06  8.52e-03  3.38e-04  1.95e-01  |  5.35e-05  |   4600     4962+312       260 ( 78)     |    0   16
+
+
+Tol = 1.00e-09
+Project    x         y         x'        y'     |     g      |    nst     rhs eval    setups (J eval)  |   cf   ef
+  YES   1.60e-12  1.79e-06  1.39e-10  7.79e-05  |  1.11e-16  |   6542     7056+436       346 (109)     |    0    5
+  NO    1.39e-06  1.70e-03  1.12e-04  3.90e-02  |  5.67e-06  |   6661     7123+448       348 (112)     |    0    2
diff --git a/examples/cvode/serial/plot_cvParticle.py b/examples/cvode/serial/plot_cvParticle.py
new file mode 100755
index 0000000000..88d657d47b
--- /dev/null
+++ b/examples/cvode/serial/plot_cvParticle.py
@@ -0,0 +1,112 @@
+#!/usr/bin/env python
+# ------------------------------------------------------------------------------
+# Programmer(s): David J. Gardner @ LLNL
+# ------------------------------------------------------------------------------
+# SUNDIALS Copyright Start
+# Copyright (c) 2002-2020, Lawrence Livermore National Security
+# and Southern Methodist University.
+# All rights reserved.
+#
+# See the top-level LICENSE and NOTICE files for details.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+# SUNDIALS Copyright End
+# ------------------------------------------------------------------------------
+# matplotlib-based plotting script for cvParticle_dns.c example
+# ------------------------------------------------------------------------------
+
+# imports
+import argparse
+import numpy as np
+import matplotlib.pyplot as plt
+
+# command line options
+parser = argparse.ArgumentParser(description='Plots cvParticle_dns output')
+parser.add_argument('--sfile', type=str,
+                    default='cvParticle_solution.txt',
+                    help='solution output file to read')
+parser.add_argument('--efile', type=str,
+                    default='cvParticle_error.txt',
+                    help='error output file to read')
+# no nargs here so that args.alpha is a float whether or not it is given
+parser.add_argument('--alpha', type=float, default=1.0,
+                    help='set a non-default alpha value')
+parser.add_argument('--slim', type=float, nargs=2,
+                    help='x and y limits for solution plot')
+parser.add_argument('--eylim', type=float, nargs=2,
+                    help='y limits for error plot')
+
+# parse inputs
+args = parser.parse_args()
+
+# load the solution history (columns: time, x position, y position)
+sol = np.loadtxt(args.sfile, dtype=np.double)
+
+sol_t = sol[:, 0]
+sol_x = sol[:, 1]
+sol_y = sol[:, 2]
+
+# unit circle, drawn as a dashed reference curve
+angle = np.linspace(0, np.pi*2, 10000)
+circle_x = np.cos(angle)
+circle_y = np.sin(angle)
+
+# figure 1: computed positions in the xy plane on top of the unit circle
+fig, ax = plt.subplots()
+ax.plot(circle_x, circle_y, color='black', linestyle='--')
+ax.scatter(sol_x, sol_y, color='red')
+
+# optional limits from --slim; the same pair is applied to both axes
+if args.slim:
+    ax.set_xlim((args.slim[0], args.slim[1]))
+    ax.set_ylim((args.slim[0], args.slim[1]))
+
+ax.set_xlabel('x')
+ax.set_ylabel('y')
+ax.set_title('Solution')
+ax.set_aspect('equal')
+
+# true solution evaluated at the output times
+true_x = np.cos(args.alpha * sol_t)
+true_y = np.sin(args.alpha * sol_t)
+
+# figure 2: computed (solid) and true (dashed) positions over time
+fig, ax = plt.subplots()
+ax.plot(sol_t, sol_x, linestyle='-', label='x')
+ax.plot(sol_t, true_x, linestyle='--', label='x true')
+ax.plot(sol_t, sol_y, linestyle='-', label='y')
+ax.plot(sol_t, true_y, linestyle='--', label='y true')
+
+ax.set_xlabel('t')
+ax.set_ylabel('position')
+ax.set_title('Particle Position Over Time')
+ax.legend(loc='lower right')
+
+# load the error history (columns: time, x error, y error, constraint error)
+err = np.loadtxt(args.efile, dtype=np.double)
+
+# only the magnitudes are plotted since the y axis is logarithmic
+err_t = err[:, 0]
+err_x = np.absolute(err[:, 1])
+err_y = np.absolute(err[:, 2])
+err_c = np.absolute(err[:, 3])
+
+# figure 3: error magnitudes over time
+fig, ax = plt.subplots()
+ax.semilogy(err_t, err_x, label='x err')
+ax.semilogy(err_t, err_y, label='y err')
+ax.semilogy(err_t, err_c, label='c err')
+
+if args.eylim:
+    ax.set_ylim((args.eylim[0], args.eylim[1]))
+
+ax.set_xlabel('time')
+ax.set_ylabel('error')
+ax.legend(loc='lower right')
+ax.set_title('Error in position and constraint')
+ax.grid()
+
+# display plots
+plt.show()
+
+##### end of script #####
diff --git a/examples/cvode/serial/plot_cvPendulum.py b/examples/cvode/serial/plot_cvPendulum.py
new file mode 100755
index 0000000000..954986132c
--- /dev/null
+++ b/examples/cvode/serial/plot_cvPendulum.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python
+# ------------------------------------------------------------------------------
+# Programmer(s): David J. Gardner @ LLNL
+# ------------------------------------------------------------------------------
+# SUNDIALS Copyright Start
+# Copyright (c) 2002-2020, Lawrence Livermore National Security
+# and Southern Methodist University.
+# All rights reserved.
+#
+# See the top-level LICENSE and NOTICE files for details.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+# SUNDIALS Copyright End
+# ------------------------------------------------------------------------------
+# matplotlib-based plotting script for cvPendulum_dns.c example
+# ------------------------------------------------------------------------------
+
+# imports
+import argparse
+import numpy as np
+import matplotlib.pyplot as plt
+
+# command line options: the solution file is a required positional argument
+parser = argparse.ArgumentParser(description='Plots cvPendulum_dns output')
+parser.add_argument('sfile', type=str, help='solution output file to read')
+args = parser.parse_args()
+
+# load the solution history written by the example
+sol = np.loadtxt(args.sfile, dtype=np.double)
+
+# columns: time, x position, y position, x velocity, y velocity
+t = sol[:, 0]
+x = sol[:, 1]
+y = sol[:, 2]
+vx = sol[:, 3]
+vy = sol[:, 4]
+
+# lower half of the unit circle, drawn as a dashed guide
+angle = np.linspace(np.pi, 2*np.pi, 10000)
+arc_x = np.cos(angle)
+arc_y = np.sin(angle)
+
+# keyword arguments shared by every dashed black guide line
+guide = dict(color='black', linestyle='--')
+
+# figure 1: pendulum positions in the xy plane
+fig, ax = plt.subplots()
+ax.axhline(y=0, **guide)
+ax.axvline(x=0, **guide)
+ax.plot(arc_x, arc_y, **guide)
+ax.scatter(x, y, color='red')
+
+ax.set_xlabel('x')
+ax.set_ylabel('y')
+ax.set_title('Pendulum')
+ax.set_aspect('equal')
+
+# figure 2: position components over time
+fig, ax = plt.subplots()
+ax.axhline(y=0, **guide)
+ax.plot(t, x, label='x')
+ax.plot(t, y, label='y')
+
+ax.set_xlabel('t')
+ax.set_ylabel('position')
+ax.set_title('Pendulum Position')
+ax.legend()
+
+# figure 3: velocity components over time
+fig, ax = plt.subplots()
+ax.axhline(y=0, **guide)
+ax.plot(t, vx, label='$v_x$')
+ax.plot(t, vy, label='$v_y$')
+
+ax.set_xlabel('t')
+ax.set_ylabel('velocity')
+ax.set_title('Pendulum Velocity')
+ax.legend()
+
+# display plots
+plt.show()
+
+##### end of script #####
diff --git a/examples/cvode/superludist/CMakeLists.txt b/examples/cvode/superludist/CMakeLists.txt
index f29d1bda1c..b16b991d39 100644
--- a/examples/cvode/superludist/CMakeLists.txt
+++ b/examples/cvode/superludist/CMakeLists.txt
@@ -45,6 +45,11 @@ else()
   set(SUNLS_LIB sundials_sunlinsolsuperludist_shared)
 endif()
 
+if(SUNDIALS_BUILD_PACKAGE_FUSED_KERNELS)
+  list(APPEND CVODE_LIB
+       sundials_cvode_fused_stubs_${LINK_LIBRARY_TYPE})
+endif()
+
 # Set-up linker flags and link libraries
 set(SUNDIALS_LIBS ${CVODE_LIB} ${NVECP_LIB} ${SUNMAT_LIB} ${SUNLS_LIB} ${EXTRA_LINK_LIBS})
 
@@ -98,6 +103,9 @@ if(EXAMPLES_INSTALL)
     "-lsundials_nvecparallel "
     "-lsundials_sunmatrixslunrloc "
     "-lsundials_sunlinsolsuperludist")
+  if(SUNDIALS_BUILD_PACKAGE_FUSED_KERNELS)
+    set(LIBS "-lsundials_cvode_fused_stubs ${LIBS}")
+  endif()
 
   examples2string(CVODE_examples EXAMPLES)
 
diff --git a/examples/cvodes/serial/CMakeLists.txt b/examples/cvodes/serial/CMakeLists.txt
index 6f19041c9b..ac03f65cb7 100644
--- a/examples/cvodes/serial/CMakeLists.txt
+++ b/examples/cvodes/serial/CMakeLists.txt
@@ -360,7 +360,7 @@ if(EXAMPLES_INSTALL)
       set(THREAD_LIBRARY_SLUMT "")
     endif()
   else()
-    set(EXAMPLES_SLUMTU "")
+    set(EXAMPLES_SLUMT "")
     set(THREAD_LIBRARY_SLUMT "")
   endif()
 
diff --git a/examples/nvector/cuda/CMakeLists.txt b/examples/nvector/cuda/CMakeLists.txt
index deb2ef7dc3..6a8b2dbfff 100644
--- a/examples/nvector/cuda/CMakeLists.txt
+++ b/examples/nvector/cuda/CMakeLists.txt
@@ -19,7 +19,9 @@
 
 # Examples using SUNDIALS cuda nvector
 set(nvector_cuda_examples
-  "test_nvector_cuda\;1000 0\;\;\;"
+  "test_nvector_cuda\;3 32 0\;\;\;"
+  "test_nvector_cuda\;500 128 0\;\;\;"
+  "test_nvector_cuda\;1000 -1 0\;\;\;"
   )
 
 # Dependencies for nvector examples
@@ -34,9 +36,9 @@ include_directories(. ..)
 # Specify libraries to link against (through the target that was used to
 # generate them) based on the value of the variable LINK_LIBRARY_TYPE
 if(LINK_LIBRARY_TYPE MATCHES "static")
-  set(NVECS_LIB sundials_nveccuda_static)
+  set(NVECS_LIB sundials_nveccuda_static sundials_nvecserial_static)
 else()
-  set(NVECS_LIB sundials_nveccuda_shared)
+  set(NVECS_LIB sundials_nveccuda_shared sundials_nvecserial_shared)
 endif()
 
 # Set-up linker flags and link libraries
diff --git a/examples/nvector/cuda/test_nvector_cuda.cu b/examples/nvector/cuda/test_nvector_cuda.cu
index 2e1287ad24..7dbd8af68c 100644
--- a/examples/nvector/cuda/test_nvector_cuda.cu
+++ b/examples/nvector/cuda/test_nvector_cuda.cu
@@ -20,13 +20,10 @@
 
 #include <sundials/sundials_math.h>
 #include <sundials/sundials_types.h>
+#include <nvector/nvector_serial.h>
 #include <nvector/nvector_cuda.h>
-#include <nvector/cuda/ThreadPartitioning.hpp>
 #include "test_nvector.h"
 
-
-using namespace suncudavec;
-
 /* private custom allocator functions */
 static void* sunalloc(size_t);
 static void sunfree(void* ptr);
@@ -34,9 +31,11 @@ static void sunfree(void* ptr);
 /* CUDA vector specific tests */
 static int Test_N_VMake_Cuda(N_Vector X, sunindextype length, int myid);
 static int Test_N_VMakeManaged_Cuda(N_Vector X, sunindextype length, int myid);
+static int Test_N_VMakeWithManagedAllocator_Cuda(sunindextype length, int myid);
 
-/* CUDA vector can use unmanaged or managed memory */
+/* CUDA vector variants */
 enum mem_type { UNMANAGED, MANAGED, CUSTOM };
+enum pol_type { DEFAULT_POL, DEFAULT_POL_W_STREAM, GRID_STRIDE };
 
 /* ----------------------------------------------------------------------
  * Main NVector Testing Routine
@@ -48,202 +47,298 @@ int main(int argc, char *argv[])
   sunindextype length;            /* vector length             */
   N_Vector     U, V, X, Y, Z;     /* test vectors              */
   int          print_timing;      /* turn timing on/off        */
-  int          i;
+  int          threadsPerBlock;   /* cuda block size           */
+  cudaStream_t stream;            /* cuda stream               */
+  int          memtype, policy;
+
 
   /* check input and set vector length */
-  if (argc < 3){
-    printf("ERROR: TWO (2) Inputs required: vector length, print timing \n");
+  if (argc < 4){ 
+    printf("ERROR: THREE (3) Inputs required: vector length, CUDA threads per block (-1 for default), print timing \n");
     return(-1);
   }
 
   length = (sunindextype) atol(argv[1]);
   if (length <= 0) {
-    printf("ERROR: length of vector must be a positive integer \n");
+    printf("ERROR: length of vector must be a positive integer\n");
     return(-1);
   }
 
-  print_timing = atoi(argv[2]);
-  SetTiming(print_timing, 0);
-
-  /* test with unmanaged and managed memory */
-  for (i=UNMANAGED; i<=CUSTOM; ++i) {
-    if (i==UNMANAGED) {
-      printf("Testing CUDA N_Vector \n");
-    } else if (i==MANAGED) {
-      printf("\nTesting CUDA N_Vector with managed memory \n");
-    } else {
-      printf("\nTesting CUDA N_Vector with custom allocator \n");
-    }
-    printf("Vector length %ld \n\n", (long int) length);
-
-    /* Create new vectors */
-    if (i == UNMANAGED)    X = N_VNew_Cuda(length);
-    else if (i == MANAGED) X = N_VNewManaged_Cuda(length);
-    else                   X = N_VMakeWithManagedAllocator_Cuda(length, sunalloc, sunfree);
-    if (X == NULL) {
-      printf("FAIL: Unable to create a new vector \n\n");
-      return(1);
-    }
-
-    /* Check vector ID */
-    fails += Test_N_VGetVectorID(X, SUNDIALS_NVEC_CUDA, 0);
-
-    /* Check vector length */
-    fails += Test_N_VGetLength(X, 0);
-
-    /* Check vector communicator */
-    fails += Test_N_VGetCommunicator(X, NULL, 0);
-
-    /* Test clone functions */
-    fails += Test_N_VCloneEmpty(X, 0);
-    fails += Test_N_VClone(X, length, 0);
-    fails += Test_N_VCloneEmptyVectorArray(5, X, 0);
-    fails += Test_N_VCloneVectorArray(5, X, length, 0);
-
-    /* Clone additional vectors for testing */
-    Y = N_VClone(X);
-    if (Y == NULL) {
-      N_VDestroy(X);
-      printf("FAIL: Unable to create a new vector \n\n");
-      return(1);
-    }
+  threadsPerBlock = atoi(argv[2]);  /* -1 requests the default block size */
+  if (threadsPerBlock != -1 && (threadsPerBlock <= 0 || threadsPerBlock % 32)) {
+    printf("ERROR: CUDA threads per block must be -1 to use the default or a positive multiple of 32\n");
+    return(-1);
+  }
 
-    Z = N_VClone(X);
-    if (Z == NULL) {
-      N_VDestroy(X);
-      N_VDestroy(Y);
-      printf("FAIL: Unable to create a new vector \n\n");
-      return(1);
-    }
+  print_timing = atoi(argv[3]);
+  SetTiming(print_timing, 0);
 
-    /* Standard vector operation tests */
-    printf("\nTesting standard vector operations:\n\n");
-
-    fails += Test_N_VConst(X, length, 0);
-    fails += Test_N_VLinearSum(X, Y, Z, length, 0);
-    fails += Test_N_VProd(X, Y, Z, length, 0);
-    fails += Test_N_VDiv(X, Y, Z, length, 0);
-    fails += Test_N_VScale(X, Z, length, 0);
-    fails += Test_N_VAbs(X, Z, length, 0);
-    fails += Test_N_VInv(X, Z, length, 0);
-    fails += Test_N_VAddConst(X, Z, length, 0);
-    fails += Test_N_VDotProd(X, Y, length, 0);
-    fails += Test_N_VMaxNorm(X, length, 0);
-    fails += Test_N_VWrmsNorm(X, Y, length, 0);
-    fails += Test_N_VWrmsNormMask(X, Y, Z, length, 0);
-    fails += Test_N_VMin(X, length, 0);
-    fails += Test_N_VWL2Norm(X, Y, length, 0);
-    fails += Test_N_VL1Norm(X, length, 0);
-    fails += Test_N_VCompare(X, Z, length, 0);
-    fails += Test_N_VInvTest(X, Z, length, 0);
-    fails += Test_N_VConstrMask(X, Y, Z, length, 0);
-    fails += Test_N_VMinQuotient(X, Y, length, 0);
-
-    /* Fused and vector array operations tests (disabled) */
-    printf("\nTesting fused and vector array operations (disabled):\n\n");
-
-    /* create vector and disable all fused and vector array operations */
-    if (i == UNMANAGED)    U = N_VNew_Cuda(length);
-    else if (i == MANAGED) U = N_VNewManaged_Cuda(length);
-    else                   U = N_VMakeWithManagedAllocator_Cuda(length, sunalloc, sunfree);
-    if (U == NULL) {
-      printf("FAIL: Unable to create a new vector \n\n");
-      return(1);
-    }
-    retval = N_VEnableFusedOps_Cuda(U, SUNFALSE);
-    if (retval != 0) {
-      N_VDestroy(X);
-      N_VDestroy(Y);
-      N_VDestroy(Z);
-      printf("FAIL: Unable to create a new vector \n\n");
-      return(1);
+  /* test with all policy variants */
+  for (policy=DEFAULT_POL; policy<=GRID_STRIDE; ++policy) {
+    int actualThreadsPerBlock = (threadsPerBlock == -1) ? 256 : threadsPerBlock;
+    SUNCudaExecPolicy* stream_exec_policy = NULL;
+    SUNCudaExecPolicy* reduce_exec_policy = NULL;
+    cudaStreamCreate(&stream);
+
+    if (policy == DEFAULT_POL_W_STREAM) {
+      stream_exec_policy = new SUNCudaThreadDirectExecPolicy(actualThreadsPerBlock, stream);
+      reduce_exec_policy = new SUNCudaBlockReduceExecPolicy(actualThreadsPerBlock, 0, stream);
+    } else if (policy == GRID_STRIDE) {
+      stream_exec_policy = new SUNCudaGridStrideExecPolicy(actualThreadsPerBlock, 1);
+      reduce_exec_policy = new SUNCudaBlockReduceExecPolicy(actualThreadsPerBlock, 1);
     }
 
-    /* fused operations */
-    fails += Test_N_VLinearCombination(U, length, 0);
-    fails += Test_N_VScaleAddMulti(U, length, 0);
-    fails += Test_N_VDotProdMulti(U, length, 0);
-
-    /* vector array operations */
-    fails += Test_N_VLinearSumVectorArray(U, length, 0);
-    fails += Test_N_VScaleVectorArray(U, length, 0);
-    fails += Test_N_VConstVectorArray(U, length, 0);
-    fails += Test_N_VWrmsNormVectorArray(U, length, 0);
-    fails += Test_N_VWrmsNormMaskVectorArray(U, length, 0);
-    fails += Test_N_VScaleAddMultiVectorArray(U, length, 0);
-    fails += Test_N_VLinearCombinationVectorArray(U, length, 0);
-
-    /* Fused and vector array operations tests (enabled) */
-    printf("\nTesting fused and vector array operations (enabled):\n\n");
-
-    /* create vector and enable all fused and vector array operations */
-    if (i == UNMANAGED)    V = N_VNew_Cuda(length);
-    else if (i == MANAGED) V = N_VNewManaged_Cuda(length);
-    else                   V = N_VMakeWithManagedAllocator_Cuda(length, sunalloc, sunfree);
-    retval = N_VEnableFusedOps_Cuda(V, SUNTRUE);
-    if (V == NULL) {
-      printf("FAIL: Unable to create a new vector \n\n");
-      return(1);
-    }
-    if (retval != 0) {
+    /* test with all memory variants */
+    for (memtype=UNMANAGED; memtype<=CUSTOM; ++memtype) {
+      printf("=====> Beginning setup\n\n");
+
+      if (memtype==UNMANAGED) {
+        printf("Testing CUDA N_Vector, policy %d\n", policy);
+      } else if (memtype==MANAGED) {
+        printf("Testing CUDA N_Vector with managed memory, policy %d\n", policy);
+      } else if (memtype==CUSTOM) {
+        printf("Testing CUDA N_Vector with custom allocator, policy %d\n", policy);
+      }
+      printf("Vector length: %ld \n", (long int) length);
+
+      /* Create new vectors */
+      if (memtype == UNMANAGED)    X = N_VNew_Cuda(length);
+      else if (memtype == MANAGED) X = N_VNewManaged_Cuda(length);
+      else if (memtype == CUSTOM)  X = N_VMakeWithManagedAllocator_Cuda(length, sunalloc, sunfree);
+      if (X == NULL) {
+        delete stream_exec_policy;
+        delete reduce_exec_policy;
+        printf("FAIL: Unable to create a new vector \n\n");
+        return(1);
+      }
+
+      if (stream_exec_policy != NULL && reduce_exec_policy != NULL) {
+        if (N_VSetKernelExecPolicy_Cuda(X, stream_exec_policy, reduce_exec_policy)) {
+          N_VDestroy(X);
+          delete stream_exec_policy;
+          delete reduce_exec_policy;
+          printf("FAIL: Unable to set kernel execution policy \n\n");
+          return(1);
+        }
+        printf("Using non-default kernel execution policy\n");
+        printf("Threads per block: %d\n\n", actualThreadsPerBlock);
+      }
+
+      /* Fill vector with uniform random data in [-1,1] */
+      realtype* xdata = N_VGetHostArrayPointer_Cuda(X);
+      for (sunindextype j=0; j<length; j++)
+        xdata[j] = ((realtype) rand() / (realtype) RAND_MAX)*2-1;
+      N_VCopyToDevice_Cuda(X);
+
+      /* Clone additional vectors for testing */
+      Y = N_VClone(X);
+      if (Y == NULL) {
+        N_VDestroy(X);
+        printf("FAIL: Unable to create a new vector \n\n");
+        delete stream_exec_policy;
+        delete reduce_exec_policy;
+        return(1);
+      }
+
+      Z = N_VClone(X);
+      if (Z == NULL) {
+        N_VDestroy(X);
+        N_VDestroy(Y);
+        delete stream_exec_policy;
+        delete reduce_exec_policy;
+        printf("FAIL: Unable to create a new vector \n\n");
+        return(1);
+      }
+
+      /* Fill vectors with uniform random data in [-1,1] */
+      realtype* ydata = N_VGetHostArrayPointer_Cuda(Y);
+      realtype* zdata = N_VGetHostArrayPointer_Cuda(Z);
+      for (sunindextype j=0; j<length; j++) {
+        ydata[j] = ((realtype) rand() / (realtype) RAND_MAX)*2-1;
+        zdata[j] = ((realtype) rand() / (realtype) RAND_MAX)*2-1;
+      }
+      N_VCopyToDevice_Cuda(Y);
+      N_VCopyToDevice_Cuda(Z);
+
+      printf("=====> Setup complete\n");
+      printf("=====> Beginning tests\n\n");
+
+      /* Standard vector operation tests */
+      printf("\nTesting standard vector operations:\n\n");
+
+      /* Check vector ID */
+      fails += Test_N_VGetVectorID(X, SUNDIALS_NVEC_CUDA, 0);
+
+      /* Check vector length */
+      fails += Test_N_VGetLength(X, 0);
+
+      /* Check vector communicator */
+      fails += Test_N_VGetCommunicator(X, NULL, 0);
+
+      /* Test clone functions */
+      fails += Test_N_VCloneEmpty(X, 0);
+      fails += Test_N_VClone(X, length, 0);
+      fails += Test_N_VCloneEmptyVectorArray(5, X, 0);
+      fails += Test_N_VCloneVectorArray(5, X, length, 0);
+
+      /* Test vector math kernels */
+      fails += Test_N_VConst(X, length, 0);
+      fails += Test_N_VLinearSum(X, Y, Z, length, 0);
+      fails += Test_N_VProd(X, Y, Z, length, 0);
+      fails += Test_N_VDiv(X, Y, Z, length, 0);
+      fails += Test_N_VScale(X, Z, length, 0);
+      fails += Test_N_VAbs(X, Z, length, 0);
+      fails += Test_N_VInv(X, Z, length, 0);
+      fails += Test_N_VAddConst(X, Z, length, 0);
+      fails += Test_N_VDotProd(X, Y, length, 0);
+      fails += Test_N_VMaxNorm(X, length, 0);
+      fails += Test_N_VWrmsNorm(X, Y, length, 0);
+      fails += Test_N_VWrmsNormMask(X, Y, Z, length, 0);
+      fails += Test_N_VMin(X, length, 0);
+      fails += Test_N_VWL2Norm(X, Y, length, 0);
+      fails += Test_N_VL1Norm(X, length, 0);
+      if (length >= 3) fails += Test_N_VCompare(X, Z, length, 0);
+      fails += Test_N_VInvTest(X, Z, length, 0);
+      if (length >= 7) fails += Test_N_VConstrMask(X, Y, Z, length, 0);
+      fails += Test_N_VMinQuotient(X, Y, length, 0);
+
+      /* Fused and vector array operations tests (disabled) */
+      printf("\nTesting fused and vector array operations (disabled):\n\n");
+
+      /* create vector and disable all fused and vector array operations */
+      if (memtype == UNMANAGED)    U = N_VNew_Cuda(length);
+      else if (memtype == MANAGED) U = N_VNewManaged_Cuda(length);
+      else                         U = N_VMakeWithManagedAllocator_Cuda(length, sunalloc, sunfree);
+      if (U == NULL) {
+        N_VDestroy(X);
+        N_VDestroy(Y);
+        N_VDestroy(Z);
+        delete stream_exec_policy;
+        delete reduce_exec_policy;
+        printf("FAIL: Unable to create a new vector \n\n");
+        return(1);
+      }
+      if (N_VEnableFusedOps_Cuda(U, SUNFALSE) != 0) {
+        N_VDestroy(X);
+        N_VDestroy(Y);
+        N_VDestroy(Z);
+        N_VDestroy(U);
+        delete stream_exec_policy;
+        delete reduce_exec_policy;
+        printf("FAIL: Unable to disable fused and vector array operations \n\n");
+        return(1);
+      }
+
+      /* fused operations */
+      fails += Test_N_VLinearCombination(U, length, 0);
+      fails += Test_N_VScaleAddMulti(U, length, 0);
+      fails += Test_N_VDotProdMulti(U, length, 0);
+
+      /* vector array operations */
+      fails += Test_N_VLinearSumVectorArray(U, length, 0);
+      fails += Test_N_VScaleVectorArray(U, length, 0);
+      fails += Test_N_VConstVectorArray(U, length, 0);
+      fails += Test_N_VWrmsNormVectorArray(U, length, 0);
+      fails += Test_N_VWrmsNormMaskVectorArray(U, length, 0);
+      fails += Test_N_VScaleAddMultiVectorArray(U, length, 0);
+      fails += Test_N_VLinearCombinationVectorArray(U, length, 0);
+
+      /* Fused and vector array operations tests (enabled) */
+      printf("\nTesting fused and vector array operations (enabled):\n\n");
+
+      /* create vector and enable all fused and vector array operations */
+      if (memtype == UNMANAGED)    V = N_VNew_Cuda(length);
+      else if (memtype == MANAGED) V = N_VNewManaged_Cuda(length);
+      else                         V = N_VMakeWithManagedAllocator_Cuda(length, sunalloc, sunfree);
+      if (V == NULL) {
+        N_VDestroy(X);
+        N_VDestroy(Y);
+        N_VDestroy(Z);
+        N_VDestroy(U);
+        delete stream_exec_policy;
+        delete reduce_exec_policy;
+        printf("FAIL: Unable to create a new vector \n\n");
+        return(1);
+      }
+      retval = N_VEnableFusedOps_Cuda(V, SUNTRUE);  /* V is known to be non-NULL here */
+      if (retval != 0) {
+        N_VDestroy(X);
+        N_VDestroy(Y);
+        N_VDestroy(Z);
+        N_VDestroy(U);
+        N_VDestroy(V);
+        delete stream_exec_policy;
+        delete reduce_exec_policy;
+        printf("FAIL: Unable to enable fused and vector array operations \n\n");
+        return(1);
+      }
+
+      /* fused operations */
+      fails += Test_N_VLinearCombination(V, length, 0);
+      fails += Test_N_VScaleAddMulti(V, length, 0);
+      fails += Test_N_VDotProdMulti(V, length, 0);
+
+      /* vector array operations */
+      fails += Test_N_VLinearSumVectorArray(V, length, 0);
+      fails += Test_N_VScaleVectorArray(V, length, 0);
+      fails += Test_N_VConstVectorArray(V, length, 0);
+      fails += Test_N_VWrmsNormVectorArray(V, length, 0);
+      fails += Test_N_VWrmsNormMaskVectorArray(V, length, 0);
+      fails += Test_N_VScaleAddMultiVectorArray(V, length, 0);
+      fails += Test_N_VLinearCombinationVectorArray(V, length, 0);
+
+      /* local reduction operations */
+      printf("\nTesting local reduction operations:\n\n");
+
+      fails += Test_N_VDotProdLocal(X, Y, length, 0);
+      fails += Test_N_VMaxNormLocal(X, length, 0);
+      fails += Test_N_VMinLocal(X, length, 0);
+      fails += Test_N_VL1NormLocal(X, length, 0);
+      fails += Test_N_VWSqrSumLocal(X, Y, length, 0);
+      fails += Test_N_VWSqrSumMaskLocal(X, Y, Z, length, 0);
+      fails += Test_N_VInvTestLocal(X, Z, length, 0);
+      if (length >= 7) fails += Test_N_VConstrMaskLocal(X, Y, Z, length, 0);
+      fails += Test_N_VMinQuotientLocal(X, Y, length, 0);
+
+      /* CUDA specific tests */
+      printf("\nTesting cuda vector specific operations:\n\n");
+      if (memtype==UNMANAGED) {
+        fails += Test_N_VMake_Cuda(X, length, 0);
+      } else if (memtype==MANAGED) {
+        fails += Test_N_VMakeManaged_Cuda(X, length, 0);
+      } else if (memtype==CUSTOM) {
+        fails += Test_N_VMakeWithManagedAllocator_Cuda(length, 0);
+      }
+
+      printf("\n=====> Beginning teardown\n");
+
+      /* Free vectors */
       N_VDestroy(X);
       N_VDestroy(Y);
       N_VDestroy(Z);
       N_VDestroy(U);
-      printf("FAIL: Unable to create a new vector \n\n");
-      return(1);
-    }
+      N_VDestroy(V);
 
-    /* fused operations */
-    fails += Test_N_VLinearCombination(V, length, 0);
-    fails += Test_N_VScaleAddMulti(V, length, 0);
-    fails += Test_N_VDotProdMulti(V, length, 0);
-
-    /* vector array operations */
-    fails += Test_N_VLinearSumVectorArray(V, length, 0);
-    fails += Test_N_VScaleVectorArray(V, length, 0);
-    fails += Test_N_VConstVectorArray(V, length, 0);
-    fails += Test_N_VWrmsNormVectorArray(V, length, 0);
-    fails += Test_N_VWrmsNormMaskVectorArray(V, length, 0);
-    fails += Test_N_VScaleAddMultiVectorArray(V, length, 0);
-    fails += Test_N_VLinearCombinationVectorArray(V, length, 0);
-
-    /* local reduction operations */
-    printf("\nTesting local reduction operations:\n\n");
-
-    fails += Test_N_VDotProdLocal(X, Y, length, 0);
-    fails += Test_N_VMaxNormLocal(X, length, 0);
-    fails += Test_N_VMinLocal(X, length, 0);
-    fails += Test_N_VL1NormLocal(X, length, 0);
-    fails += Test_N_VWSqrSumLocal(X, Y, length, 0);
-    fails += Test_N_VWSqrSumMaskLocal(X, Y, Z, length, 0);
-    fails += Test_N_VInvTestLocal(X, Z, length, 0);
-    fails += Test_N_VConstrMaskLocal(X, Y, Z, length, 0);
-    fails += Test_N_VMinQuotientLocal(X, Y, length, 0);
-
-    /* CUDA specific tests */
-    printf("\nTesting cuda vector specific operations:\n\n");
-    if (i==UNMANAGED) {
-      fails += Test_N_VMake_Cuda(X, length, 0);
-    } else if (i==MANAGED) {
-      fails += Test_N_VMakeManaged_Cuda(X, length, 0);
+      /* Synchronize */
+      cudaDeviceSynchronize();
+
+      printf("=====> Teardown complete\n\n");
     }
 
-    /* Free vectors */
-    N_VDestroy(X);
-    N_VDestroy(Y);
-    N_VDestroy(Z);
-    N_VDestroy(U);
-    N_VDestroy(V);
-  }
+    /* Print result */
+    if (fails) {
+      printf("\n\nFAIL: NVector module failed %i tests \n\n", fails);
+    } else {
+      printf("\n\nSUCCESS: NVector module passed all tests \n\n");
+    }
 
-  /* Print result */
-  if (fails) {
-    printf("\n\nFAIL: NVector module failed %i tests \n\n", fails);
-  } else {
-    printf("\n\nSUCCESS: NVector module passed all tests \n\n");
+    cudaStreamDestroy(stream);
+    delete stream_exec_policy;
+    delete reduce_exec_policy;
   }
-
+  
+  cudaDeviceSynchronize();
+  cudaDeviceReset();
   return(fails);
 }
 
@@ -349,6 +444,18 @@ int Test_N_VMakeManaged_Cuda(N_Vector X, sunindextype length, int myid)
   }
 
   failure += check_ans(NEG_HALF, Y, length);
+  if (failure) {
+    printf(">>> FAILED test -- N_VMakeManaged_Cuda Case 1, Proc %d \n", myid);
+    printf("    Failed N_VConst check \n \n");
+    N_VDestroy(Y);
+    return(1);
+  }
+
+  if (myid == 0) {
+    printf("PASSED test -- N_VMakeManaged_Cuda Case 1\n");
+  }
+
+  N_VDestroy(Y);
 
   /* Case 2: data is null */
   Y = N_VMakeManaged_Cuda(length, NULL);
@@ -367,6 +474,46 @@ int Test_N_VMakeManaged_Cuda(N_Vector X, sunindextype length, int myid)
   return(failure);
 }
 
+/* --------------------------------------------------------------------
+ * Test for the CUDA N_Vector N_VMakeWithManagedAllocator_Cuda function.
+ * Creates its own vector with sunalloc/sunfree, checks that it reports
+ * managed memory, and requires N_VConst to check data. Returns 0 or 1. */
+int Test_N_VMakeWithManagedAllocator_Cuda(sunindextype length, int myid)
+{
+  int failure = 0;
+  N_Vector Y;
+
+  Y = N_VMakeWithManagedAllocator_Cuda(length, sunalloc, sunfree);
+  if (Y == NULL) {
+    printf(">>> FAILED test -- N_VMakeWithManagedAllocator_Cuda, Proc %d \n", myid);
+    printf("    Vector is NULL \n \n");
+    return(1);
+  }
+
+  N_VConst(NEG_HALF, Y);
+
+  if (!N_VIsManagedMemory_Cuda(Y)) {
+    printf(">>> FAILED test -- N_VMakeWithManagedAllocator_Cuda, Proc %d \n", myid);
+    printf("    Vector is not using managed memory \n \n");
+    N_VDestroy(Y);
+    return(1);
+  }
+
+  failure += check_ans(NEG_HALF, Y, length);
+  if (failure) {
+    printf(">>> FAILED test -- N_VMakeWithManagedAllocator_Cuda, Proc %d \n", myid);
+    printf("    Failed N_VConst check \n \n");
+    N_VDestroy(Y);
+    return(1);
+  }
+
+  if (myid == 0) {
+    printf("PASSED test -- N_VMakeWithManagedAllocator_Cuda\n");
+  }
+
+  N_VDestroy(Y);
+  return(failure);
+}
 
 /* ----------------------------------------------------------------------
  * Implementation specific utility functions for vector tests
@@ -382,7 +529,9 @@ int check_ans(realtype ans, N_Vector X, sunindextype length)
 
   /* check vector data */
   for (i = 0; i < length; i++) {
-    failure += FNEQ(Xdata[i], ans);
+    int mismatch = FNEQ(Xdata[i], ans);  /* judge this entry alone, not the running total */
+    failure += mismatch;
+    if (mismatch) printf("check_ans fail: Xdata[%ld] = %f, expected Xdata[%ld] = %f\n", (long int) i, (double) Xdata[i], (long int) i, (double) ans);
   }
 
   return (failure > ZERO) ? (1) : (0);
@@ -390,8 +539,11 @@ int check_ans(realtype ans, N_Vector X, sunindextype length)
 
 booleantype has_data(N_Vector X)
 {
-  /* check if vector content is non-null */
-  return (X->content == NULL ? SUNFALSE : SUNTRUE);
+  /* check if vector data is non-null */
+  if ((N_VGetHostArrayPointer_Cuda(X) == NULL) &&
+      (N_VGetDeviceArrayPointer_Cuda(X) == NULL))
+    return SUNFALSE;
+  return SUNTRUE;
 }
 
 void set_element(N_Vector X, sunindextype i, realtype val)
diff --git a/examples/nvector/mpicuda/test_nvector_mpicuda.cu b/examples/nvector/mpicuda/test_nvector_mpicuda.cu
index 0593507755..ef00bfe1c0 100644
--- a/examples/nvector/mpicuda/test_nvector_mpicuda.cu
+++ b/examples/nvector/mpicuda/test_nvector_mpicuda.cu
@@ -19,7 +19,6 @@
 #include <stdlib.h>
 
 #include <sundials/sundials_types.h>
-#include <nvector/cuda/Vector.hpp>
 #include <nvector/nvector_cuda.h>
 #include <nvector/nvector_mpiplusx.h>
 #include <sundials/sundials_math.h>
@@ -306,7 +305,12 @@ int check_ans(realtype ans, N_Vector plusX, sunindextype local_length)
 
 booleantype has_data(N_Vector plusX)
 {
-  return (N_VGetLocalVector_MPIPlusX(plusX)->content == NULL) ? SUNFALSE : SUNTRUE;
+  N_Vector X = N_VGetLocalVector_MPIPlusX(plusX);
+  /* check if vector data is non-null */
+  if ((N_VGetHostArrayPointer_Cuda(X) == NULL) &&
+      (N_VGetDeviceArrayPointer_Cuda(X) == NULL))
+    return SUNFALSE;
+  return SUNTRUE;
 }
 
 void set_element(N_Vector plusX, sunindextype i, realtype val)
diff --git a/examples/nvector/test_nvector.c b/examples/nvector/test_nvector.c
index 62a48e48d6..b963079a3d 100644
--- a/examples/nvector/test_nvector.c
+++ b/examples/nvector/test_nvector.c
@@ -27,23 +27,16 @@
 #include <stdio.h>
 #include <stdlib.h>
 
+#include <sundials/sundials_nvector.h>
+#include <sundials/sundials_math.h>
+#include "test_nvector.h"
+
 /* POSIX timers */
 #if defined(SUNDIALS_HAVE_POSIX_TIMERS)
 #include <time.h>
 #include <unistd.h>
 #endif
 
-#include <sundials/sundials_nvector.h>
-#include <sundials/sundials_types.h>
-#include <sundials/sundials_math.h>
-#include "test_nvector.h"
-
-/* private functions */
-static double get_time();
-
-/* private variables */
-static int print_time = 0;
-
 #if defined(SUNDIALS_HAVE_POSIX_TIMERS) && defined(_POSIX_TIMERS)
 static time_t base_time_tv_sec = 0; /* Base time; makes time values returned
                                        by get_time easier to read when
@@ -52,6 +45,12 @@ static time_t base_time_tv_sec = 0; /* Base time; makes time values returned
                                     */
 #endif
 
+/* private functions */
+static double get_time();
+
+/* private variables */
+static int print_time = 0;
+
 /* macro for printing timings */
 #define FMT "%s Time: %22.15e\n\n"
 #define PRINT_TIME(test, time) if (print_time) printf(FMT, test, time)
@@ -1672,7 +1671,7 @@ int Test_N_VInvTest(N_Vector X, N_Vector Z, sunindextype local_length, int myid)
   int          fails = 0, failure = 0;
   double       start_time, stop_time, maxt;
   sunindextype i;
-  booleantype  test;
+  booleantype  ans, exp;
 
   if (local_length < 2) {
     printf("Error Test_N_VInvTest: Local vector length is %ld, length must be >= 2\n",
@@ -1689,14 +1688,17 @@ int Test_N_VInvTest(N_Vector X, N_Vector Z, sunindextype local_length, int myid)
   N_VConst(ZERO, Z);
 
   start_time = get_time();
-  test = N_VInvTest(X, Z);
+  ans = N_VInvTest(X, Z);
   sync_device();
   stop_time = get_time();
 
+  /* we expect no zeros */
+  exp = SUNTRUE;
+
   /* Z should be vector of +2 */
   failure = check_ans(TWO, Z, local_length);
 
-  if (failure || !test) {
+  if (failure || (ans != exp)) {
     printf(">>> FAILED test -- N_VInvTest Case 1, Proc %d \n", myid);
     fails++;
   } else if (myid == 0) {
@@ -1717,14 +1719,16 @@ int Test_N_VInvTest(N_Vector X, N_Vector Z, sunindextype local_length, int myid)
   /* fill vector data */
   N_VConst(ZERO, Z);
   for(i=0; i < local_length; i++){
-    if (i % 2)
+    if (i % 2) {
       set_element(X, i, HALF);
-    else
+    } else {
+      exp = SUNFALSE;
       set_element(X, i, ZERO);
+    }
   }
 
   start_time = get_time();
-  test = N_VInvTest(X, Z);
+  ans = N_VInvTest(X, Z);
   sync_device();
   stop_time = get_time();
 
@@ -1739,7 +1743,7 @@ int Test_N_VInvTest(N_Vector X, N_Vector Z, sunindextype local_length, int myid)
     }
   }
 
-  if (failure || test) {
+  if (failure || (ans != exp)) {
     printf(">>> FAILED test -- N_VInvTest Case 2, Proc %d \n", myid);
     fails++;
   } else if (myid == 0) {
diff --git a/examples/sunlinsol/cusolversp/CMakeLists.txt b/examples/sunlinsol/cusolversp/CMakeLists.txt
index be9e9548b5..04a0271ad1 100644
--- a/examples/sunlinsol/cusolversp/CMakeLists.txt
+++ b/examples/sunlinsol/cusolversp/CMakeLists.txt
@@ -124,8 +124,8 @@ if(EXAMPLES_INSTALL)
 
   # Prepare substitution variables for Makefile and/or CMakeLists templates
   set(SOLVER_LIB "")
-  set(NVECTOR_LIB "sundials_nveccuda")
-  set(SUNMAT_LIB "sundials_sunmatrixdense sundials_sunmatrixsparse")
+  set(NVECTOR_LIB "sundials_nvecserial sundials_nveccuda")
+  set(SUNMAT_LIB "sundials_sunmatrixdense sundials_sunmatrixsparse sundials_sunmatrixcusparse")
   set(SUNLS_LIB "sundials_sunlinsolcusolversp")
 
   examples2string(sunlinsol_cusolversp_examples EXAMPLES_CUSOLVER)
diff --git a/examples/sunmatrix/cusparse/CMakeLists.txt b/examples/sunmatrix/cusparse/CMakeLists.txt
index 54bd958193..08beeb5db5 100644
--- a/examples/sunmatrix/cusparse/CMakeLists.txt
+++ b/examples/sunmatrix/cusparse/CMakeLists.txt
@@ -29,6 +29,7 @@ set(sunmatrix_cusparse_examples
 set(sunmatrix_examples_dependencies
   test_sunmatrix
   sundials_matrix
+  dreadrb
   )
 
 # Specify libraries to link against (through the target that was used to
diff --git a/examples/sunmatrix/cusparse/test_sunmatrix_cusparse.cu b/examples/sunmatrix/cusparse/test_sunmatrix_cusparse.cu
index 43f2e77cca..d66d143394 100644
--- a/examples/sunmatrix/cusparse/test_sunmatrix_cusparse.cu
+++ b/examples/sunmatrix/cusparse/test_sunmatrix_cusparse.cu
@@ -33,6 +33,35 @@
 
 enum { IDENTITY, RANDOM, RBFILE };
 
+/* Implementation specific test of SUNMatrix_cuSparse_SetKernelExecPolicy */
+int Test_SetKernelExecPolicy(SUNMatrix A, int myid);
+
+/* Trivial execution policy used by Test_SetKernelExecPolicy: it reports
+ * grid size 1, block size 1 and stream 0 regardless of its arguments. */
+class ATestExecPolicy : public SUNCudaExecPolicy
+{
+public:
+  ATestExecPolicy() {}
+
+  /* grid size is fixed at 1 */
+  virtual size_t gridSize(size_t numWorkElements = 0, size_t blockDim = 0) const
+  { return 1; }
+
+  /* block size is fixed at 1 */
+  virtual size_t blockSize(size_t numWorkElements = 0, size_t gridDim = 0) const
+  { return 1; }
+
+  /* stream is fixed at 0 */
+  virtual cudaStream_t stream() const
+  { return 0; }
+
+  /* returns a new heap-allocated ATestExecPolicy as a base-class pointer */
+  virtual SUNCudaExecPolicy* clone() const
+  {
+    return new ATestExecPolicy();
+  }
+};
+
  /* ----------------------------------------------------------------------
   * Main SUNMatrix Testing Routine
   * --------------------------------------------------------------------*/
@@ -369,7 +398,7 @@ enum { IDENTITY, RANDOM, RBFILE };
    N_VCopyToDevice_Cuda(d_y);
 
    printf("Setup complete\n");
-   printf("Beginning tests\n");
+   printf("Beginning tests\n\n");
 
    /* SUNMatrix Tests */
    fails += Test_SUNMatGetID(dA, SUNMATRIX_CUSPARSE, 0);
@@ -377,10 +406,9 @@ enum { IDENTITY, RANDOM, RBFILE };
    fails += Test_SUNMatCopy(dA, 0);
    fails += Test_SUNMatZero(dA, 0);
    fails += Test_SUNMatScaleAdd(dA, dI, 0);
-   if (square) {
-     fails += Test_SUNMatScaleAddI(dA, dI, 0);
-   }
+   if (square) fails += Test_SUNMatScaleAddI(dA, dI, 0);
    fails += Test_SUNMatMatvec(dA, d_x, d_y, 0);
+   if (square) fails += Test_SetKernelExecPolicy(dI, 0);
 
    /* Print result */
    if (fails) {
@@ -424,6 +452,58 @@ enum { IDENTITY, RANDOM, RBFILE };
    return(fails);
  }
 
+ /* ----------------------------------------------------------------------
+  * Test the SUNMatrix_cuSparse_SetKernelExecPolicy function.
+  * --------------------------------------------------------------------*/
+int Test_SetKernelExecPolicy(SUNMatrix I, int myid)
+{
+  int print_all_ranks = 0; /* presumably read by the TEST_STATUS macro -- confirm */
+  realtype  tol = 100*UNIT_ROUNDOFF;
+  /* work on a clone of I so the caller's matrix is left untouched */
+  SUNMatrix B = SUNMatClone(I);
+
+  /* check cloned matrix */
+  if (B == NULL) {
+    TEST_STATUS(">>> FAILED test -- SetKernelExecPolicy \n", myid);
+    TEST_STATUS("    After SUNMatClone, B == NULL \n \n", myid);
+    return(1);
+  }
+
+  /* copy data */
+  if (SUNMatCopy(I, B)) {
+    TEST_STATUS(">>> FAILED test -- SetKernelExecPolicy \n", myid);
+    TEST_STATUS("    SUNMatCopy returned nonzero \n \n", myid);
+    SUNMatDestroy(B);
+    return(1);
+  }
+
+  /* attach the test policy to B; exec_policy outlives every use of B below */
+  ATestExecPolicy exec_policy;
+  SUNMatrix_cuSparse_SetKernelExecPolicy(B, &exec_policy);
+
+  /* exercise an operation on B after the policy change (scale by -1, add I) */
+  if (SUNMatScaleAddI(RCONST(-1.0), B)) {
+    TEST_STATUS(">>> FAILED test -- SetKernelExecPolicy \n", myid);
+    TEST_STATUS("    SUNMatScaleAddI returned nonzero \n \n", myid);
+    SUNMatDestroy(B);
+    return(1);
+  }
+
+  /* entries of B are compared against ZERO within tol (assumes I is identity) */
+  if (check_matrix_entry(B, ZERO, tol)) {
+    TEST_STATUS(">>> FAILED test -- SetKernelExecPolicy \n", myid);
+    TEST_STATUS("    check_matrix_entry returned nonzero \n \n", myid);
+    SUNMatDestroy(B);
+    return(1);
+  }
+
+  TEST_STATUS("    PASSED test -- SetKernelExecPolicy \n", myid);
+
+  SUNMatDestroy(B);
+
+  return 0;
+}
+
  /* ----------------------------------------------------------------------
   * Check matrix
   * --------------------------------------------------------------------*/
diff --git a/examples/templates/cmakelists_openmp_C_ex.in b/examples/templates/cmakelists_openmp_C_ex.in
index e7549f2906..9f173877a5 100644
--- a/examples/templates/cmakelists_openmp_C_ex.in
+++ b/examples/templates/cmakelists_openmp_C_ex.in
@@ -77,6 +77,7 @@ endif()
 
 # List of SUNDIALS libraries
 set(SUNDIALS_LIBRARIES
+  -L${SUNDIALS_LIBRARY_DIR}
   ${SUNDIALS_SOLVER_LIB}
   ${SUNDIALS_NVEC_LIB}
   ${SUNDIALS_EXTRA_LIBS})
diff --git a/examples/templates/cmakelists_openmp_F77_ex.in b/examples/templates/cmakelists_openmp_F77_ex.in
index 7ec09b380d..b45d03a8b8 100644
--- a/examples/templates/cmakelists_openmp_F77_ex.in
+++ b/examples/templates/cmakelists_openmp_F77_ex.in
@@ -94,6 +94,7 @@ endif()
 
 # List of SUNDIALS libraries
 set(SUNDIALS_LIBRARIES
+  -L${SUNDIALS_LIBRARY_DIR}
   ${SUNDIALS_SOLVER_FLIB}
   ${SUNDIALS_SOLVER_LIB}
   ${SUNDIALS_NVEC_FLIB}
diff --git a/examples/templates/cmakelists_openmpdev_ex.in b/examples/templates/cmakelists_openmpdev_ex.in
index f817948d68..eec99952d3 100644
--- a/examples/templates/cmakelists_openmpdev_ex.in
+++ b/examples/templates/cmakelists_openmpdev_ex.in
@@ -76,6 +76,7 @@ endif()
 
 # List of SUNDIALS libraries
 set(SUNDIALS_LIBRARIES
+  -L${SUNDIALS_LIBRARY_DIR}
   ${SUNDIALS_SOLVER_LIB}
   ${SUNDIALS_NVEC_LIB}
   ${SUNDIALS_EXTRA_LIB})
diff --git a/examples/templates/cmakelists_parallel_CUDA_ex.in b/examples/templates/cmakelists_parallel_CUDA_ex.in
index c1d70f2df7..85709eb8ef 100644
--- a/examples/templates/cmakelists_parallel_CUDA_ex.in
+++ b/examples/templates/cmakelists_parallel_CUDA_ex.in
@@ -87,6 +87,7 @@ endif()
 
 # List of SUNDIALS libraries
 set(SUNDIALS_LIBRARIES
+  -L${SUNDIALS_LIBRARY_DIR}
   ${SUNDIALS_SOLVER_LIB}
   ${SUNDIALS_NVEC_LIB}
   ${SUNDIALS_MPIPLUSX_LIB}
diff --git a/examples/templates/cmakelists_parallel_CXX_ex.in b/examples/templates/cmakelists_parallel_CXX_ex.in
index 1bf7f0ab44..620f90da16 100644
--- a/examples/templates/cmakelists_parallel_CXX_ex.in
+++ b/examples/templates/cmakelists_parallel_CXX_ex.in
@@ -79,6 +79,7 @@ endif()
 
 # List of SUNDIALS libraries
 set(SUNDIALS_LIBRARIES
+  -L${SUNDIALS_LIBRARY_DIR}
   ${SUNDIALS_SOLVER_LIB}
   ${SUNDIALS_NVECPAR_LIB}
   ${SUNDIALS_NVECSER_LIB}
diff --git a/examples/templates/cmakelists_parallel_C_ex.in b/examples/templates/cmakelists_parallel_C_ex.in
index 9e82b86d2c..dbfe674513 100644
--- a/examples/templates/cmakelists_parallel_C_ex.in
+++ b/examples/templates/cmakelists_parallel_C_ex.in
@@ -79,6 +79,7 @@ endif()
 
 # List of SUNDIALS libraries
 set(SUNDIALS_LIBRARIES
+  -L${SUNDIALS_LIBRARY_DIR}
   ${SUNDIALS_SOLVER_LIB}
   ${SUNDIALS_NVECPAR_LIB}
   ${SUNDIALS_NVECSER_LIB}
diff --git a/examples/templates/cmakelists_parallel_F77_ex.in b/examples/templates/cmakelists_parallel_F77_ex.in
index 92464747e6..b3cf3fee24 100644
--- a/examples/templates/cmakelists_parallel_F77_ex.in
+++ b/examples/templates/cmakelists_parallel_F77_ex.in
@@ -75,6 +75,7 @@ endif()
 
 # List of SUNDIALS libraries
 set(SUNDIALS_LIBRARIES
+  -L${SUNDIALS_LIBRARY_DIR}
   ${SUNDIALS_SOLVER_FLIB}
   ${SUNDIALS_SOLVER_LIB}
   ${SUNDIALS_NVEC_FLIB}
diff --git a/examples/templates/cmakelists_parallel_F90_ex.in b/examples/templates/cmakelists_parallel_F90_ex.in
index 67de04b1b1..e71d30b256 100644
--- a/examples/templates/cmakelists_parallel_F90_ex.in
+++ b/examples/templates/cmakelists_parallel_F90_ex.in
@@ -74,6 +74,7 @@ endif()
 
 # List of SUNDIALS libraries
 set(SUNDIALS_LIBRARIES
+  -L${SUNDIALS_LIBRARY_DIR}
   ${SUNDIALS_SOLVER_FLIB}
   ${SUNDIALS_SOLVER_LIB}
   ${SUNDIALS_NVEC_FLIB}
diff --git a/examples/templates/cmakelists_parallel_RAJA_ex.in b/examples/templates/cmakelists_parallel_RAJA_ex.in
index a539182717..a3adcc64fa 100644
--- a/examples/templates/cmakelists_parallel_RAJA_ex.in
+++ b/examples/templates/cmakelists_parallel_RAJA_ex.in
@@ -87,6 +87,7 @@ endif()
 
 # List of SUNDIALS libraries
 set(SUNDIALS_LIBRARIES
+  -L${SUNDIALS_LIBRARY_DIR}
   ${SUNDIALS_SOLVER_LIB}
   ${SUNDIALS_NVEC_LIB}
   ${SUNDIALS_MPIPLUSX_LIB}
diff --git a/examples/templates/cmakelists_parhyp_CXX_ex.in b/examples/templates/cmakelists_parhyp_CXX_ex.in
index bef742c30b..a5ef0587df 100644
--- a/examples/templates/cmakelists_parhyp_CXX_ex.in
+++ b/examples/templates/cmakelists_parhyp_CXX_ex.in
@@ -71,6 +71,7 @@ endif()
 
 # List of SUNDIALS libraries
 set(SUNDIALS_LIBRARIES
+  -L${SUNDIALS_LIBRARY_DIR}
   ${SUNDIALS_SOLVER_LIB}
   ${SUNDIALS_NVEC_LIB}
   ${SUNDIALS_NVECPH_LIB}
diff --git a/examples/templates/cmakelists_parhyp_C_ex.in b/examples/templates/cmakelists_parhyp_C_ex.in
index 7fbe707f39..f7d0f0c6f3 100644
--- a/examples/templates/cmakelists_parhyp_C_ex.in
+++ b/examples/templates/cmakelists_parhyp_C_ex.in
@@ -67,6 +67,7 @@ endif()
 
 # List of SUNDIALS libraries
 set(SUNDIALS_LIBRARIES
+  -L${SUNDIALS_LIBRARY_DIR}
   ${SUNDIALS_SOLVER_LIB}
   ${SUNDIALS_NVEC_LIB}
   ${SUNDIALS_EXTRA_LIBS})
diff --git a/examples/templates/cmakelists_petsc_C_ex.in b/examples/templates/cmakelists_petsc_C_ex.in
index 6a67634136..d24302d1d0 100644
--- a/examples/templates/cmakelists_petsc_C_ex.in
+++ b/examples/templates/cmakelists_petsc_C_ex.in
@@ -71,6 +71,7 @@ endif()
 
 # List of SUNDIALS libraries
 set(SUNDIALS_LIBRARIES
+  -L${SUNDIALS_LIBRARY_DIR}
   ${SUNDIALS_SOLVER_LIB}
   ${SUNDIALS_NVEC_LIB}
   ${SUNDIALS_NLS_LIB}
diff --git a/examples/templates/cmakelists_pthreads_C_ex.in b/examples/templates/cmakelists_pthreads_C_ex.in
index d4add4881a..8fc2cfc393 100644
--- a/examples/templates/cmakelists_pthreads_C_ex.in
+++ b/examples/templates/cmakelists_pthreads_C_ex.in
@@ -73,6 +73,7 @@ endif()
 
 # List of SUNDIALS libraries
 set(SUNDIALS_LIBRARIES
+  -L${SUNDIALS_LIBRARY_DIR}
   ${SUNDIALS_SOLVER_LIB}
   ${SUNDIALS_NVEC_LIB}
   ${SUNDIALS_EXTRA_LIBS})
diff --git a/examples/templates/cmakelists_pthreads_F77_ex.in b/examples/templates/cmakelists_pthreads_F77_ex.in
index 8fa7678067..e206a5756a 100644
--- a/examples/templates/cmakelists_pthreads_F77_ex.in
+++ b/examples/templates/cmakelists_pthreads_F77_ex.in
@@ -89,6 +89,7 @@ endif()
 
 # List of SUNDIALS libraries
 set(SUNDIALS_LIBRARIES
+  -L${SUNDIALS_LIBRARY_DIR}
   ${SUNDIALS_SOLVER_FLIB}
   ${SUNDIALS_SOLVER_LIB}
   ${SUNDIALS_NVEC_FLIB}
diff --git a/examples/templates/cmakelists_serial_CUDA_ex.in b/examples/templates/cmakelists_serial_CUDA_ex.in
index f3ebcb3f42..819facd449 100644
--- a/examples/templates/cmakelists_serial_CUDA_ex.in
+++ b/examples/templates/cmakelists_serial_CUDA_ex.in
@@ -77,28 +77,39 @@ find_library(SUNDIALS_NVEC_LIB
   sundials_nveccuda ${SUNDIALS_LIBRARY_DIR}
   DOC "NVECTOR_CUDA library")
 
+find_library(SUNDIALS_CUSPARSEMAT_LIB
+  sundials_sunmatrixcusparse ${SUNDIALS_LIBRARY_DIR}
+  DOC "SUNMATRIX_CUSPARSE library")
+
 find_library(SUNDIALS_CUSOLVERSP_LIB
   sundials_sunlinsolcusolversp
   ${SUNDIALS_LIBRARY_DIR}
   DOC "SUNLINSOL_CUSOLVERSP linear solver library")
 
-# Set additional libraries
-set(SUNDIALS_EXTRA_LIBS -L${SUNDIALS_LIBRARY_DIR} @SUNMAT_LIB@ @LIBS@ CACHE STRING "Additional libraries")
-
 # For SUNDIALS module examples the solver library is not needed
 if(NOT SUNDIALS_SOLVER_LIB)
   set(SUNDIALS_SOLVER_LIB "")
 endif()
 
+# For some examples the SUNDIALS_CUSPARSEMAT_LIB library is not needed
+if(NOT SUNDIALS_CUSPARSEMAT_LIB)
+  set(SUNDIALS_CUSPARSEMAT_LIB "")
+endif()
+
 # For some examples the SUNLINSOL_CUSOLVERSP library is not needed
 if(NOT SUNDIALS_CUSOLVERSP_LIB)
   set(SUNDIALS_CUSOLVERSP_LIB "")
 endif()
 
+# Set additional libraries
+set(SUNDIALS_EXTRA_LIBS -L${SUNDIALS_LIBRARY_DIR} @LIBS@ CACHE STRING "Additional libraries")
+
 # List of SUNDIALS libraries
 set(SUNDIALS_LIBRARIES
+  -L${SUNDIALS_LIBRARY_DIR}
   ${SUNDIALS_SOLVER_LIB}
   ${SUNDIALS_NVEC_LIB}
+  ${SUNDIALS_CUSPARSEMAT_LIB}
   ${SUNDIALS_EXTRA_LIBS})
 
 # ------------------------------------------------------------------------------
@@ -142,6 +153,7 @@ endif()
 set(SUNDIALS_LIBRARIES
   ${SUNDIALS_SOLVER_LIB}
   ${SUNDIALS_NVEC_LIB}
+  ${SUNDIALS_CUSPARSEMAT_LIB}
   ${SUNDIALS_CUSOLVERSP_LIB}
   ${SUNDIALS_EXTRA_LIBS})
 
diff --git a/examples/templates/cmakelists_serial_CXX_ex.in b/examples/templates/cmakelists_serial_CXX_ex.in
index 1ca362f0de..3d2dc1ba39 100644
--- a/examples/templates/cmakelists_serial_CXX_ex.in
+++ b/examples/templates/cmakelists_serial_CXX_ex.in
@@ -72,6 +72,7 @@ endif()
 
 # List of SUNDIALS libraries
 set(SUNDIALS_LIBRARIES
+  -L${SUNDIALS_LIBRARY_DIR}
   ${SUNDIALS_SOLVER_LIB}
   ${SUNDIALS_NVEC_LIB}
   ${SUNDIALS_MANYVEC_LIB}
diff --git a/examples/templates/cmakelists_serial_C_ex.in b/examples/templates/cmakelists_serial_C_ex.in
index 69d9afc010..cb69468b68 100644
--- a/examples/templates/cmakelists_serial_C_ex.in
+++ b/examples/templates/cmakelists_serial_C_ex.in
@@ -72,6 +72,7 @@ endif()
 
 # List of SUNDIALS libraries
 set(SUNDIALS_LIBRARIES
+  -L${SUNDIALS_LIBRARY_DIR}
   ${SUNDIALS_SOLVER_LIB}
   ${SUNDIALS_NVEC_LIB}
   ${SUNDIALS_MANYVEC_LIB}
diff --git a/examples/templates/cmakelists_serial_F2003_ex.in b/examples/templates/cmakelists_serial_F2003_ex.in
index 22f31c26a2..946761697f 100644
--- a/examples/templates/cmakelists_serial_F2003_ex.in
+++ b/examples/templates/cmakelists_serial_F2003_ex.in
@@ -101,6 +101,7 @@ endif()
 
 # List of SUNDIALS libraries
 set(SUNDIALS_LIBRARIES
+  -L${SUNDIALS_LIBRARY_DIR}
   ${SUNDIALS_SOLVER_FLIB}
   ${SUNDIALS_NVEC_FLIB}
   ${SUNDIALS_SUNLS_FLIB}
diff --git a/examples/templates/cmakelists_serial_F77_ex.in b/examples/templates/cmakelists_serial_F77_ex.in
index 3a7ea68193..fa36715a2f 100644
--- a/examples/templates/cmakelists_serial_F77_ex.in
+++ b/examples/templates/cmakelists_serial_F77_ex.in
@@ -84,6 +84,7 @@ endif()
 
 # List of SUNDIALS libraries
 set(SUNDIALS_LIBRARIES
+  -L${SUNDIALS_LIBRARY_DIR}
   ${SUNDIALS_SOLVER_FLIB}
   ${SUNDIALS_SOLVER_LIB}
   ${SUNDIALS_NVEC_FLIB}
diff --git a/examples/templates/cmakelists_serial_F90_ex.in b/examples/templates/cmakelists_serial_F90_ex.in
index d5c7d04496..2d31bc13a0 100644
--- a/examples/templates/cmakelists_serial_F90_ex.in
+++ b/examples/templates/cmakelists_serial_F90_ex.in
@@ -75,6 +75,7 @@ endif()
 
 # List of SUNDIALS libraries
 set(SUNDIALS_LIBRARIES
+  -L${SUNDIALS_LIBRARY_DIR}
   ${SUNDIALS_SOLVER_FLIB}
   ${SUNDIALS_SOLVER_LIB}
   ${SUNDIALS_NVEC_FLIB}
diff --git a/examples/templates/cmakelists_serial_RAJA_ex.in b/examples/templates/cmakelists_serial_RAJA_ex.in
index be1db1403e..832bea9ceb 100644
--- a/examples/templates/cmakelists_serial_RAJA_ex.in
+++ b/examples/templates/cmakelists_serial_RAJA_ex.in
@@ -83,6 +83,7 @@ endif()
 
 # List of SUNDIALS libraries
 set(SUNDIALS_LIBRARIES
+  -L${SUNDIALS_LIBRARY_DIR}
   ${SUNDIALS_SOLVER_LIB}
   ${SUNDIALS_NVEC_LIB}
   ${SUNDIALS_EXTRA_LIBS})
diff --git a/examples/templates/cmakelists_trilinos_CXX_ex.in b/examples/templates/cmakelists_trilinos_CXX_ex.in
index ab9a0d713a..30793bbc64 100644
--- a/examples/templates/cmakelists_trilinos_CXX_ex.in
+++ b/examples/templates/cmakelists_trilinos_CXX_ex.in
@@ -75,6 +75,7 @@ endif()
 
 # List of SUNDIALS libraries
 set(SUNDIALS_LIBRARIES
+  -L${SUNDIALS_LIBRARY_DIR}
   ${SUNDIALS_SOLVER_LIB}
   ${SUNDIALS_NVEC_LIB}
   ${SUNDIALS_EXTRA_LIBS})
diff --git a/examples/templates/makefile_serial_CUDA_ex.in b/examples/templates/makefile_serial_CUDA_ex.in
index e6691af983..9f9b0802dc 100644
--- a/examples/templates/makefile_serial_CUDA_ex.in
+++ b/examples/templates/makefile_serial_CUDA_ex.in
@@ -34,7 +34,7 @@ NVCC        = @CMAKE_CUDA_COMPILER@
 NVCCFLAGS   = -ccbin=${CXX} -std=c++11 @CMAKE_CUDA_FLAGS@
 LD          = ${NVCC}
 LDFLAGS     = @LDFLAGS@ ${NVCCFLAGS} -Xcompiler \"-Wl,-rpath,${libdir}\"
-LIBS        = @LIBS@
+LIBS        = @LIBS@ -lcusolver -lcusparse
 
 TMP_INCS = ${includedir}
 INCLUDES = $(addprefix -I, ${TMP_INCS})
diff --git a/include/arkode/arkode_arkstep.h b/include/arkode/arkode_arkstep.h
index 7bc7ac3cde..c2431c200f 100644
--- a/include/arkode/arkode_arkstep.h
+++ b/include/arkode/arkode_arkstep.h
@@ -140,6 +140,8 @@ SUNDIALS_EXPORT int ARKStepSetErrorBias(void *arkode_mem,
                                         realtype bias);
 SUNDIALS_EXPORT int ARKStepSetMaxGrowth(void *arkode_mem,
                                         realtype mx_growth);
+SUNDIALS_EXPORT int ARKStepSetMinReduction(void *arkode_mem,
+                                           realtype eta_min);
 SUNDIALS_EXPORT int ARKStepSetFixedStepBounds(void *arkode_mem,
                                               realtype lb, realtype ub);
 SUNDIALS_EXPORT int ARKStepSetAdaptivityMethod(void *arkode_mem,
@@ -237,6 +239,8 @@ SUNDIALS_EXPORT int ARKStepSetMassPreconditioner(void *arkode_mem,
 SUNDIALS_EXPORT int ARKStepSetJacTimes(void *arkode_mem,
                                        ARKLsJacTimesSetupFn jtsetup,
                                        ARKLsJacTimesVecFn jtimes);
+SUNDIALS_EXPORT int ARKStepSetJacTimesRhsFn(void *arkode_mem,
+                                            ARKRhsFn jtimesRhsFn);
 SUNDIALS_EXPORT int ARKStepSetMassTimes(void *arkode_mem,
                                         ARKLsMassTimesSetupFn msetup,
                                         ARKLsMassTimesVecFn mtimes,
diff --git a/include/arkode/arkode_erkstep.h b/include/arkode/arkode_erkstep.h
index 2431b7be65..6cb58aea4e 100644
--- a/include/arkode/arkode_erkstep.h
+++ b/include/arkode/arkode_erkstep.h
@@ -86,6 +86,8 @@ SUNDIALS_EXPORT int ERKStepSetErrorBias(void *arkode_mem,
                                         realtype bias);
 SUNDIALS_EXPORT int ERKStepSetMaxGrowth(void *arkode_mem,
                                         realtype mx_growth);
+SUNDIALS_EXPORT int ERKStepSetMinReduction(void *arkode_mem,
+                                           realtype eta_min);
 SUNDIALS_EXPORT int ERKStepSetFixedStepBounds(void *arkode_mem,
                                               realtype lb, realtype ub);
 SUNDIALS_EXPORT int ERKStepSetAdaptivityMethod(void *arkode_mem,
diff --git a/include/cvode/cvode.h b/include/cvode/cvode.h
index 77d32d8e7b..2a2bfbef43 100644
--- a/include/cvode/cvode.h
+++ b/include/cvode/cvode.h
@@ -22,6 +22,7 @@
 #include <sundials/sundials_nvector.h>
 #include <sundials/sundials_nonlinearsolver.h>
 #include <cvode/cvode_ls.h>
+#include <cvode/cvode_proj.h>
 
 #ifdef __cplusplus  /* wrapper to enable C++ usage */
 extern "C" {
@@ -76,6 +77,10 @@ extern "C" {
 #define CV_TOO_CLOSE            -27
 #define CV_VECTOROP_ERR         -28
 
+#define CV_PROJ_MEM_NULL        -29
+#define CV_PROJFUNC_FAIL        -30
+#define CV_REPTD_PROJFUNC_ERR   -31
+
 #define CV_UNRECOGNIZED_ERR     -99
 
 
@@ -95,6 +100,8 @@ typedef void (*CVErrHandlerFn)(int error_code,
                                const char *module, const char *function,
                                char *msg, void *user_data);
 
+typedef int (*CVMonitorFn)(void *cvode_mem, void *user_data);
+
 /* -------------------
  * Exported Functions
  * ------------------- */
@@ -118,6 +125,8 @@ SUNDIALS_EXPORT int CVodeSetErrHandlerFn(void *cvode_mem, CVErrHandlerFn ehfun,
                                          void *eh_data);
 SUNDIALS_EXPORT int CVodeSetErrFile(void *cvode_mem, FILE *errfp);
 SUNDIALS_EXPORT int CVodeSetUserData(void *cvode_mem, void *user_data);
+SUNDIALS_EXPORT int CVodeSetMonitorFn(void *cvode_mem, CVMonitorFn fn);
+SUNDIALS_EXPORT int CVodeSetMonitorFrequency(void *cvode_mem, long int nst);
 SUNDIALS_EXPORT int CVodeSetMaxOrd(void *cvode_mem, int maxord);
 SUNDIALS_EXPORT int CVodeSetMaxNumSteps(void *cvode_mem, long int mxsteps);
 SUNDIALS_EXPORT int CVodeSetMaxHnilWarns(void *cvode_mem, int mxhnil);
@@ -131,9 +140,9 @@ SUNDIALS_EXPORT int CVodeSetMaxNonlinIters(void *cvode_mem, int maxcor);
 SUNDIALS_EXPORT int CVodeSetMaxConvFails(void *cvode_mem, int maxncf);
 SUNDIALS_EXPORT int CVodeSetNonlinConvCoef(void *cvode_mem, realtype nlscoef);
 SUNDIALS_EXPORT int CVodeSetConstraints(void *cvode_mem, N_Vector constraints);
-
 SUNDIALS_EXPORT int CVodeSetNonlinearSolver(void *cvode_mem,
                                             SUNNonlinearSolver NLS);
+SUNDIALS_EXPORT int CVodeSetUseIntegratorFusedKernels(void *cvode_mem, booleantype onoff);
 
 /* Rootfinding initialization function */
 SUNDIALS_EXPORT int CVodeRootInit(void *cvode_mem, int nrtfn, CVRootFn g);
@@ -192,6 +201,9 @@ SUNDIALS_EXPORT char *CVodeGetReturnFlagName(long int flag);
 /* Free function */
 SUNDIALS_EXPORT void CVodeFree(void **cvode_mem);
 
+/* CVLS interface function that depends on CVRhsFn */
+SUNDIALS_EXPORT int CVodeSetJacTimesRhsFn(void *cvode_mem,
+                                          CVRhsFn jtimesRhsFn);
 
 #ifdef __cplusplus
 }
diff --git a/include/cvode/cvode_ls.h b/include/cvode/cvode_ls.h
index 31e51019e6..cd79eb3c7b 100644
--- a/include/cvode/cvode_ls.h
+++ b/include/cvode/cvode_ls.h
@@ -124,6 +124,11 @@ SUNDIALS_EXPORT int CVodeGetNumJtimesEvals(void *cvode_mem,
                                            long int *njvevals);
 SUNDIALS_EXPORT int CVodeGetNumLinRhsEvals(void *cvode_mem,
                                            long int *nfevalsLS);
+SUNDIALS_EXPORT int CVodeGetLinSolveStats(void *cvode_mem,
+                                          long int *njevals, long int *nfevalsLS,
+                                          long int *nliters, long int *nlcfails,
+                                          long int *npevals, long int *npsolves,
+                                          long int *njtsetups, long int *njtimes);
 SUNDIALS_EXPORT int CVodeGetLastLinFlag(void *cvode_mem,
                                         long int *flag);
 SUNDIALS_EXPORT char *CVodeGetLinReturnFlagName(long int flag);
diff --git a/include/cvode/cvode_proj.h b/include/cvode/cvode_proj.h
new file mode 100644
index 0000000000..127ee69904
--- /dev/null
+++ b/include/cvode/cvode_proj.h
@@ -0,0 +1,58 @@
+/* -----------------------------------------------------------------------------
+ * Programmer(s): David J. Gardner @ LLNL
+ * -----------------------------------------------------------------------------
+ * Based on CPODES by Radu Serban @ LLNL
+ * -----------------------------------------------------------------------------
+ * SUNDIALS Copyright Start
+ * Copyright (c) 2002-2020, Lawrence Livermore National Security
+ * and Southern Methodist University.
+ * All rights reserved.
+ *
+ * See the top-level LICENSE and NOTICE files for details.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ * SUNDIALS Copyright End
+ * -----------------------------------------------------------------------------
+ * This is the header file for CVODE's projection interface.
+ * ---------------------------------------------------------------------------*/
+
+#ifndef _CVPROJ_H
+#define _CVPROJ_H
+
+#include <sundials/sundials_nvector.h>
+
+#ifdef __cplusplus  /* wrapper to enable C++ usage */
+extern "C" {
+#endif
+
+/* -----------------------------------------------------------------------------
+ * CVProj user-supplied function prototypes
+ * ---------------------------------------------------------------------------*/
+
+typedef int (*CVProjFn)(realtype t, N_Vector ycur, N_Vector corr,
+                        realtype epsProj, N_Vector err, void *user_data);
+
+
+/* -----------------------------------------------------------------------------
+ * CVProj Exported functions
+ * ---------------------------------------------------------------------------*/
+
+/* Projection initialization functions */
+SUNDIALS_EXPORT int CVodeSetProjFn(void *cvode_mem, CVProjFn pfun);
+
+/* Optional input functions */
+SUNDIALS_EXPORT int CVodeSetProjErrEst(void *cvode_mem, booleantype onoff);
+SUNDIALS_EXPORT int CVodeSetProjFrequency(void *cvode_mem, long int proj_freq);
+SUNDIALS_EXPORT int CVodeSetMaxNumProjFails(void *cvode_mem, int max_fails);
+SUNDIALS_EXPORT int CVodeSetEpsProj(void *cvode_mem, realtype eps);
+SUNDIALS_EXPORT int CVodeSetProjFailEta(void *cvode_mem, realtype eta);
+
+/* Optional output functions */
+SUNDIALS_EXPORT int CVodeGetNumProjEvals(void *cvode_mem, long int *nproj);
+SUNDIALS_EXPORT int CVodeGetNumProjFails(void *cvode_mem, long int *nprf);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/include/cvodes/cvodes.h b/include/cvodes/cvodes.h
index 29e294b7b7..479c6cca34 100644
--- a/include/cvodes/cvodes.h
+++ b/include/cvodes/cvodes.h
@@ -270,6 +270,10 @@ SUNDIALS_EXPORT char *CVodeGetReturnFlagName(long int flag);
 /* Free function */
 SUNDIALS_EXPORT void CVodeFree(void **cvode_mem);
 
+/* CVLS interface function that depends on CVRhsFn */
+SUNDIALS_EXPORT int CVodeSetJacTimesRhsFn(void *cvode_mem,
+                                          CVRhsFn jtimesRhsFn);
+
 
 /* ---------------------------------
  * Exported Functions -- Quadrature
@@ -537,6 +541,9 @@ typedef struct {
 SUNDIALS_EXPORT int CVodeGetAdjCheckPointsInfo(void *cvode_mem,
                                                CVadjCheckPointRec *ckpnt);
 
+/* CVLS interface function that depends on CVRhsFn */
+SUNDIALS_EXPORT int CVodeSetJacTimesRhsFnB(void *cvode_mem, int which, CVRhsFn jtimesRhsFn);
+
 
 /* Undocumented Optional Output Functions For Backward Problems */
 
diff --git a/include/ida/ida.h b/include/ida/ida.h
index e340543694..d06b633da2 100644
--- a/include/ida/ida.h
+++ b/include/ida/ida.h
@@ -207,6 +207,10 @@ SUNDIALS_EXPORT char *IDAGetReturnFlagName(long int flag);
 /* Free function */
 SUNDIALS_EXPORT void IDAFree(void **ida_mem);
 
+/* IDALS interface function that depends on IDAResFn */
+SUNDIALS_EXPORT int IDASetJacTimesResFn(void *ida_mem,
+                                        IDAResFn jtimesResFn);
+
 
 #ifdef __cplusplus
 }
diff --git a/include/idas/idas.h b/include/idas/idas.h
index a0c59809c4..ad5469e80d 100644
--- a/include/idas/idas.h
+++ b/include/idas/idas.h
@@ -285,6 +285,10 @@ SUNDIALS_EXPORT char *IDAGetReturnFlagName(long int flag);
 /* Free function */
 SUNDIALS_EXPORT void IDAFree(void **ida_mem);
 
+/* IDALS interface function that depends on IDAResFn */
+SUNDIALS_EXPORT int IDASetJacTimesResFn(void *ida_mem,
+                                        IDAResFn jtimesResFn);
+
 
 /* ---------------------------------
  * Exported Functions -- Quadrature
@@ -549,6 +553,10 @@ typedef struct {
 SUNDIALS_EXPORT int IDAGetAdjCheckPointsInfo(void *ida_mem,
                                              IDAadjCheckPointRec *ckpnt);
 
+/* IDALS interface function that depends on IDAResFn */
+SUNDIALS_EXPORT int IDASetJacTimesResFnB(void *ida_mem, int which,
+                                         IDAResFn jtimesResFn);
+
 
 /* Undocumented Optional Output Functions For Backward Problems */
 
diff --git a/include/kinsol/kinsol.h b/include/kinsol/kinsol.h
index 10ac581c7c..142ec0495f 100644
--- a/include/kinsol/kinsol.h
+++ b/include/kinsol/kinsol.h
@@ -72,7 +72,7 @@ extern "C" {
  * User-Supplied Function Types
  * ------------------------------ */
 
-typedef int (*KINSysFn)(N_Vector uu, N_Vector fval, void *user_data );
+typedef int (*KINSysFn)(N_Vector uu, N_Vector fval, void *user_data);
 
 typedef void (*KINErrHandlerFn)(int error_code,
                                 const char *module, const char *function,
@@ -142,6 +142,8 @@ SUNDIALS_EXPORT char *KINGetReturnFlagName(long int flag);
 /* Free function */
 SUNDIALS_EXPORT void KINFree(void **kinmem);
 
+/* KINLS interface function that depends on KINSysFn */
+SUNDIALS_EXPORT int KINSetJacTimesVecSysFn(void *kinmem, KINSysFn jtimesSysFn);
 
 #ifdef __cplusplus
 }
diff --git a/include/nvector/cuda/ThreadPartitioning.hpp b/include/nvector/cuda/ThreadPartitioning.hpp
deleted file mode 100644
index 4e830ac51b..0000000000
--- a/include/nvector/cuda/ThreadPartitioning.hpp
+++ /dev/null
@@ -1,386 +0,0 @@
-/*
- * -----------------------------------------------------------------
- * Programmer(s): Slaven Peles @ LLNL
- * -----------------------------------------------------------------
- * SUNDIALS Copyright Start
- * Copyright (c) 2002-2020, Lawrence Livermore National Security
- * and Southern Methodist University.
- * All rights reserved.
- *
- * See the top-level LICENSE and NOTICE files for details.
- *
- * SPDX-License-Identifier: BSD-3-Clause
- * SUNDIALS Copyright End
- * -----------------------------------------------------------------
- */
-
-
-
-#ifndef _THREAD_PARTITIONING_HPP_
-#define _THREAD_PARTITIONING_HPP_
-
-#include <iostream>
-#include <cuda_runtime.h>
-
-#include <sundials/sundials_types.h>
-
-namespace suncudavec
-{
-  
-using SUNAllocFn = void* (*)(size_t);
-using SUNFreeFn  = void (*)(void*);
-
-template<class T, class I>
-class ThreadPartitioning
-{
-
-public:
-  ThreadPartitioning()
-  : block_(1),
-    grid_(1),
-    shMemSize_(0),
-    stream_(0),
-    bufferSize_(0),
-    allocfn_(nullptr),
-    freefn_(nullptr),
-    d_buffer_(nullptr),
-    h_buffer_(nullptr),
-    ownBuffer_(true)
-  {}
-
-  ThreadPartitioning(unsigned block,
-                     SUNAllocFn allocfn = nullptr,
-                     SUNFreeFn freefn = nullptr)
-  : block_(block),
-    grid_(1),
-    shMemSize_(0),
-    stream_(0),
-    bufferSize_(0),
-    allocfn_(allocfn),
-    freefn_(freefn),
-    d_buffer_(nullptr),
-    h_buffer_(nullptr),
-    ownBuffer_(true)
-  {}
-  
-  explicit ThreadPartitioning(ThreadPartitioning<T, I>& p)
-  : block_(p.block_),
-    grid_(p.grid_),
-    shMemSize_(p.shMemSize_),
-    stream_(p.stream_),
-    allocfn_(p.allocfn_),
-    freefn_(p.freefn_)
-  {}
-
-  virtual ~ThreadPartitioning(){}
-
-  unsigned grid() const
-  {
-    return grid_;
-  }
-
-  unsigned block() const
-  {
-    return block_;
-  }
-
-  unsigned shmem() const
-  {
-    return shMemSize_;
-  }
-
-  cudaStream_t stream() const
-  {
-    return stream_;
-  }
-
-  unsigned int bufferSize()
-  {
-    return bufferSize_;
-  }
-
-  T* devBuffer()
-  {
-    return d_buffer_;
-  }
-
-  const T* devBuffer() const
-  {
-    return d_buffer_;
-  }
-
-  T* hostBuffer()
-  {
-    return h_buffer_;
-  }
-
-  const T* hostBuffer() const
-  {
-    return h_buffer_;
-  }
-
-  void setStream(const cudaStream_t& stream)
-  {
-    stream_ = stream;
-  }
-
-  virtual void copyFromDevBuffer(unsigned int n) const
-  {
-    std::cerr << "Trying to copy buffer from base class in "
-              << "suncudavec::ThreadPartitioning::copyFromDevBuffer\n";
-  }
-
-  /* pure virtual functions to get the relevant partitioning information */
-  virtual int calcPartitioning(I N, unsigned& grid, unsigned& block, unsigned& shMemSize, cudaStream_t& stream) = 0;
-  virtual int calcPartitioning(I N, unsigned& grid, unsigned& block, unsigned& shMemSize) = 0;
-  
-protected:
-  unsigned block_;
-  unsigned grid_;
-  unsigned shMemSize_;
-  unsigned bufferSize_;
-  cudaStream_t stream_;
-  T* d_buffer_;
-  T* h_buffer_;
-  bool ownBuffer_;
-
-  /* custom allocators for the internal buffers */
-  SUNAllocFn allocfn_;
-  SUNFreeFn freefn_;
-
-}; // class ThreadPartitioning
-
-
-
-template<class T, class I>
-class StreamPartitioning : public ThreadPartitioning<T, I>
-{
-  using ThreadPartitioning<T, I>::block_;
-  using ThreadPartitioning<T, I>::grid_;
-  using ThreadPartitioning<T, I>::stream_;
-
-public:
-  StreamPartitioning(I N, unsigned block, cudaStream_t stream)
-  : ThreadPartitioning<T, I>(block)
-  {
-    grid_ = (N + block_ - 1) / block_;
-    stream_ = stream;
-  }
-  
-  StreamPartitioning(I N, unsigned block)
-  : ThreadPartitioning<T, I>(block)
-  {
-    grid_ = (N + block_ - 1) / block_;
-  }
-
-  explicit StreamPartitioning(StreamPartitioning<T, I>& p)
-  : ThreadPartitioning<T, I>(p)
-  {
-  }
-
-  virtual int calcPartitioning(I N, unsigned& grid, unsigned& block, unsigned& shMemSize,
-                               cudaStream_t& stream)
-  {
-    block = block_;
-    grid  = (N + block_ - 1) / block_;
-    shMemSize = 0;
-    stream = stream_;
-
-    return 0;
-  }
-  
-  virtual int calcPartitioning(I N, unsigned& grid, unsigned& block, unsigned& shMemSize)
-  {
-    block = block_;
-    grid  = (N + block_ - 1) / block_;
-    shMemSize = 0;
-
-    return 0;
-  }
-
-}; // class StreamPartitioning
-
-
-template<class T, class I=int>
-class ReducePartitioning : public ThreadPartitioning<T, I>
-{
-  using ThreadPartitioning<T, I>::block_;
-  using ThreadPartitioning<T, I>::grid_;
-  using ThreadPartitioning<T, I>::shMemSize_;
-  using ThreadPartitioning<T, I>::stream_;
-  using ThreadPartitioning<T, I>::bufferSize_;
-  using ThreadPartitioning<T, I>::d_buffer_;
-  using ThreadPartitioning<T, I>::h_buffer_;
-  using ThreadPartitioning<T, I>::ownBuffer_;
-  using ThreadPartitioning<T, I>::allocfn_;
-  using ThreadPartitioning<T, I>::freefn_;
-
-public:
-  ReducePartitioning(I N, unsigned block,
-                     SUNAllocFn allocfn = nullptr, SUNFreeFn freefn = nullptr)
-  : ThreadPartitioning<T, I>(block, allocfn, freefn)
-  {
-    grid_ = (N + (block_ * 2 - 1)) / (block_ * 2);
-    shMemSize_ = block_*sizeof(T);
-    allocateBuffer(false, allocfn != nullptr);
-  }
-  
-  ReducePartitioning(I N, unsigned block, cudaStream_t stream,
-                     SUNAllocFn allocfn = nullptr, SUNFreeFn freefn = nullptr)
-  : ThreadPartitioning<T, I>(block, allocfn, freefn)
-  {
-    grid_ = (N + (block_ * 2 - 1)) / (block_ * 2);
-    shMemSize_ = block_*sizeof(T);
-    stream_ = stream;
-    allocateBuffer(false, allocfn != nullptr);
-  }
-  
-  ReducePartitioning(T *h_buffer, T *d_buffer, I N, unsigned block, cudaStream_t stream = 0)
-  : ThreadPartitioning<T, I>(block)
-  {
-    grid_ = (N + (block_ * 2 - 1)) / (block_ * 2);
-    shMemSize_ = block_*sizeof(T);
-    stream_ = stream;
-    h_buffer_ = h_buffer;
-    d_buffer_ = d_buffer;
-    ownBuffer_ = false;
-  }
-  
-  explicit ReducePartitioning(ReducePartitioning<T, I>& p)
-  : ThreadPartitioning<T, I>(p)
-  {
-    shMemSize_ = p.shMemSize_;
-    /* if device buffer and host buffer are the same, then assume managed memory */
-    allocateBuffer(p.d_buffer_ == p.h_buffer_, p.allocfn_ != nullptr);
-  }
-
-  ~ReducePartitioning()
-  {
-    cudaError_t err;
-
-    if (ownBuffer_ && bufferSize_ > 0) {
-
-      if (d_buffer_ == h_buffer_) {
-        /* managed memory */
-        if (freefn_) {
-          freefn_(d_buffer_);
-        } else {
-          err = cudaFree(d_buffer_);
-          if(err != cudaSuccess)
-            std::cerr << "Failed to free device vector "
-                      << "in suncudavec::ReducePartitioning::~ReducePartitioning "
-                      << "(CUDA error code " << err << ")\n";
-        }
-        d_buffer_ = h_buffer_ = nullptr;
-      } else {
-        /* unmanaged memory */
-        err = cudaFree(d_buffer_);
-        if(err != cudaSuccess)
-          std::cerr << "Failed to free device vector "
-                    << "in suncudavec::ReducePartitioning::~ReducePartitioning "
-                    << "(CUDA error code " << err << ")\n";
-        free(h_buffer_);
-        d_buffer_ = nullptr;
-        h_buffer_ = nullptr;
-      }
-
-    }
-  }
-
-  virtual int calcPartitioning(I N, unsigned& grid, unsigned& block, unsigned& shMemSize,
-                               cudaStream_t& stream)
-  {
-    block = block_;
-    grid  = (N + (block_ * 2 - 1)) / (block_ * 2);
-    shMemSize = block_ * sizeof(T);
-    stream = stream_;
-
-    return 0;
-  }
-  
-  virtual int calcPartitioning(I N, unsigned& grid, unsigned& block, unsigned& shMemSize)
-  {
-    block = block_;
-    grid  = (N + (block_ * 2 - 1)) / (block_ * 2);
-    shMemSize = block_ * sizeof(T);
-
-    return 0;
-  }
-
-  virtual void copyFromDevBuffer(unsigned int n) const
-  {
-    cudaError_t err;
-
-    /* If the host and device pointers are the same, then we don't need
-       to do a copy (this happens in the managed memory case), but we
-       still need to synchronize the device to adhere to the unified
-       memory access rules. */
-    if (h_buffer_ == d_buffer_) { 
-      err = cudaStreamSynchronize(stream_);
-      if(err != cudaSuccess)
-        std::cerr << "Failed to synchronize stream in "
-                  << "suncudavec::ReducePartitioning::copyFromDevBuffer " 
-                  << "(CUDA error code " << err << ")\n";
-    } else {
-      err = cudaMemcpyAsync(h_buffer_, d_buffer_, n*sizeof(T), cudaMemcpyDeviceToHost,
-                            stream_);
-      if(err != cudaSuccess)
-        std::cerr << "Failed to copy vector from device to host in "
-                  << "suncudavec::ReducePartitioning::copyFromDevBuffer " 
-                  << "(CUDA error code " << err << ")\n";
-    }
-  }
-
-  static unsigned calcBufferSize(I N, unsigned block)
-  {
-    return (N + (block * 2 - 1)) / (block * 2) * sizeof(T);
-  }
-
-private:
-  int allocateBuffer(bool use_managed_memory = false, bool custom_allocator = false)
-  {
-    cudaError_t err;
-
-    bufferSize_ = grid_ * sizeof(T);
-    if (bufferSize_ == 0) return 0;
-
-    if (custom_allocator) {
-
-      d_buffer_ = static_cast<T*>(allocfn_(bufferSize_));
-      if(d_buffer_ == NULL)
-        std::cerr << "Failed to allocate managed buffer with custom allocator in "
-                  << "suncudavec::ReducePartitioning::allocateBuffer\n";
-      h_buffer_ = d_buffer_;
-
-    } else if (use_managed_memory) {
-
-      err = cudaMallocManaged((void**) &d_buffer_, bufferSize_);
-      if(err != cudaSuccess)
-        std::cerr << "Failed to allocate internal managed buffer in "
-                  << "suncudavec::ReducePartitioning::allocateBuffer "
-                  << "(CUDA error code " << err << ")\n";
-      h_buffer_ = d_buffer_;
-
-    } else {
-
-      h_buffer_ = static_cast<T*>(malloc(bufferSize_));
-      if(h_buffer_ == NULL)
-        std::cerr << "Failed to allocate internal host buffer in "
-                  << "suncudavec::ReducePartitioning::allocateBuffer\n";
-      err = cudaMalloc((void**) &d_buffer_, bufferSize_);
-      if(err != cudaSuccess)
-        std::cerr << "Failed to allocate internal device buffer "
-                  << "in suncudavec::ReducePartitioning::allocateBuffer "
-                  << "(CUDA error code " << err << ")\n";
-
-    }
-
-    return 0;
-  }
-
-}; // class ReducePartitioning
-
-
-} // namespace suncudavec
-
-#endif // _THREAD_PARTITIONING_HPP_
diff --git a/include/nvector/cuda/Vector.hpp b/include/nvector/cuda/Vector.hpp
deleted file mode 100644
index 2d3f9aaa99..0000000000
--- a/include/nvector/cuda/Vector.hpp
+++ /dev/null
@@ -1,371 +0,0 @@
-/*
- * -----------------------------------------------------------------
- * Programmer(s): Slaven Peles, and Cody J. Balos @ LLNL
- * -----------------------------------------------------------------
- * SUNDIALS Copyright Start
- * Copyright (c) 2002-2020, Lawrence Livermore National Security
- * and Southern Methodist University.
- * All rights reserved.
- *
- * See the top-level LICENSE and NOTICE files for details.
- *
- * SPDX-License-Identifier: BSD-3-Clause
- * SUNDIALS Copyright End
- * -----------------------------------------------------------------
- */
-
-
-/*
- * Vector class
- *
- * Manages vector data layout for CUDA implementation of N_Vector.
- *
- */
-
-#ifndef _NVECTOR_CUDA_HPP_
-#define _NVECTOR_CUDA_HPP_
-
-#include <cstdlib>
-#include <iostream>
-
-#include <cuda_runtime.h>
-#include "ThreadPartitioning.hpp"
-
-#include <nvector/nvector_cuda.h>
-#include <sundials/sundials_config.h>
-
-namespace suncudavec
-{
-
-template <typename T, typename I>
-class Vector : public _N_VectorContent_Cuda
-{
-
-public:
-  Vector(I N,
-         bool use_managed_memory = false, bool allocate_data = true,
-         T* const h_vec = nullptr, T* const d_vec = nullptr)
-  : size_(N),
-    mem_size_(N*sizeof(T)),
-    ownPartitioning_(true),
-    ownData_(allocate_data),
-    managed_mem_(use_managed_memory),
-    allocfn_(nullptr),
-    freefn_(nullptr),
-    h_vec_(h_vec),
-    d_vec_(d_vec)
-  {
-    // Set partitioning
-    partStream_ = new StreamPartitioning<T, I>(N, 256);
-    partReduce_ = new ReducePartitioning<T, I>(N, 256);
-
-    // Allocate data arrays
-    if (allocate_data)
-      allocate();
-  }
-
-  Vector(I N, cudaStream_t stream,
-         bool use_managed_memory = false, bool allocate_data = true,
-         T* const h_vec = nullptr, T* const d_vec = nullptr)
-  : size_(N),
-    mem_size_(N*sizeof(T)),
-    ownPartitioning_(true),
-    ownData_(allocate_data),
-    managed_mem_(use_managed_memory),
-    allocfn_(nullptr),
-    freefn_(nullptr),
-    h_vec_(h_vec),
-    d_vec_(d_vec)
-  {
-    // Set partitioning
-    partStream_ = new StreamPartitioning<T, I>(N, 256, stream);
-    partReduce_ = new ReducePartitioning<T, I>(N, 256, stream);
-
-    // Allocate data arrays
-    if (allocate_data)
-      allocate();
-  }
-
-  Vector(I N,
-         SUNAllocFn allocfn, SUNFreeFn freefn,
-         bool allocate_data = true)
-  : size_(N),
-    mem_size_(N*sizeof(T)),
-    ownPartitioning_(true),
-    ownData_(allocate_data),
-    managed_mem_(true),
-    allocfn_(allocfn),
-    freefn_(freefn),
-    h_vec_(nullptr),
-    d_vec_(nullptr)
-  {
-    // Set partitioning
-    partStream_ = new StreamPartitioning<T, I>(N, 256);
-    partReduce_ = new ReducePartitioning<T, I>(N, 256, allocfn, freefn);
-
-    // Allocate data arrays
-    if (allocate_data)
-      allocate();
-  }
-
-  Vector(I N, cudaStream_t stream,
-         SUNAllocFn allocfn, SUNFreeFn freefn,
-         bool allocate_data = true)
-  : size_(N),
-    mem_size_(N*sizeof(T)),
-    ownPartitioning_(true),
-    ownData_(allocate_data),
-    managed_mem_(true),
-    allocfn_(allocfn),
-    freefn_(freefn),
-    h_vec_(nullptr),
-    d_vec_(nullptr)
-  {
-    // Set partitioning
-    partStream_ = new StreamPartitioning<T, I>(N, 256, stream);
-    partReduce_ = new ReducePartitioning<T, I>(N, 256, stream, allocfn, freefn);
-
-    // Allocate data arrays
-    if (allocate_data)
-      allocate();
-  }
-
-  // Copy constructor does not copy data array values
-  explicit Vector(const Vector& v)
-  : size_(v.size()),
-    mem_size_(size_*sizeof(T)),
-    partStream_(v.partStream_),
-    partReduce_(v.partReduce_),
-    ownPartitioning_(false),
-    ownData_(true),
-    managed_mem_(v.managed_mem_),
-    allocfn_(v.allocfn_),
-    freefn_(v.freefn_),
-    h_vec_(nullptr),
-    d_vec_(nullptr)
-  {
-    allocate();
-  }
-
-  ~Vector()
-  {
-    cudaError_t err;
-
-    if (ownPartitioning_) {
-      delete partReduce_;
-      delete partStream_;
-    }
-
-    if (ownData_) {
-      if (freefn_) {
-        freefn_(d_vec_);
-        d_vec_ = nullptr;
-        h_vec_ = nullptr;
-      } else {
-        if (!managed_mem_)
-          free(h_vec_);
-        err = cudaFree(d_vec_);
-        if(err != cudaSuccess)
-          std::cerr << "Failed to free device vector "
-                    << "in suncudavec::Vector::~Vector "
-                    << "(error code " << err << ")\n";
-        d_vec_ = nullptr;
-        h_vec_ = nullptr;
-      }
-    }
-  }
-
-  void allocate()
-  {
-    if (allocfn_) {
-      allocateCustom();
-    } else if (managed_mem_) {
-      allocateManaged();
-    } else {
-      allocateUnmanaged();
-    }
-  }
-
-  void allocateManaged()
-  {
-    cudaError_t err;
-    err = cudaMallocManaged((void**) &d_vec_, mem_size_);
-    if (err != cudaSuccess)
-      std::cerr << "Failed to allocate managed vector "
-                << "in suncudavec::Vector::allocateManaged "
-                << "(error code " << err << ")\n";
-    h_vec_ = d_vec_;
-  }
-
-  void allocateUnmanaged()
-  {
-    cudaError_t err;
-
-    h_vec_ = static_cast<T*>(malloc(mem_size_));
-    if(h_vec_ == nullptr)
-      std::cerr << "Failed to allocate host vector "
-                << "in suncudavec::Vector::allocateUnmanaged\n";
-
-    err = cudaMalloc((void**) &d_vec_, mem_size_);
-    if(err != cudaSuccess)
-      std::cerr << "Failed to allocate device vector "
-                << "in suncudavec::Vector::allocateUnmanaged "
-                << "(error code " << err << ")\n";
-  }
-
-  void allocateCustom()
-  {
-    /* We assume managed memory when a custom allocator is provided */
-    d_vec_ = (realtype *) allocfn_(mem_size_);
-    if (d_vec_ == nullptr)
-      std::cerr << "Failed to allocate vector with user-provied allocator "
-                << "in suncudavec::Vector::allocateCustom()\n";
-    h_vec_ = d_vec_;
-  }
-
-  int size() const
-  {
-    return size_;
-  }
-
-  T* host()
-  {
-    // If the vector is using managed memory, and a user
-    // is accessing a data array, then we need to synchronzie
-    // to ensure all kernels have completed since a memcpy
-    // won't have to happen.
-    if (managed_mem_)
-      cudaStreamSynchronize(partReduce_->stream());
-    return h_vec_;
-  }
-
-  const T* host() const
-  {
-    // If the vector is using managed memory, and a user
-    // is accessing a data array, then we need to synchronzie
-    // to ensure all kernels have completed since a memcpy
-    // won't have to happen.
-    if (managed_mem_)
-      cudaStreamSynchronize(partReduce_->stream());
-    return h_vec_;
-  }
-
-  T* device()
-  {
-    // If the vector is using managed memory, and a user
-    // is accessing a data array, then we need to synchronzie
-    // to ensure all kernels have completed since a memcpy
-    // won't have to happen.
-    if (managed_mem_)
-      cudaStreamSynchronize(partReduce_->stream());
-    return d_vec_;
-  }
-
-  const T* device() const
-  {
-    // If the vector is using managed memory, and a user
-    // is accessing a data array, then we need to synchronzie
-    // to ensure all kernels have completed since a memcpy
-    // won't have to happen.
-    if (managed_mem_)
-      cudaStreamSynchronize(partReduce_->stream());
-    return d_vec_;
-  }
-
-  bool isManaged() const
-  {
-    return managed_mem_;
-  }
-
-  void copyToDev()
-  {
-    cudaError_t err;
-
-    /* If the host and device pointers are the same, then we don't need
-       to do a copy (this happens in the managed memory case), but we
-       still need to synchronize the device to adhere to the unified
-       memory access rules. */
-    if (h_vec_ == d_vec_) {
-      err = cudaStreamSynchronize(partReduce_->stream());
-      if(err != cudaSuccess)
-        std::cerr << "Failed to synchronize stream in "
-                  << "suncudavec::Vector::copyToDev "
-                  << "(error code " << err << ")\n";
-    } else {
-      err = cudaMemcpyAsync(d_vec_, h_vec_, mem_size_, cudaMemcpyHostToDevice,
-                            partReduce_->stream());
-      if(err != cudaSuccess)
-        std::cerr << "Failed to copy vector from host to device in "
-                  << "suncudavec::Vector::copyToDev "
-                  << "(error code " << err << ")\n";
-    }
-  }
-
-  void copyFromDev()
-  {
-    cudaError_t err;
-
-    /* If the host and device pointers are the same, then we don't need
-       to do a copy (this happens in the managed memory case), but we
-       still need to synchronize the device to adhere to the unified
-       memory access rules. */
-    if (h_vec_ == d_vec_) {
-      err = cudaStreamSynchronize(partReduce_->stream());
-      if(err != cudaSuccess)
-        std::cerr << "Failed to synchronize stream in "
-                  << "suncudavec::Vector::copyFromDev "
-                  << "(error code " << err << ")\n";
-    } else {
-      err = cudaMemcpyAsync(h_vec_, d_vec_, mem_size_, cudaMemcpyDeviceToHost,
-                            partReduce_->stream());
-      if(err != cudaSuccess)
-        std::cerr << "Failed to copy vector from device to host in "
-                  << "suncudavec::Vector::copyFromDev "
-                  << "(error code " << err << ")\n";
-    }
-  }
-
-  void setPartitioning(ThreadPartitioning<T, I>* stream, ThreadPartitioning<T, I>* reduce)
-  {
-     if (ownPartitioning_) {
-       delete partStream_;
-       delete partReduce_;
-     }
-    partStream_ = stream;
-    partReduce_ = reduce;
-    ownPartitioning_ = false;
-  }
-
-  ThreadPartitioning<T, I>& partStream() const
-  {
-    return *partStream_;
-  }
-
-  ThreadPartitioning<T, I>& partReduce() const
-  {
-    return *partReduce_;
-  }
-
-
-private:
-  I size_;
-  I mem_size_;
-  T* h_vec_;
-  T* d_vec_;
-  ThreadPartitioning<T, I>* partStream_;
-  ThreadPartitioning<T, I>* partReduce_;
-  bool ownPartitioning_;
-  bool ownData_;
-  bool managed_mem_;
-  SUNAllocFn allocfn_;
-  SUNFreeFn freefn_;
-
-};
-
-
-} // namespace suncudavec
-
-
-
-
-#endif // _NVECTOR_CUDA_HPP_
diff --git a/include/nvector/cuda/VectorArrayKernels.cuh b/include/nvector/cuda/VectorArrayKernels.cuh
deleted file mode 100644
index 61e0c56e94..0000000000
--- a/include/nvector/cuda/VectorArrayKernels.cuh
+++ /dev/null
@@ -1,1069 +0,0 @@
-/*
- * -----------------------------------------------------------------
- * Programmer(s): David Gardner @ LLNL
- * -----------------------------------------------------------------
- * SUNDIALS Copyright Start
- * Copyright (c) 2002-2020, Lawrence Livermore National Security
- * and Southern Methodist University.
- * All rights reserved.
- *
- * See the top-level LICENSE and NOTICE files for details.
- *
- * SPDX-License-Identifier: BSD-3-Clause
- * SUNDIALS Copyright End
- * -----------------------------------------------------------------
- */
-
-
-#ifndef _VECTOR_ARRAY_KERNELS_CUH_
-#define _VECTOR_ARRAY_KERNELS_CUH_
-
-#include <limits>
-#include <cuda_runtime.h>
-
-
-namespace suncudavec
-{
-
-
-/* -----------------------------------------------------------------
- * The namespace for CUDA kernels
- *
- * Reduction CUDA kernels in nvector are based in part on "reduction"
- * example in NVIDIA Corporation CUDA Samples, and parallel reduction
- * examples in textbook by J. Cheng at al. "CUDA C Programming".
- * -----------------------------------------------------------------
- */
-namespace math_kernels
-{
-
-
-/*
- * -----------------------------------------------------------------------------
- * fused vector operation kernels
- * -----------------------------------------------------------------------------
- */
-
-/*
- * Computes the linear combination of nv vectors
- */
-template <typename T, typename I>
-__global__ void
-linearCombinationKernel(int nv, T* c, T** xd, T* zd, I n)
-{
-  I i = blockDim.x * blockIdx.x + threadIdx.x;
-
-  if (i < n) {
-    zd[i] = c[0]*xd[0][i];
-    for (int j=1; j<nv; j++)
-      zd[i] += c[j]*xd[j][i];
-  }
-}
-
-/*
- * Computes the scaled sum of one vector with nv other vectors
- */
-template <typename T, typename I>
-__global__ void
-scaleAddMultiKernel(int nv, T* c, T* xd, T** yd, T** zd, I n)
-{
-  I i = blockDim.x * blockIdx.x + threadIdx.x;
-
-  if (i < n)
-    for (int j=0; j<nv; j++)
-      zd[j][i] = c[j] * xd[i] + yd[j][i];
-}
-
-
-/*
- * Dot product of one vector with nv other vectors.
- *
- */
-template <typename T, typename I>
-__global__ void
-dotProdMultiKernel(int nv, T* xd, T** yd, T* out, I n)
-{
-  extern __shared__ T shmem[];
-
-  I tid = threadIdx.x;
-  I i = blockIdx.x*(blockDim.x*2) + threadIdx.x;
-
-  // Initialize shared memory to zero
-  for (int k=0; k<nv; k++)
-    shmem[tid + k*blockDim.x] = 0.0;
-
-  // First reduction step before storing data in shared memory.
-  if (i < n)
-    for (int k=0; k<nv; k++)
-      shmem[tid + k*blockDim.x] = xd[i] * yd[k][i];
-  if (i + blockDim.x < n)
-    for (int k=0; k<nv; k++)
-      shmem[tid + k*blockDim.x] += (xd[i + blockDim.x] * yd[k][i + blockDim.x]);
-
-  __syncthreads();
-
-  // Perform blockwise reduction in shared memory
-  for (I j = blockDim.x/2; j > 0; j >>= 1) {
-    if (tid < j)
-      for (int k=0; k<nv; k++)
-        shmem[tid + k*blockDim.x] += shmem[tid + j + k*blockDim.x];
-
-    __syncthreads();
-  }
-
-  // Copy reduction result for each block to global memory
-  if (tid == 0)
-    for (int k=0; k<nv; k++)
-      out[blockIdx.x + k*gridDim.x] = shmem[k*blockDim.x];
-}
-
-
-/*
- * Sums all elements of the vector.
- *
- */
-template <typename T, typename I>
-__global__ void
-sumReduceVectorKernel(int nv, T* x, T* out, I n)
-{
-  extern __shared__ T shmem[];
-
-  I tid = threadIdx.x;
-  I i = blockIdx.x*(blockDim.x*2) + threadIdx.x;
-
-  // First reduction step before storing data in shared memory.
-  if (i < n)
-    for (int k=0; k<nv; k++)
-      shmem[tid + k*blockDim.x] = x[i];
-  if (i + blockDim.x < n)
-    for (int k=0; k<nv; k++)
-      shmem[tid + k*blockDim.x] += x[i+blockDim.x];
-
-  __syncthreads();
-
-  // Perform reduction block-wise in shared memory.
-  for (I j = blockDim.x/2; j > 0; j >>= 1) {
-    if (tid < j)
-      for (int k=0; k<nv; k++)
-        shmem[tid + k*blockDim.x] += shmem[tid + j + k*blockDim.x];
-
-    __syncthreads();
-  }
-
-  // Copy reduction result for each block to global memory
-  if (tid == 0)
-    for (int k=0; k<nv; k++)
-      out[blockIdx.x + k*gridDim.x] = shmem[k*blockDim.x];
-}
-
-
-
-/*
- * -----------------------------------------------------------------------------
- * vector array operation kernels
- * -----------------------------------------------------------------------------
- */
-
-/*
- * Computes the linear sum of multiple vectors
- */
-template <typename T, typename I>
-__global__ void
-linearSumVectorArrayKernel(int nv, T a, T** xd, T b, T** yd, T** zd, I n)
-{
-  I i = blockDim.x * blockIdx.x + threadIdx.x;
-
-  if (i < n)
-    for (int j=0; j<nv; j++)
-      zd[j][i] = a * xd[j][i] + b * yd[j][i];
-}
-
-
-/*
- * Scales multiple vectors
- */
-template <typename T, typename I>
-__global__ void
-scaleVectorArrayKernel(int nv, T* c, T** xd, T** zd, I n)
-{
-  I i = blockDim.x * blockIdx.x + threadIdx.x;
-
-  if (i < n)
-    for (int j=0; j<nv; j++)
-      zd[j][i] = c[j] * xd[j][i];
-}
-
-
-/*
- * Sets multiple vectors equal to a constant
- */
-template <typename T, typename I>
-__global__ void
-constVectorArrayKernel(int nv, T c, T** zd, I n)
-{
-  I i = blockDim.x * blockIdx.x + threadIdx.x;
-
-  if (i < n)
-    for (int j=0; j<nv; j++)
-      zd[j][i] = c;
-}
-
-
-/*
- * WRMS norm of nv vectors.
- *
- */
-template <typename T, typename I>
-__global__ void
-wL2NormSquareVectorArrayKernel(int nv, T** xd, T** wd, T* out, I n)
-{
-  extern __shared__ T shmem[];
-
-  I tid = threadIdx.x;
-  I i = blockIdx.x*(blockDim.x*2) + threadIdx.x;
-
-  // Initialize shared memory to zero
-  for (int k=0; k<nv; k++)
-    shmem[tid + k*blockDim.x] = 0.0;
-
-  // First reduction step before storing data in shared memory.
-  if (i < n)
-    for (int k=0; k<nv; k++)
-      shmem[tid + k*blockDim.x] = xd[k][i] * wd[k][i] * xd[k][i] * wd[k][i];
-  if (i + blockDim.x < n)
-    for (int k=0; k<nv; k++)
-      shmem[tid + k*blockDim.x] += (xd[k][i + blockDim.x] * wd[k][i + blockDim.x]
-                                    * xd[k][i + blockDim.x] * wd[k][i + blockDim.x]);
-
-  __syncthreads();
-
-  // Perform blockwise reduction in shared memory
-  for (I j = blockDim.x/2; j > 0; j >>= 1) {
-    if (tid < j)
-      for (int k=0; k<nv; k++)
-        shmem[tid + k*blockDim.x] += shmem[tid + j + k*blockDim.x];
-
-    __syncthreads();
-  }
-
-  // Copy reduction result for each block to global memory
-  if (tid == 0)
-    for (int k=0; k<nv; k++)
-      out[blockIdx.x + k*gridDim.x] = shmem[k*blockDim.x];
-}
-
-
-/*
- * Masked WRMS norm of nv vectors.
- *
- */
-template <typename T, typename I>
-__global__ void
-wL2NormSquareMaskVectorArrayKernel(int nv, T** xd, T** wd, T* id, T* out, I n)
-{
-  extern __shared__ T shmem[];
-
-  I tid = threadIdx.x;
-  I i = blockIdx.x*(blockDim.x*2) + threadIdx.x;
-
-  // Initialize shared memory to zero
-  for (int k=0; k<nv; k++)
-    shmem[tid + k*blockDim.x] = 0.0;
-
-  // First reduction step before storing data in shared memory.
-  if (i < n && id[i] > 0.0)
-    for (int k=0; k<nv; k++)
-      shmem[tid + k*blockDim.x] = xd[k][i] * wd[k][i] * xd[k][i] * wd[k][i];
-  if (i + blockDim.x < n && id[i + blockDim.x] > 0.0)
-    for (int k=0; k<nv; k++)
-      shmem[tid + k*blockDim.x] += (xd[k][i + blockDim.x] * wd[k][i + blockDim.x]
-                                    * xd[k][i + blockDim.x] * wd[k][i + blockDim.x]);
-
-  __syncthreads();
-
-  // Perform blockwise reduction in shared memory
-  for (I j = blockDim.x/2; j > 0; j >>= 1) {
-    if (tid < j)
-      for (int k=0; k<nv; k++)
-        shmem[tid + k*blockDim.x] += shmem[tid + j + k*blockDim.x];
-
-    __syncthreads();
-  }
-
-  // Copy reduction result for each block to global memory
-  if (tid == 0)
-    for (int k=0; k<nv; k++)
-      out[blockIdx.x + k*gridDim.x] = shmem[k*blockDim.x];
-}
-
-
-/*
- * Computes the scaled sum of a vector array with multiple other vector arrays
- */
-template <typename T, typename I>
-__global__ void
-scaleAddMultiVectorArrayKernel(int nv, int ns, T* c, T** xd, T** yd, T** zd, I n)
-{
-  I i = blockDim.x * blockIdx.x + threadIdx.x;
-
-  if (i < n)
-    for (int k=0; k<nv; k++)
-      for (int j=0; j<ns; j++)
-        zd[k*ns+j][i] = c[j] * xd[k][i] + yd[k*ns+j][i];
-}
-
-
-/*
- * Computes the scaled sum of a vector array with multiple other vector arrays
- */
-template <typename T, typename I>
-__global__ void
-linearCombinationVectorArrayKernel(int nv, int ns, T* c, T** xd, T** zd, I n)
-{
-  I i = blockDim.x * blockIdx.x + threadIdx.x;
-
-  if (i < n) {
-    for (int k=0; k<nv; k++) {
-      zd[k][i] = c[0]*xd[k*ns][i];
-      for (int j=1; j<ns; j++) {
-        zd[k][i] += c[j]*xd[k*ns+j][i];
-      }
-    }
-  }
-}
-
-} // namespace math_kernels
-
-
-
-
-
-
-/*
- * -----------------------------------------------------------------------------
- * fused vector operations
- * -----------------------------------------------------------------------------
- */
-
-template <typename T, typename I>
-inline cudaError_t linearCombination(int nvec, T* c, Vector<T,I>** X, Vector<T,I>* Z)
-{
-  cudaError_t err;
-
-  // Copy c array to device
-  T* d_c;
-  err = cudaMalloc((void**) &d_c, nvec*sizeof(T));
-  if (err != cudaSuccess) return cudaGetLastError();
-  err = cudaMemcpy(d_c, c, nvec*sizeof(T), cudaMemcpyHostToDevice);
-  if (err != cudaSuccess) return cudaGetLastError();
-
-  // Create array of device pointers on host
-  T** h_Xd = new T*[nvec];
-  for (int i=0; i<nvec; i++)
-    h_Xd[i] = X[i]->device();
-
-  // Copy array of device pointers to device from host
-  T** d_Xd;
-  err = cudaMalloc((void**) &d_Xd, nvec*sizeof(T*));
-  if (err != cudaSuccess) return cudaGetLastError();
-  err = cudaMemcpy(d_Xd, h_Xd, nvec*sizeof(T*), cudaMemcpyHostToDevice);
-  if (err != cudaSuccess) return cudaGetLastError();
-
-  // Set partitioning
-  ThreadPartitioning<T, I>& p = X[0]->partStream();
-  const I grid                = p.grid();
-  const unsigned block        = p.block();
-  const cudaStream_t stream   = p.stream();
-
-  math_kernels::linearCombinationKernel<<<grid, block, 0, stream>>>(
-      nvec,
-      d_c,
-      d_Xd,
-      Z->device(),
-      Z->size()
-  );
-
-  // Free host array
-  delete[] h_Xd;
-
-  // Free device arrays
-  err = cudaFree(d_c);
-  if (err != cudaSuccess) return cudaGetLastError();
-  err = cudaFree(d_Xd);
-  if (err != cudaSuccess) return cudaGetLastError();
-
-  return cudaGetLastError();
-}
-
-
-template <typename T, typename I>
-inline cudaError_t scaleAddMulti(int nvec, T* c, Vector<T,I>* X,
-                                 Vector<T,I>** Y, Vector<T,I>** Z)
-{
-  cudaError_t err;
-
-  // Copy c array to device
-  T* d_c;
-  err = cudaMalloc((void**) &d_c, nvec*sizeof(T));
-  if (err != cudaSuccess) return cudaGetLastError();
-  err = cudaMemcpy(d_c, c, nvec*sizeof(T), cudaMemcpyHostToDevice);
-  if (err != cudaSuccess) return cudaGetLastError();
-
-  // Create array of device pointers on host
-  T** h_Yd = new T*[nvec];
-  for (int i=0; i<nvec; i++)
-    h_Yd[i] = Y[i]->device();
-
-  T** h_Zd = new T*[nvec];
-  for (int i=0; i<nvec; i++)
-    h_Zd[i] = Z[i]->device();
-
-  // Copy array of device pointers to device from host
-  T** d_Yd;
-  err = cudaMalloc((void**) &d_Yd, nvec*sizeof(T*));
-  if (err != cudaSuccess) return cudaGetLastError();
-  err = cudaMemcpy(d_Yd, h_Yd, nvec*sizeof(T*), cudaMemcpyHostToDevice);
-  if (err != cudaSuccess) return cudaGetLastError();
-
-  T** d_Zd;
-  err = cudaMalloc((void**) &d_Zd, nvec*sizeof(T*));
-  if (err != cudaSuccess) return cudaGetLastError();
-  err = cudaMemcpy(d_Zd, h_Zd, nvec*sizeof(T*), cudaMemcpyHostToDevice);
-  if (err != cudaSuccess) return cudaGetLastError();
-
-  // Set partitioning
-  ThreadPartitioning<T, I>& p = Z[0]->partStream();
-  const I grid                = p.grid();
-  const unsigned block        = p.block();
-  const cudaStream_t stream   = p.stream();
-
-  math_kernels::scaleAddMultiKernel<<<grid, block, 0, stream>>>(
-      nvec,
-      d_c,
-      X->device(),
-      d_Yd,
-      d_Zd,
-      X->size()
-  );
-
-  // Free host array
-  delete[] h_Yd;
-  delete[] h_Zd;
-
-  // Free device arrays
-  err = cudaFree(d_c);
-  if (err != cudaSuccess) return cudaGetLastError();
-  err = cudaFree(d_Yd);
-  if (err != cudaSuccess) return cudaGetLastError();
-  err = cudaFree(d_Zd);
-  if (err != cudaSuccess) return cudaGetLastError();
-
-  return cudaGetLastError();
-}
-
-
-template <typename T, typename I>
-inline cudaError_t dotProdMulti(int nvec, Vector<T,I>* x, Vector<T,I>** Y,
-                                T* dots)
-{
-  cudaError_t err;
-
-  // Create array of device pointers on host
-  T** h_Yd = new T*[nvec];
-  for (int i=0; i<nvec; i++)
-    h_Yd[i] = Y[i]->device();
-
-  // Copy array of device pointers to device from host
-  T** d_Yd;
-  err = cudaMalloc((void**) &d_Yd, nvec*sizeof(T*));
-  if (err != cudaSuccess) return cudaGetLastError();
-  err = cudaMemcpy(d_Yd, h_Yd, nvec*sizeof(T*), cudaMemcpyHostToDevice);
-  if (err != cudaSuccess) return cudaGetLastError();
-
-  // Set partitioning
-  ThreadPartitioning<T, I>& p = x->partReduce();
-  unsigned grid               = p.grid();
-  unsigned block              = p.block();
-  unsigned shMemSize          = nvec*block*sizeof(T);
-  const cudaStream_t stream   = p.stream();
-
-  // Allocate reduction buffer on device
-  T* d_buff;
-  err = cudaMalloc((void**) &d_buff, nvec*grid*sizeof(T));
-  if (err != cudaSuccess) return cudaGetLastError();
-
-  math_kernels::dotProdMultiKernel<T,I><<<grid, block, shMemSize, stream>>>(
-      nvec,
-      x->device(),
-      d_Yd,
-      d_buff,
-      x->size()
-  );
-
-  unsigned n = grid;
-  unsigned nmax = 2*block;
-  while (n > nmax) {
-
-    // Recompute partitioning
-    grid = (n + block - 1)/block;
-
-    // Rerun reduction kernel
-    math_kernels::sumReduceVectorKernel<T,I><<<grid, block, shMemSize, stream>>>(
-        nvec,
-        d_buff,
-        d_buff,
-        n
-    );
-
-    // update buffer array working length
-    n = grid;
-  }
-
-  // Finish reduction on CPU if there are less than two blocks of data left.
-  T* h_buff = new T[nvec*n*sizeof(T)];
-  err = cudaMemcpy(h_buff, d_buff, nvec*n*sizeof(T), cudaMemcpyDeviceToHost);
-  if (err != cudaSuccess) return cudaGetLastError();
-
-  for (int k=0; k<nvec; k++) {
-    dots[k] = h_buff[k*n];
-    for (unsigned i=1; i<n; i++){
-      dots[k] += h_buff[i + k*n];
-    }
-  }
-
-  // Free host array
-  delete[] h_Yd;
-  delete[] h_buff;
-
-  err = cudaFree(d_Yd);
-  if (err != cudaSuccess) return cudaGetLastError();
-  err = cudaFree(d_buff);
-  if (err != cudaSuccess) return cudaGetLastError();
-
-  return cudaGetLastError();
-}
-
-
-/*
- * -----------------------------------------------------------------------------
- * vector array operations
- * -----------------------------------------------------------------------------
- */
-
-template <typename T, typename I>
-inline cudaError_t linearSumVectorArray(int nvec, T a, Vector<T,I>** X, T b,
-                                        Vector<T,I>** Y, Vector<T,I>** Z)
-{
-  cudaError_t err;
-
-  // Create array of device pointers on host
-  T** h_Xd = new T*[nvec];
-  for (int i=0; i<nvec; i++)
-    h_Xd[i] = X[i]->device();
-
-  T** h_Yd = new T*[nvec];
-  for (int i=0; i<nvec; i++)
-    h_Yd[i] = Y[i]->device();
-
-  T** h_Zd = new T*[nvec];
-  for (int i=0; i<nvec; i++)
-    h_Zd[i] = Z[i]->device();
-
-  // Copy array of device pointers to device from host
-  T** d_Xd;
-  err = cudaMalloc((void**) &d_Xd, nvec*sizeof(T*));
-  if (err != cudaSuccess) return cudaGetLastError();
-  err = cudaMemcpy(d_Xd, h_Xd, nvec*sizeof(T*), cudaMemcpyHostToDevice);
-  if (err != cudaSuccess) return cudaGetLastError();
-
-  T** d_Yd;
-  err = cudaMalloc((void**) &d_Yd, nvec*sizeof(T*));
-  if (err != cudaSuccess) return cudaGetLastError();
-  err = cudaMemcpy(d_Yd, h_Yd, nvec*sizeof(T*), cudaMemcpyHostToDevice);
-  if (err != cudaSuccess) return cudaGetLastError();
-
-  T** d_Zd;
-  err = cudaMalloc((void**) &d_Zd, nvec*sizeof(T*));
-  if (err != cudaSuccess) return cudaGetLastError();
-  err = cudaMemcpy(d_Zd, h_Zd, nvec*sizeof(T*), cudaMemcpyHostToDevice);
-  if (err != cudaSuccess) return cudaGetLastError();
-
-  // Set partitioning
-  ThreadPartitioning<T, I>& p = Z[0]->partStream();
-  const I grid                = p.grid();
-  const unsigned block        = p.block();
-  const cudaStream_t stream   = p.stream();
-
-  math_kernels::linearSumVectorArrayKernel<<<grid, block, 0, stream>>>(
-      nvec,
-      a,
-      d_Xd,
-      b,
-      d_Yd,
-      d_Zd,
-      Z[0]->size()
-  );
-
-  // Free host array
-  delete[] h_Xd;
-  delete[] h_Yd;
-  delete[] h_Zd;
-
-  // Free device arrays
-  err = cudaFree(d_Xd);
-  if (err != cudaSuccess) return cudaGetLastError();
-  err = cudaFree(d_Yd);
-  if (err != cudaSuccess) return cudaGetLastError();
-  err = cudaFree(d_Zd);
-  if (err != cudaSuccess) return cudaGetLastError();
-
-  return cudaGetLastError();
-}
-
-
-template <typename T, typename I>
-inline cudaError_t scaleVectorArray(int nvec, T* c, Vector<T,I>** X,
-                                    Vector<T,I>** Z)
-{
-  cudaError_t err;
-
-  // Copy c array to device
-  T* d_c;
-  err = cudaMalloc((void**) &d_c, nvec*sizeof(T));
-  if (err != cudaSuccess) return cudaGetLastError();
-  err = cudaMemcpy(d_c, c, nvec*sizeof(T), cudaMemcpyHostToDevice);
-  if (err != cudaSuccess) return cudaGetLastError();
-
-  // Create array of device pointers on host
-  T** h_Xd = new T*[nvec];
-  for (int i=0; i<nvec; i++)
-    h_Xd[i] = X[i]->device();
-
-  T** h_Zd = new T*[nvec];
-  for (int i=0; i<nvec; i++)
-    h_Zd[i] = Z[i]->device();
-
-  // Copy array of device pointers to device from host
-  T** d_Xd;
-  err = cudaMalloc((void**) &d_Xd, nvec*sizeof(T*));
-  if (err != cudaSuccess) return cudaGetLastError();
-  err = cudaMemcpy(d_Xd, h_Xd, nvec*sizeof(T*), cudaMemcpyHostToDevice);
-  if (err != cudaSuccess) return cudaGetLastError();
-
-  T** d_Zd;
-  err = cudaMalloc((void**) &d_Zd, nvec*sizeof(T*));
-  if (err != cudaSuccess) return cudaGetLastError();
-  err = cudaMemcpy(d_Zd, h_Zd, nvec*sizeof(T*), cudaMemcpyHostToDevice);
-  if (err != cudaSuccess) return cudaGetLastError();
-
-  // Set partitioning
-  ThreadPartitioning<T, I>& p = Z[0]->partStream();
-  const I grid                = p.grid();
-  const unsigned block        = p.block();
-  const cudaStream_t stream   = p.stream();
-
-  math_kernels::scaleVectorArrayKernel<<<grid, block, 0, stream>>>(
-      nvec,
-      d_c,
-      d_Xd,
-      d_Zd,
-      Z[0]->size()
-  );
-
-  // Free host array
-  delete[] h_Xd;
-  delete[] h_Zd;
-
-  // Free device arrays
-  err = cudaFree(d_Xd);
-  if (err != cudaSuccess) return cudaGetLastError();
-  err = cudaFree(d_Zd);
-  if (err != cudaSuccess) return cudaGetLastError();
-
-  return cudaGetLastError();
-}
-
-
-template <typename T, typename I>
-inline cudaError_t constVectorArray(int nvec, T c, Vector<T,I>** Z)
-{
-  cudaError_t err;
-
-  // Create array of device pointers on host
-  T** h_Zd = new T*[nvec];
-  for (int i=0; i<nvec; i++)
-    h_Zd[i] = Z[i]->device();
-
-  // Copy array of device pointers to device from host
-  T** d_Zd;
-  err = cudaMalloc((void**) &d_Zd, nvec*sizeof(T*));
-  if (err != cudaSuccess) return cudaGetLastError();
-  err = cudaMemcpy(d_Zd, h_Zd, nvec*sizeof(T*), cudaMemcpyHostToDevice);
-  if (err != cudaSuccess) return cudaGetLastError();
-
-  // Set partitioning
-  ThreadPartitioning<T, I>& p = Z[0]->partStream();
-  const I grid                = p.grid();
-  const unsigned block        = p.block();
-  const cudaStream_t stream   = p.stream();
-
-  math_kernels::constVectorArrayKernel<<<grid, block, 0, stream>>>(
-      nvec,
-      c,
-      d_Zd,
-      Z[0]->size()
-  );
-
-  // Free host array
-  delete[] h_Zd;
-
-  // Free device arrays
-  err = cudaFree(d_Zd);
-  if (err != cudaSuccess) return cudaGetLastError();
-
-  return cudaGetLastError();
-}
-
-
-template <typename T, typename I>
-inline cudaError_t wL2NormSquareVectorArray(int nvec, Vector<T,I>** X,
-                                            Vector<T,I>** W, T* nrm)
-{
-  cudaError_t err;
-
-  // Create array of device pointers on host
-  T** h_Xd = new T*[nvec];
-  for (int i=0; i<nvec; i++)
-    h_Xd[i] = X[i]->device();
-
-  T** h_Wd = new T*[nvec];
-  for (int i=0; i<nvec; i++)
-    h_Wd[i] = W[i]->device();
-
-  // Copy array of device pointers to device from host
-  T** d_Xd;
-  err = cudaMalloc((void**) &d_Xd, nvec*sizeof(T*));
-  if (err != cudaSuccess) return cudaGetLastError();
-  err = cudaMemcpy(d_Xd, h_Xd, nvec*sizeof(T*), cudaMemcpyHostToDevice);
-  if (err != cudaSuccess) return cudaGetLastError();
-
-  T** d_Wd;
-  err = cudaMalloc((void**) &d_Wd, nvec*sizeof(T*));
-  if (err != cudaSuccess) return cudaGetLastError();
-  err = cudaMemcpy(d_Wd, h_Wd, nvec*sizeof(T*), cudaMemcpyHostToDevice);
-  if (err != cudaSuccess) return cudaGetLastError();
-
-  // Set partitioning
-  ThreadPartitioning<T, I>& p = X[0]->partReduce();
-  unsigned grid               = p.grid();
-  unsigned block              = p.block();
-  unsigned shMemSize          = nvec*block*sizeof(T);
-  const cudaStream_t stream   = p.stream();
-
-  // Allocate reduction buffer on device
-  T* d_buff;
-  err = cudaMalloc((void**) &d_buff, nvec*grid*sizeof(T));
-  if (err != cudaSuccess) return cudaGetLastError();
-
-  math_kernels::wL2NormSquareVectorArrayKernel<<<grid, block, shMemSize, stream>>>(
-      nvec,
-      d_Xd,
-      d_Wd,
-      d_buff,
-      X[0]->size()
-  );
-
-  unsigned n = grid;
-  unsigned nmax = 2*block;
-  while (n > nmax) {
-
-    // Recompute partitioning
-    grid = (n + block - 1)/block;
-
-    // Rerun reduction kernel
-    math_kernels::sumReduceVectorKernel<T,I><<<grid, block, shMemSize, stream>>>(
-        nvec,
-        d_buff,
-        d_buff,
-        n
-    );
-
-    // update buffer array working length
-    n = grid;
-  }
-
-  // Finish reduction on CPU if there are less than two blocks of data left.
-  T* h_buff = new T[nvec*n*sizeof(T)];
-  err = cudaMemcpy(h_buff, d_buff, nvec*n*sizeof(T), cudaMemcpyDeviceToHost);
-  if (err != cudaSuccess) return cudaGetLastError();
-
-  for (int k=0; k<nvec; k++) {
-    nrm[k] = h_buff[k*n];
-    for (unsigned i=1; i<n; i++){
-      nrm[k] += h_buff[i + k*n];
-    }
-  }
-
-  // Free host array
-  delete[] h_Xd;
-  delete[] h_Wd;
-  delete[] h_buff;
-
-  // Free device arrays
-  err = cudaFree(d_Xd);
-  if (err != cudaSuccess) return cudaGetLastError();
-  err = cudaFree(d_Wd);
-  if (err != cudaSuccess) return cudaGetLastError();
-  err = cudaFree(d_buff);
-  if (err != cudaSuccess) return cudaGetLastError();
-
-  return cudaGetLastError();
-}
-
-
-template <typename T, typename I>
-inline cudaError_t wL2NormSquareMaskVectorArray(int nvec, Vector<T,I>** X,
-                                           Vector<T,I>** W, Vector<T,I>* ID,
-                                           T* nrm)
-{
-  cudaError_t err;
-
-  // Create array of device pointers on host
-  T** h_Xd = new T*[nvec];
-  for (int i=0; i<nvec; i++)
-    h_Xd[i] = X[i]->device();
-
-  T** h_Wd = new T*[nvec];
-  for (int i=0; i<nvec; i++)
-    h_Wd[i] = W[i]->device();
-
-  // Copy array of device pointers to device from host
-  T** d_Xd;
-  err = cudaMalloc((void**) &d_Xd, nvec*sizeof(T*));
-  if (err != cudaSuccess) return cudaGetLastError();
-  err = cudaMemcpy(d_Xd, h_Xd, nvec*sizeof(T*), cudaMemcpyHostToDevice);
-  if (err != cudaSuccess) return cudaGetLastError();
-
-  T** d_Wd;
-  err = cudaMalloc((void**) &d_Wd, nvec*sizeof(T*));
-  if (err != cudaSuccess) return cudaGetLastError();
-  err = cudaMemcpy(d_Wd, h_Wd, nvec*sizeof(T*), cudaMemcpyHostToDevice);
-  if (err != cudaSuccess) return cudaGetLastError();
-
-  // Set partitioning
-  ThreadPartitioning<T, I>& p = X[0]->partReduce();
-  unsigned grid               = p.grid();
-  unsigned block              = p.block();
-  unsigned shMemSize          = nvec*block*sizeof(T);
-  const cudaStream_t stream   = p.stream();
-
-  // Allocate reduction buffer on device
-  T* d_buff;
-  err = cudaMalloc((void**) &d_buff, nvec*grid*sizeof(T));
-  if (err != cudaSuccess) return cudaGetLastError();
-
-  math_kernels::wL2NormSquareMaskVectorArrayKernel<<<grid, block, shMemSize, stream>>>(
-      nvec,
-      d_Xd,
-      d_Wd,
-      ID->device(),
-      d_buff,
-      X[0]->size()
-  );
-
-  unsigned n = grid;
-  unsigned nmax = 2*block;
-  while (n > nmax) {
-
-    // Recompute partitioning
-    grid = (n + block - 1)/block;
-
-    // Rerun reduction kernel
-    math_kernels::sumReduceVectorKernel<T,I><<<grid, block, shMemSize, stream>>>(
-        nvec,
-        d_buff,
-        d_buff,
-        n
-    );
-
-    // update buffer array working length
-    n = grid;
-  }
-
-  // Finish reduction on CPU if there are less than two blocks of data left.
-  T* h_buff = new T[nvec*n*sizeof(T)];
-  err = cudaMemcpy(h_buff, d_buff, nvec*n*sizeof(T), cudaMemcpyDeviceToHost);
-  if (err != cudaSuccess) return cudaGetLastError();
-
-  for (int k=0; k<nvec; k++) {
-    nrm[k] = h_buff[k*n];
-    for (unsigned i=1; i<n; i++){
-      nrm[k] += h_buff[i + k*n];
-    }
-  }
-
-  // Free host array
-  delete[] h_Xd;
-  delete[] h_Wd;
-  delete[] h_buff;
-
-  // Free device arrays
-  err = cudaFree(d_Xd);
-  if (err != cudaSuccess) return cudaGetLastError();
-  err = cudaFree(d_Wd);
-  if (err != cudaSuccess) return cudaGetLastError();
-  err = cudaFree(d_buff);
-  if (err != cudaSuccess) return cudaGetLastError();
-
-  return cudaGetLastError();
-}
-
-
-template <typename T, typename I>
-inline cudaError_t scaleAddMultiVectorArray(int nvec, int nsum, T* c,
-                                            Vector<T,I>** X, Vector<T,I>** Y,
-                                            Vector<T,I>** Z)
-{
-  cudaError_t err;
-
-  // Copy c array to device
-  T* d_c;
-  err = cudaMalloc((void**) &d_c, nsum*sizeof(T));
-  if (err != cudaSuccess) return cudaGetLastError();
-  err = cudaMemcpy(d_c, c, nsum*sizeof(T), cudaMemcpyHostToDevice);
-  if (err != cudaSuccess) return cudaGetLastError();
-
-  // Create array of device pointers on host
-  T** h_Xd = new T*[nvec];
-  for (int i=0; i<nvec; i++)
-    h_Xd[i] = X[i]->device();
-
-  T** h_Yd = new T*[nsum*nvec];
-  for (int i=0; i<nsum*nvec; i++)
-    h_Yd[i] = Y[i]->device();
-
-  T** h_Zd = new T*[nsum*nvec];
-  for (int i=0; i<nsum*nvec; i++)
-    h_Zd[i] = Z[i]->device();
-
-  // Copy array of device pointers to device from host
-  T** d_Xd;
-  err = cudaMalloc((void**) &d_Xd, nvec*sizeof(T*));
-  if (err != cudaSuccess) return cudaGetLastError();
-  err = cudaMemcpy(d_Xd, h_Xd, nvec*sizeof(T*), cudaMemcpyHostToDevice);
-  if (err != cudaSuccess) return cudaGetLastError();
-
-  T** d_Yd;
-  err = cudaMalloc((void**) &d_Yd, nsum*nvec*sizeof(T*));
-  if (err != cudaSuccess) return cudaGetLastError();
-  err = cudaMemcpy(d_Yd, h_Yd, nsum*nvec*sizeof(T*), cudaMemcpyHostToDevice);
-  if (err != cudaSuccess) return cudaGetLastError();
-
-  T** d_Zd;
-  err = cudaMalloc((void**) &d_Zd, nsum*nvec*sizeof(T*));
-  if (err != cudaSuccess) return cudaGetLastError();
-  err = cudaMemcpy(d_Zd, h_Zd, nsum*nvec*sizeof(T*), cudaMemcpyHostToDevice);
-  if (err != cudaSuccess) return cudaGetLastError();
-
-  // Set partitioning
-  ThreadPartitioning<T, I>& p = Z[0]->partStream();
-  const I grid                = p.grid();
-  const unsigned block        = p.block();
-  const cudaStream_t stream   = p.stream();
-
-  math_kernels::scaleAddMultiVectorArrayKernel<<<grid, block, 0, stream>>>(
-      nvec,
-      nsum,
-      d_c,
-      d_Xd,
-      d_Yd,
-      d_Zd,
-      Z[0]->size()
-  );
-
-  // Free host array
-  delete[] h_Xd;
-  delete[] h_Yd;
-  delete[] h_Zd;
-
-  // Free device arrays
-  err = cudaFree(d_Xd);
-  if (err != cudaSuccess) return cudaGetLastError();
-  err = cudaFree(d_Yd);
-  if (err != cudaSuccess) return cudaGetLastError();
-  err = cudaFree(d_Zd);
-  if (err != cudaSuccess) return cudaGetLastError();
-
-  return cudaGetLastError();
-}
-
-
-template <typename T, typename I>
-inline cudaError_t linearCombinationVectorArray(int nvec, int nsum, T* c,
-                                                Vector<T,I>** X, Vector<T,I>** Z)
-{
-  cudaError_t err;
-
-  // Copy c array to device
-  T* d_c;
-  err = cudaMalloc((void**) &d_c, nsum*sizeof(T));
-  if (err != cudaSuccess) return cudaGetLastError();
-  err = cudaMemcpy(d_c, c, nsum*sizeof(T), cudaMemcpyHostToDevice);
-  if (err != cudaSuccess) return cudaGetLastError();
-
-  // Create array of device pointers on host
-  T** h_Xd = new T*[nsum*nvec];
-  for (int i=0; i<nsum*nvec; i++)
-    h_Xd[i] = X[i]->device();
-
-  T** h_Zd = new T*[nvec];
-  for (int i=0; i<nvec; i++)
-    h_Zd[i] = Z[i]->device();
-
-  // Copy array of device pointers to device from host
-  T** d_Xd;
-  err = cudaMalloc((void**) &d_Xd, nsum*nvec*sizeof(T*));
-  if (err != cudaSuccess) return cudaGetLastError();
-  err = cudaMemcpy(d_Xd, h_Xd, nsum*nvec*sizeof(T*), cudaMemcpyHostToDevice);
-  if (err != cudaSuccess) return cudaGetLastError();
-
-  T** d_Zd;
-  err = cudaMalloc((void**) &d_Zd, nvec*sizeof(T*));
-  if (err != cudaSuccess) return cudaGetLastError();
-  err = cudaMemcpy(d_Zd, h_Zd, nvec*sizeof(T*), cudaMemcpyHostToDevice);
-  if (err != cudaSuccess) return cudaGetLastError();
-
-  // Set partitioning
-  ThreadPartitioning<T, I>& p = Z[0]->partStream();
-  const I grid                = p.grid();
-  const unsigned block        = p.block();
-  const cudaStream_t stream   = p.stream();
-
-  math_kernels::linearCombinationVectorArrayKernel<<<grid, block, 0, stream>>>(
-      nvec,
-      nsum,
-      d_c,
-      d_Xd,
-      d_Zd,
-      Z[0]->size()
-  );
-
-  // Free host array
-  delete[] h_Xd;
-  delete[] h_Zd;
-
-  // Free device arrays
-  err = cudaFree(d_Xd);
-  if (err != cudaSuccess) return cudaGetLastError();
-  err = cudaFree(d_Zd);
-  if (err != cudaSuccess) return cudaGetLastError();
-
-  return cudaGetLastError();
-}
-
-
-} // namespace nvec
-
-
-
-#endif // _VECTOR_ARRAY_KERNELS_CUH_
diff --git a/include/nvector/cuda/VectorKernels.cuh b/include/nvector/cuda/VectorKernels.cuh
deleted file mode 100644
index 9146bcde3b..0000000000
--- a/include/nvector/cuda/VectorKernels.cuh
+++ /dev/null
@@ -1,1072 +0,0 @@
-/*
- * -----------------------------------------------------------------
- * Programmer(s): Slaven Peles @ LLNL
- * -----------------------------------------------------------------
- * SUNDIALS Copyright Start
- * Copyright (c) 2002-2020, Lawrence Livermore National Security
- * and Southern Methodist University.
- * All rights reserved.
- *
- * See the top-level LICENSE and NOTICE files for details.
- *
- * SPDX-License-Identifier: BSD-3-Clause
- * SUNDIALS Copyright End
- * -----------------------------------------------------------------
- */
-
-
-#ifndef _VECTOR_KERNELS_CUH_
-#define _VECTOR_KERNELS_CUH_
-
-#include <limits>
-#include <cuda_runtime.h>
-
-
-namespace suncudavec
-{
-
-/* -----------------------------------------------------------------
- * The namespace for CUDA kernels
- *
- * Reduction CUDA kernels in nvector are based in part on "reduction"
- * example in NVIDIA Corporation CUDA Samples, and parallel reduction
- * examples in textbook by J. Cheng at al. "CUDA C Programming".
- * -----------------------------------------------------------------
- */
-namespace math_kernels
-{
-
-
-/*
- * Sets all elements of the vector X to constant value a.
- *
- */
-
-template <typename T, typename I>
-__global__ void
-setConstKernel(T a, T *X, I n)
-{
-  I i = blockDim.x * blockIdx.x + threadIdx.x;
-
-  if (i < n)
-  {
-    X[i] = a;
-  }
-}
-
-
-/*
- * Computes linear sum (combination) of two vectors.
- *
- */
-
-template <typename T, typename I>
-__global__ void
-linearSumKernel(T a, const T *X, T b, const T *Y, T *Z, I n)
-{
-  I i = blockDim.x * blockIdx.x + threadIdx.x;
-
-  if (i < n)
-  {
-    Z[i] = a*X[i] + b*Y[i];
-  }
-}
-
-
-/*
- * Elementwise product of two vectors.
- *
- */
-
-template <typename T, typename I>
-__global__ void
-prodKernel(const T *X, const T *Y, T *Z, I n)
-{
-  I i = blockDim.x * blockIdx.x + threadIdx.x;
-
-  if (i < n)
-  {
-    Z[i] = X[i]*Y[i];
-  }
-}
-
-
-/*
- * Elementwise division of two vectors.
- *
- */
-
-template <typename T, typename I>
-__global__ void
-divKernel(const T *X, const T *Y, T *Z, I n)
-{
-  I i = blockDim.x * blockIdx.x + threadIdx.x;
-
-  if (i < n)
-  {
-    Z[i] = X[i]/Y[i];
-  }
-}
-
-
-/*
- * Scale vector with scalar value 'a'.
- *
- */
-
-template <typename T, typename I>
-__global__ void
-scaleKernel(T a, const T *X, T *Z, I n)
-{
-  I i = blockDim.x * blockIdx.x + threadIdx.x;
-
-  if (i < n)
-  {
-    Z[i] = a*X[i];
-  }
-}
-
-
-/*
- * Stores absolute values of vector X elements into vector Z.
- *
- */
-
-template <typename T, typename I>
-__global__ void
-absKernel(const T *X, T *Z, I n)
-{
-  I i = blockDim.x * blockIdx.x + threadIdx.x;
-
-  if (i < n)
-  {
-    Z[i] = abs(X[i]);
-  }
-}
-
-
-/*
- * Elementwise inversion.
- *
- */
-
-template <typename T, typename I>
-__global__ void
-invKernel(const T *X, T *Z, I n)
-{
-  I i = blockDim.x * blockIdx.x + threadIdx.x;
-
-  if (i < n)
-  {
-    Z[i] = 1.0/(X[i]);
-  }
-}
-
-
-/*
- * Add constant 'c' to each vector element.
- *
- */
-
-template <typename T, typename I>
-__global__ void
-addConstKernel(T a, const T *X, T *Z, I n)
-{
-  I i = blockDim.x * blockIdx.x + threadIdx.x;
-
-  if (i < n)
-  {
-    Z[i] = a + X[i];
-  }
-}
-
-
-/*
- * Compare absolute values of vector 'X' with constant 'c'.
- *
- */
-
-template <typename T, typename I>
-__global__ void
-compareKernel(T c, const T *X, T *Z, I n)
-{
-  I i = blockDim.x * blockIdx.x + threadIdx.x;
-
-  if (i < n)
-  {
-    Z[i] = (abs(X[i]) >= c) ? 1.0 : 0.0;
-  }
-}
-
-
-/*
- * Sums all elements of the vector.
- *
- */
-template <typename T, typename I>
-__global__ void
-sumReduceKernel(const T *x, T *out, I n)
-{
-  extern __shared__ T shmem[];
-
-  I tid = threadIdx.x;
-  I i = blockIdx.x*(blockDim.x*2) + threadIdx.x;
-
-  T sum = 0.0;
-
-  // First reduction step before storing data in shared memory.
-  if (i < n)
-    sum = x[i];
-  if (i + blockDim.x < n)
-    sum += x[i+blockDim.x];
-  shmem[tid] = sum;
-  __syncthreads();
-
-  // Perform reduction block-wise in shared memory.
-  for (I j = blockDim.x/2; j > 0; j >>= 1) {
-    if (tid < j) {
-      sum += shmem[tid + j];
-      shmem[tid] = sum;
-    }
-    __syncthreads();
-  }
-
-  // Copy reduction result for each block to global memory
-  if (tid == 0)
-    out[blockIdx.x] = sum;
-}
-
-
-/*
- * Dot product of two vectors.
- *
- */
-template <typename T, typename I>
-__global__ void
-dotProdKernel(const T *x, const T *y, T *out, I n)
-{
-  extern __shared__ T shmem[];
-
-  I tid = threadIdx.x;
-  I i = blockIdx.x*(blockDim.x*2) + threadIdx.x;
-
-  T sum = 0.0;
-
-  // First reduction step before storing data in shared memory.
-  if (i < n)
-    sum = x[i] * y[i];
-  if (i + blockDim.x < n)
-    sum += ( x[i+blockDim.x] * y[i+blockDim.x]);
-  shmem[tid] = sum;
-  __syncthreads();
-
-  // Perform blockwise reduction in shared memory
-  for (I j = blockDim.x/2; j > 0; j >>= 1) {
-    if (tid < j) {
-      sum += shmem[tid + j];
-      shmem[tid] = sum;
-    }
-    __syncthreads();
-  }
-
-  // Copy reduction result for each block to global memory
-  if (tid == 0)
-    out[blockIdx.x] = sum;
-}
-
-
-/*
- * Finds max norm the vector.
- *
- */
-template <typename T, typename I>
-__global__ void
-maxNormKernel(const T *x, T *out, I n)
-{
-  extern __shared__ T shmem[];
-
-  I tid = threadIdx.x;
-  I i = blockIdx.x*(blockDim.x*2) + threadIdx.x;
-
-  T maximum = 0.0;
-
-  // First reduction step before storing data in shared memory.
-  if (i < n)
-    maximum = abs(x[i]);
-  if (i + blockDim.x < n)
-    maximum = max(abs(x[i+blockDim.x]), maximum);
-  shmem[tid] = maximum;
-  __syncthreads();
-
-  // Perform reduction block-wise in shared memory.
-  for (I j = blockDim.x/2; j > 0; j >>= 1) {
-    if (tid < j) {
-      maximum = max(shmem[tid + j], maximum);
-      shmem[tid] = maximum;
-    }
-    __syncthreads();
-  }
-
-  // Copy reduction result for each block to global memory
-  if (tid == 0)
-    out[blockIdx.x] = maximum;
-}
-
-
-/*
- * Weighted L2 norm squared.
- *
- */
-template <typename T, typename I>
-__global__ void
-wL2NormSquareKernel(const T *x, const T *w, T *out, I n)
-{
-  extern __shared__ T shmem[];
-
-  I tid = threadIdx.x;
-  I i = blockIdx.x*(blockDim.x*2) + threadIdx.x;
-
-  T sum = 0.0;
-
-  // First reduction step before storing data in shared memory.
-  if (i < n)
-    sum = x[i] * w[i] * x[i] * w[i];
-  if (i + blockDim.x < n)
-    sum += ( x[i+blockDim.x] * w[i+blockDim.x] * x[i+blockDim.x] * w[i+blockDim.x] );
-
-  shmem[tid] = sum;
-  __syncthreads();
-
-  // Perform reduction block-wise in shared memory.
-  for (I j = blockDim.x/2; j > 0; j >>= 1)
-  {
-    if (tid < j) {
-      sum += shmem[tid + j];
-      shmem[tid] = sum;
-    }
-    __syncthreads();
-  }
-
-  // Copy reduction result for each block to global memory
-  if (tid == 0)
-    out[blockIdx.x] = sum;
-}
-
-/*
- * Weighted L2 norm squared with mask. Vector id specifies the mask.
- *
- */
-template <typename T, typename I>
-__global__ void
-wL2NormSquareMaskKernel(const T *x, const T *w, const T *id, T *out, I n)
-{
-  extern __shared__ T shmem[];
-
-  I tid = threadIdx.x;
-  I i = blockIdx.x*(blockDim.x*2) + threadIdx.x;
-
-  T sum = 0.0;
-
-  // First reduction step before storing data in shared memory.
-  if (i < n && id[i] > 0.0)
-    sum = x[i] * w[i] * x[i] * w[i];
-  if ((i + blockDim.x < n) && (id[i+blockDim.x] > 0.0))
-    sum += ( x[i+blockDim.x] * w[i+blockDim.x] * x[i+blockDim.x] * w[i+blockDim.x]);
-  shmem[tid] = sum;
-  __syncthreads();
-
-  // Perform reduction block-wise in shared memory.
-  for (I j = blockDim.x/2; j > 0; j >>= 1) {
-    if (tid < j) {
-      sum += shmem[tid + j];
-      shmem[tid] = sum;
-    }
-    __syncthreads();
-  }
-
-  // Copy reduction result for each block to global memory
-  if (tid == 0)
-    out[blockIdx.x] = sum;
-}
-
-/*
- * Finds min value in the vector.
- *
- */
-template <typename T, typename I>
-__global__ void
-findMinKernel(T MAX_VAL, const T *x, T *out, I n)
-{
-  extern __shared__ T shmem[];
-
-  I tid = threadIdx.x;
-  I i = blockIdx.x*(blockDim.x*2) + threadIdx.x;
-
-  T minimum = MAX_VAL;
-
-  // First reduction step before storing data in shared memory.
-  if (i < n)
-    minimum = x[i];
-  if (i + blockDim.x < n)
-    minimum = min((x[i+blockDim.x]), minimum);
-  shmem[tid] = minimum;
-  __syncthreads();
-
-  // Perform reduction block-wise in shared memory.
-  for (I j = blockDim.x/2; j > 0; j >>= 1) {
-    if (tid < j) {
-      minimum = min(shmem[tid + j], minimum);
-      shmem[tid] = minimum;
-    }
-    __syncthreads();
-  }
-
-  // Copy reduction result for each block to global memory
-  if (tid == 0)
-    out[blockIdx.x] = minimum;
-}
-
-
-/*
- * Computes L1 norm of vector
- *
- */
-template <typename T, typename I>
-__global__ void
-L1NormKernel(const T *x, T *out, I n)
-{
-  extern __shared__ T shmem[];
-
-  I tid = threadIdx.x;
-  I i = blockIdx.x*(blockDim.x*2) + threadIdx.x;
-
-  T sum = 0.0;
-
-  // First reduction step before storing data in shared memory.
-  if (i < n)
-    sum = abs(x[i]);
-  if (i + blockDim.x < n)
-    sum += abs(x[i+blockDim.x]);
-  shmem[tid] = sum;
-  __syncthreads();
-
-  // Perform reduction block-wise in shared memory.
-  for (I j = blockDim.x/2; j > 0; j >>= 1) {
-    if (tid < j) {
-      sum += shmem[tid + j];
-      shmem[tid] = sum;
-    }
-    __syncthreads();
-  }
-
-  // Copy reduction result for each block to global memory
-  if (tid == 0)
-    out[blockIdx.x] = sum;
-}
-
-/*
- * Vector inverse  z[i] = 1/x[i] with check for zeros. Reduction is performed
- * to flag the result if any x[i] = 0.
- *
- */
-template <typename T, typename I>
-__global__ void
-invTestKernel(const T *x, T *z, T *out, I n)
-{
-  extern __shared__ T shmem[];
-
-  I tid = threadIdx.x;
-  I i = blockIdx.x*(blockDim.x*2) + threadIdx.x;
-
-  T flag = 0.0;
-
-  // First reduction step before storing data in shared memory.
-  if (i < n) {
-    if (x[i] == 0.0) {
-      flag = 1.0;
-    } else {
-      flag = 0.0;
-      z[i] = 1.0/x[i];
-    }
-  }
-
-  if (i + blockDim.x < n) {
-    if (x[i + blockDim.x] == 0.0) {
-      flag += 1.0;
-    } else {
-      z[i + blockDim.x] = 1.0/x[i + blockDim.x];
-    }
-  }
-
-  shmem[tid] = flag;
-  __syncthreads();
-
-  // Inverse calculation is done. Perform reduction block-wise in shared
-  // to find if any x[i] = 0.
-  for (I j = blockDim.x/2; j > 0; j >>= 1) {
-    if (tid < j) {
-      flag += shmem[tid + j];
-      shmem[tid] = flag;
-    }
-    __syncthreads();
-  }
-
-  // Copy reduction result for each block to global memory
-  if (tid == 0)
-    out[blockIdx.x] = flag;
-}
-
-/*
- * Checks if inequality constraints are satisfied. Constraint check
- * results are stored in vector 'm'. A sum reduction over all elements
- * of 'm' is performed to find if any of the constraints is violated.
- * If all constraints are satisfied sum == 0.
- *
- */
-template <typename T, typename I>
-__global__ void
-constrMaskKernel(const T *c, const T *x, T *m, T *out, I n)
-{
-  extern __shared__ T shmem[];
-
-  I tid = threadIdx.x;
-  I i = blockIdx.x*(blockDim.x*2) + threadIdx.x;
-
-  T sum = 0.0;
-
-  // First reduction step before storing data in shared memory.
-  if (i < n){
-    // test1 = true if constraints violated
-    bool test1 = (std::abs(c[i]) > 1.5 && c[i]*x[i] <= 0.0) ||
-                 (std::abs(c[i]) > 0.5 && c[i]*x[i] <  0.0);
-    m[i] = test1 ? 1.0 : 0.0;
-    sum = m[i];
-  }
-
-  if (i + blockDim.x < n) {
-    // test2 = true if constraints violated
-    bool test2 = (std::abs(c[i + blockDim.x]) > 1.5 && c[i + blockDim.x]*x[i + blockDim.x] <= 0.0) ||
-                 (std::abs(c[i + blockDim.x]) > 0.5 && c[i + blockDim.x]*x[i + blockDim.x] <  0.0);
-    m[i+blockDim.x] = test2 ? 1.0 : 0.0;
-    sum += m[i+blockDim.x];
-  }
-
-  shmem[tid] = sum;
-  __syncthreads();
-
-  // Perform reduction block-wise in shared memory.
-  for (I j = blockDim.x/2; j > 0; j >>= 1) {
-    if (tid < j) {
-      sum += shmem[tid + j];
-      shmem[tid] = sum;
-    }
-    __syncthreads();
-  }
-
-  // Copy reduction result for each block to global memory
-  if (tid == 0)
-    out[blockIdx.x] = sum;
-}
-
-/*
- * Finds minimum component-wise quotient.
- *
- */
-template <typename T, typename I>
-__global__ void
-minQuotientKernel(const T MAX_VAL, const T *num, const T *den, T *min_quotient, I n)
-{
-  extern __shared__ T shmem[];
-
-  I tid = threadIdx.x;
-  I i = blockIdx.x*(blockDim.x*2) + threadIdx.x;
-
-  // Initialize "minimum" to maximum floating point value.
-  T minimum = MAX_VAL;
-  const T zero = static_cast<T>(0.0);
-
-  // Load vector quotient in the shared memory. Skip if the denominator
-  // value is zero.
-  if (i < n && den[i] != zero)
-    minimum = num[i]/den[i];
-
-  // First level of reduction is upon storing values to shared memory.
-  if (i + blockDim.x < n && den[i + blockDim.x] != zero)
-    minimum = min(num[i+blockDim.x]/den[i+blockDim.x], minimum);
-
-  shmem[tid] = minimum;
-  __syncthreads();
-
-  // Perform reduction block-wise in shared memory.
-  for (I j = blockDim.x/2; j > 0; j >>= 1) {
-    if (tid < j) {
-      minimum = min(shmem[tid + j], minimum);
-      shmem[tid] = minimum;
-    }
-    __syncthreads();
-  }
-
-  // Copy reduction result for each block to global memory
-  if (tid == 0)
-    min_quotient[blockIdx.x] = minimum;
-}
-
-
-} // namespace math_kernels
-
-
-
-
-template <typename T, typename I>
-inline cudaError_t setConst(T a, Vector<T,I>& X)
-{
-  // Set partitioning
-  ThreadPartitioning<T, I>& p = X.partStream();
-  const I grid                = p.grid();
-  const unsigned block        = p.block();
-  const cudaStream_t stream   = p.stream();
-
-  math_kernels::setConstKernel<<<grid, block, 0, stream>>>(a, X.device(), X.size());
-  return cudaGetLastError();
-}
-
-template <typename T, typename I>
-inline cudaError_t linearSum(T a, const Vector<T,I>& X, T b, const Vector<T,I>& Y, Vector<T,I>& Z)
-{
-  // Set partitioning
-  ThreadPartitioning<T, I>& p = X.partStream();
-  const I grid                = p.grid();
-  const unsigned block        = p.block();
-  const cudaStream_t stream   = p.stream();
-
-  math_kernels::linearSumKernel<<<grid, block, 0, stream>>>(a, X.device(), b, Y.device(), Z.device(), X.size());
-  return cudaGetLastError();
-}
-
-template <typename T, typename I>
-inline cudaError_t prod(const Vector<T,I>& X, const Vector<T,I>& Y, Vector<T,I>& Z)
-{
-  // Set partitioning
-  ThreadPartitioning<T, I>& p = X.partStream();
-  const I grid                = p.grid();
-  const unsigned block        = p.block();
-  const cudaStream_t stream   = p.stream();
-
-  math_kernels::prodKernel<<<grid, block, 0, stream>>>(X.device(), Y.device(), Z.device(), X.size());
-  return cudaGetLastError();
-}
-
-template <typename T, typename I>
-inline cudaError_t div(const Vector<T,I>& X, const Vector<T,I>& Y, Vector<T,I>& Z)
-{
-  // Set partitioning
-  ThreadPartitioning<T, I>& p = X.partStream();
-  const I grid                = p.grid();
-  const unsigned block        = p.block();
-  const cudaStream_t stream   = p.stream();
-
-  math_kernels::divKernel<<<grid, block, 0, stream>>>(X.device(), Y.device(), Z.device(), X.size());
-  return cudaGetLastError();
-}
-
-template <typename T, typename I>
-inline cudaError_t scale(T const a, const Vector<T,I>& X, Vector<T,I>& Z)
-{
-  // Set partitioning
-  ThreadPartitioning<T, I>& p = X.partStream();
-  const I grid                = p.grid();
-  const unsigned block        = p.block();
-  const cudaStream_t stream   = p.stream();
-
-  math_kernels::scaleKernel<<<grid, block, 0, stream>>>(a, X.device(), Z.device(), X.size());
-  return cudaGetLastError();
-}
-
-template <typename T, typename I>
-inline cudaError_t absVal(const Vector<T,I>& X, Vector<T,I>& Z)
-{
-  // Set partitioning
-  ThreadPartitioning<T, I>& p = X.partStream();
-  const I grid                = p.grid();
-  const unsigned block        = p.block();
-  const cudaStream_t stream   = p.stream();
-
-  math_kernels::absKernel<<<grid, block, 0, stream>>>(X.device(), Z.device(), X.size());
-  return cudaGetLastError();
-}
-
-template <typename T, typename I>
-inline cudaError_t inv(const Vector<T,I>& X, Vector<T,I>& Z)
-{
-  // Set partitioning
-  ThreadPartitioning<T, I>& p = X.partStream();
-  const I grid                = p.grid();
-  const unsigned block        = p.block();
-  const cudaStream_t stream   = p.stream();
-
-  math_kernels::invKernel<<<grid, block, 0, stream>>>(X.device(), Z.device(), X.size());
-  return cudaGetLastError();
-}
-
-template <typename T, typename I>
-inline cudaError_t addConst(T const a, const Vector<T,I>& X, Vector<T,I>& Z)
-{
-  // Set partitioning
-  ThreadPartitioning<T, I>& p = X.partStream();
-  const I grid                = p.grid();
-  const unsigned block        = p.block();
-  const cudaStream_t stream   = p.stream();
-
-  math_kernels::addConstKernel<<<grid, block, 0, stream>>>(a, X.device(), Z.device(), X.size());
-  return cudaGetLastError();
-}
-
-
-template <typename T, typename I>
-inline cudaError_t compare(T const c, const Vector<T,I>& X, Vector<T,I>& Z)
-{
-  // Set partitioning
-  ThreadPartitioning<T, I>& p = X.partStream();
-  const I grid                = p.grid();
-  const unsigned block        = p.block();
-  const cudaStream_t stream   = p.stream();
-
-  math_kernels::compareKernel<<<grid, block, 0, stream>>>(c, X.device(), Z.device(), X.size());
-  return cudaGetLastError();
-}
-
-
-template <typename T, typename I>
-inline T dotProd(const Vector<T,I>& x, const Vector<T,I>& y)
-{
-  // Set partitioning
-  ThreadPartitioning<T, I>& p = x.partReduce();
-  unsigned grid               = p.grid();
-  unsigned block              = p.block();
-  unsigned shMemSize          = p.shmem();
-  const cudaStream_t stream   = p.stream();
-
-  math_kernels::dotProdKernel<T,I><<<grid, block, shMemSize, stream>>>(x.device(), y.device(), p.devBuffer(), x.size());
-
-  unsigned n = grid;
-  unsigned nmax = 2*block;
-  while (n > nmax)
-  {
-    // Recompute partitioning
-    p.calcPartitioning(n, grid, block, shMemSize);
-
-    // Rerun reduction kernel
-    math_kernels::sumReduceKernel<T,I><<<grid, block, shMemSize, stream>>>(p.devBuffer(), p.devBuffer(), n);
-    n = grid;
-  }
-
-  // Finish reduction on CPU if there are less than two blocks of data left.
-  p.copyFromDevBuffer(n);
-
-  T gpu_result = p.hostBuffer()[0];
-  for (unsigned int i=1; i<n; i++)
-  {
-    gpu_result += p.hostBuffer()[i];
-  }
-  return gpu_result;
-}
-
-template <typename T, typename I>
-inline T maxNorm(const Vector<T,I>& x)
-{
-  // Set partitioning
-  ThreadPartitioning<T, I>& p = x.partReduce();
-  unsigned grid               = p.grid();
-  unsigned block              = p.block();
-  unsigned shMemSize          = p.shmem();
-  const cudaStream_t stream   = p.stream();
-
-  math_kernels::maxNormKernel<T,I><<<grid, block, shMemSize, stream>>>(x.device(), p.devBuffer(), x.size());
-
-  unsigned n = grid;
-  unsigned nmax = 2*block;
-  while (n > nmax)
-  {
-    // Recompute partitioning
-    p.calcPartitioning(n, grid, block, shMemSize);
-
-    // (Re)run reduction kernel
-    math_kernels::maxNormKernel<T,I><<<grid, block, shMemSize, stream>>>(p.devBuffer(), p.devBuffer(), n);
-    n = grid;
-  }
-
-  // Finish reduction on CPU if there are less than two blocks of data left.
-  p.copyFromDevBuffer(n);
-
-  T gpu_result = p.hostBuffer()[0];
-  for (unsigned int i=1; i<n; i++)
-  {
-    if (p.hostBuffer()[i] > gpu_result)
-      gpu_result = p.hostBuffer()[i];
-  }
-  return gpu_result;
-}
-
-template <typename T, typename I>
-inline T wL2NormSquareMask(const Vector<T,I>& x, const Vector<T,I>& w, const Vector<T,I>& id)
-{
-  // Set partitioning
-  ThreadPartitioning<T, I>& p = x.partReduce();
-  unsigned grid               = p.grid();
-  unsigned block              = p.block();
-  unsigned shMemSize          = p.shmem();
-  const cudaStream_t stream   = p.stream();
-
-  math_kernels::wL2NormSquareMaskKernel<T,I><<<grid, block, shMemSize, stream>>>(x.device(), w.device(), id.device(), p.devBuffer(), x.size());
-
-  unsigned n = grid;
-  unsigned nmax = 2*block;
-  while (n > nmax)
-  {
-    // Recompute partitioning
-    p.calcPartitioning(n, grid, block, shMemSize);
-
-    // (Re)run reduction kernel
-    math_kernels::sumReduceKernel<T,I><<<grid, block, shMemSize, stream>>>(p.devBuffer(), p.devBuffer(), n);
-    n = grid;
-  }
-
-  // Finish reduction on CPU if there are less than two blocks of data left.
-  p.copyFromDevBuffer(n);
-
-  T gpu_result = p.hostBuffer()[0];
-  for (unsigned int i=1; i<n; i++)
-  {
-    gpu_result += p.hostBuffer()[i];
-  }
-  return gpu_result;
-}
-
-template <typename T, typename I>
-inline T findMin(const Vector<T,I>& x)
-{
-  T maxVal = std::numeric_limits<T>::max();
-
-  // Set partitioning
-  ThreadPartitioning<T, I>& p = x.partReduce();
-  unsigned grid               = p.grid();
-  unsigned block              = p.block();
-  unsigned shMemSize          = p.shmem();
-  const cudaStream_t stream   = p.stream();
-
-  math_kernels::findMinKernel<T,I><<<grid, block, shMemSize, stream>>>(maxVal, x.device(), p.devBuffer(), x.size());
-
-  unsigned n = grid;
-  unsigned nmax = 2*block;
-  while (n > nmax)
-  {
-    // Recompute partitioning
-    p.calcPartitioning(n, grid, block, shMemSize);
-
-    // Rerun reduction kernel
-    math_kernels::findMinKernel<T,I><<<grid, block, shMemSize, stream>>>(maxVal, p.devBuffer(), p.devBuffer(), n);
-    n = grid;
-  }
-
-  // Finish reduction on CPU if there are less than two blocks of data left.
-  p.copyFromDevBuffer(n);
-
-  T gpu_result = p.hostBuffer()[0];
-  for (unsigned int i=1; i<n; i++)
-  {
-    if (p.hostBuffer()[i] < gpu_result)
-      gpu_result = p.hostBuffer()[i];
-  }
-  return gpu_result;
-}
-
-
-template <typename T, typename I>
-inline T wL2NormSquare(const Vector<T,I>& x, const Vector<T,I>& y)
-{
-  // Set partitioning
-  ThreadPartitioning<T, I>& p = x.partReduce();
-  unsigned grid               = p.grid();
-  unsigned block              = p.block();
-  unsigned shMemSize          = p.shmem();
-  const cudaStream_t stream   = p.stream();
-
-  math_kernels::wL2NormSquareKernel<T,I><<<grid, block, shMemSize, stream>>>(x.device(), y.device(), p.devBuffer(), x.size());
-
-  unsigned n = grid;
-  unsigned nmax = 2*block;
-  while (n > nmax)
-  {
-    // Recompute partitioning
-    p.calcPartitioning(n, grid, block, shMemSize);
-
-    // Rerun reduction kernel
-    math_kernels::sumReduceKernel<T,I><<<grid, block, shMemSize, stream>>>(p.devBuffer(), p.devBuffer(), n);
-    n = grid;
-  }
-
-  // Finish reduction on CPU if there are less than two blocks of data left.
-  p.copyFromDevBuffer(n);
-
-  T gpu_result = p.hostBuffer()[0];
-  for (unsigned int i=1; i<n; i++)
-  {
-    gpu_result += p.hostBuffer()[i];
-  }
-  return (gpu_result);
-}
-
-
-template <typename T, typename I>
-inline T L1Norm(const Vector<T,I>& x)
-{
-  // Set partitioning
-  ThreadPartitioning<T, I>& p = x.partReduce();
-  unsigned grid               = p.grid();
-  unsigned block              = p.block();
-  unsigned shMemSize          = p.shmem();
-  const cudaStream_t stream   = p.stream();
-
-  math_kernels::L1NormKernel<T,I><<<grid, block, shMemSize, stream>>>(x.device(), p.devBuffer(), x.size());
-
-  unsigned n = grid;
-  unsigned nmax = 2*block;
-  while (n > nmax)
-  {
-    // Recompute partitioning
-    p.calcPartitioning(n, grid, block, shMemSize);
-
-    // Rerun reduction kernel
-    math_kernels::sumReduceKernel<T,I><<<grid, block, shMemSize, stream>>>(p.devBuffer(), p.devBuffer(), n);
-    n = grid;
-  }
-
-  // Finish reduction on CPU if there are less than two blocks of data left.
-  p.copyFromDevBuffer(n);
-
-  T gpu_result = p.hostBuffer()[0];
-  for (unsigned int i=1; i<n; i++)
-  {
-    gpu_result += p.hostBuffer()[i];
-  }
-  return gpu_result;
-}
-
-
-template <typename T, typename I>
-inline T invTest(const Vector<T,I>& x, Vector<T,I>& z)
-{
-  // Set partitioning
-  ThreadPartitioning<T, I>& p = x.partReduce();
-  unsigned grid               = p.grid();
-  unsigned block              = p.block();
-  unsigned shMemSize          = p.shmem();
-  const cudaStream_t stream   = p.stream();
-
-  math_kernels::invTestKernel<T,I><<<grid, block, shMemSize, stream>>>(x.device(), z.device(), p.devBuffer(), x.size());
-
-  unsigned n = grid;
-  unsigned nmax = 2*block;
-  while (n > nmax)
-  {
-    // Recompute partitioning
-    p.calcPartitioning(n, grid, block, shMemSize);
-
-    // Rerun reduction kernel
-    math_kernels::sumReduceKernel<T,I><<<grid, block, shMemSize, stream>>>(p.devBuffer(), p.devBuffer(), n);
-    n = grid;
-  }
-
-  // Finish reduction on CPU if there are less than two blocks of data left.
-  p.copyFromDevBuffer(n);
-
-  T gpu_result = p.hostBuffer()[0];
-  for (unsigned int i=1; i<n; i++)
-  {
-    gpu_result += p.hostBuffer()[i];
-  }
-  return gpu_result;
-}
-
-
-template <typename T, typename I>
-inline T constrMask(const Vector<T,I>& c, const Vector<T,I>& x, Vector<T,I>& m)
-{
-  // Set partitioning
-  ThreadPartitioning<T, I>& p = x.partReduce();
-  unsigned grid               = p.grid();
-  unsigned block              = p.block();
-  unsigned shMemSize          = p.shmem();
-  const cudaStream_t stream   = p.stream();
-
-  math_kernels::constrMaskKernel<T,I><<<grid, block, shMemSize, stream>>>(c.device(), x.device(), m.device(), p.devBuffer(), x.size());
-
-  unsigned n = grid;
-  unsigned nmax = 2*block;
-  while (n > nmax)
-  {
-    // Recompute partitioning
-    p.calcPartitioning(n, grid, block, shMemSize);
-
-    // Rerun reduction kernel
-    math_kernels::sumReduceKernel<T,I><<<grid, block, shMemSize, stream>>>(p.devBuffer(), p.devBuffer(), n);
-    n = grid;
-  }
-
-  // Finish reduction on CPU if there are less than two blocks of data left.
-  p.copyFromDevBuffer(n);
-
-  T gpu_result = p.hostBuffer()[0];
-  for (unsigned int i=1; i<n; i++)
-  {
-    gpu_result += p.hostBuffer()[i];
-  }
-  return gpu_result;
-}
-
-
-template <typename T, typename I>
-inline T minQuotient(const Vector<T,I>& num, const Vector<T,I>& den)
-{
-  // Starting value for min reduction
-  const T maxVal = std::numeric_limits<T>::max();
-
-  // Set partitioning
-  ThreadPartitioning<T, I>& p = num.partReduce();
-  unsigned grid               = p.grid();
-  unsigned block              = p.block();
-  unsigned shMemSize          = p.shmem();
-  const cudaStream_t stream   = p.stream();
-
-  math_kernels::minQuotientKernel<T,I><<<grid, block, shMemSize, stream>>>(maxVal, num.device(), den.device(), p.devBuffer(), num.size());
-
-  // All quotients are computed by now. Find the minimum.
-  unsigned n = grid;
-  unsigned nmax = 2*block;
-  while (n > nmax)
-  {
-    // Recompute partitioning
-    p.calcPartitioning(n, grid, block, shMemSize);
-
-    // Rerun reduction kernel
-    math_kernels::findMinKernel<T,I><<<grid, block, shMemSize, stream>>>(maxVal, p.devBuffer(), p.devBuffer(), n);
-    n = grid;
-  }
-
-  // Finish reduction on CPU if there are less than two blocks of data left.
-  p.copyFromDevBuffer(n);
-
-  T gpu_result = p.hostBuffer()[0];
-  for (unsigned int i=1; i<n; i++)
-  {
-    if (p.hostBuffer()[i] < gpu_result)
-      gpu_result = p.hostBuffer()[i];
-  }
-  return gpu_result;
-}
-
-
-} // namespace nvec
-
-
-#endif // _VECTOR_KERNELS_CUH_
diff --git a/include/nvector/nvector_cuda.h b/include/nvector/nvector_cuda.h
index 93cadcb091..ed331afc74 100644
--- a/include/nvector/nvector_cuda.h
+++ b/include/nvector/nvector_cuda.h
@@ -37,7 +37,9 @@
 #ifndef _NVECTOR_CUDA_H
 #define _NVECTOR_CUDA_H
 
+
 #include <stdio.h>
+#include <sundials/sundials_cuda_policies.hpp>
 #include <sundials/sundials_nvector.h>
 #include <sundials/sundials_config.h>
 
@@ -51,61 +53,62 @@ extern "C" {
  * -----------------------------------------------------------------
  */
 
-/*
- * CUDA implementation of the N_Vector 'content' is in C++ class
- * Vector. The class inherits from structure _N_VectorContent_Cuda
- * to create C <--> C++ interface.
- */
-
-struct _N_VectorContent_Cuda {};
+struct _N_VectorContent_Cuda 
+{
+    sunindextype       length;
+    booleantype        own_data;
+    realtype*          host_data;
+    realtype*          device_data;
+    SUNCudaExecPolicy* stream_exec_policy;
+    SUNCudaExecPolicy* reduce_exec_policy;
+    void*              priv; /* 'private' data */
+};
 
 typedef struct _N_VectorContent_Cuda *N_VectorContent_Cuda;
 
 /*
  * -----------------------------------------------------------------
- * Functions exported by nvector_cuda
+ * NVECTOR_CUDA implementation specific functions
  * -----------------------------------------------------------------
  */
 
 SUNDIALS_EXPORT N_Vector N_VNew_Cuda(sunindextype length);
-
 SUNDIALS_EXPORT N_Vector N_VNewManaged_Cuda(sunindextype length);
-
 SUNDIALS_EXPORT N_Vector N_VNewEmpty_Cuda();
-
 SUNDIALS_EXPORT N_Vector N_VMake_Cuda(sunindextype length,
                                       realtype *h_vdata,
                                       realtype *d_vdata);
-
 SUNDIALS_EXPORT N_Vector N_VMakeManaged_Cuda(sunindextype length,
                                              realtype *vdata);
-
 SUNDIALS_EXPORT N_Vector N_VMakeWithManagedAllocator_Cuda(sunindextype length,
                                                           void* (*allocfn)(size_t),
                                                           void (*freefn)(void*));
-
-SUNDIALS_EXPORT sunindextype N_VGetLength_Cuda(N_Vector v);
-
 SUNDIALS_EXPORT realtype *N_VGetHostArrayPointer_Cuda(N_Vector v);
-
 SUNDIALS_EXPORT realtype *N_VGetDeviceArrayPointer_Cuda(N_Vector v);
-
 SUNDIALS_EXPORT booleantype N_VIsManagedMemory_Cuda(N_Vector x);
-
-SUNDIALS_EXPORT void N_VSetCudaStream_Cuda(N_Vector x, cudaStream_t *stream);
-
+SUNDIALS_EXPORT int N_VSetKernelExecPolicy_Cuda(N_Vector x,
+                                                SUNCudaExecPolicy* stream_exec_policy,
+                                                SUNCudaExecPolicy* reduce_exec_policy);
 SUNDIALS_EXPORT void N_VCopyToDevice_Cuda(N_Vector v);
-
 SUNDIALS_EXPORT void N_VCopyFromDevice_Cuda(N_Vector v);
-
 SUNDIALS_EXPORT void N_VPrint_Cuda(N_Vector v);
-
 SUNDIALS_EXPORT void N_VPrintFile_Cuda(N_Vector v, FILE *outfile);
 
+ /* DEPRECATED (to be removed in SUNDIALS v6): use N_VSetKernelExecPolicy_Cuda instead */
+SUNDIALS_DEPRECATED void N_VSetCudaStream_Cuda(N_Vector x, cudaStream_t *stream);
+
+
+/*
+ * -----------------------------------------------------------------
+ * NVECTOR API functions
+ * -----------------------------------------------------------------
+ */
+
 SUNDIALS_EXPORT N_Vector N_VCloneEmpty_Cuda(N_Vector w);
 SUNDIALS_EXPORT N_Vector N_VClone_Cuda(N_Vector w);
 SUNDIALS_EXPORT void N_VDestroy_Cuda(N_Vector v);
 SUNDIALS_EXPORT void N_VSpace_Cuda(N_Vector v, sunindextype *lrw, sunindextype *liw);
+SUNDIALS_EXPORT sunindextype N_VGetLength_Cuda(N_Vector v);
 
 /* standard vector operations */
 SUNDIALS_EXPORT void N_VLinearSum_Cuda(realtype a, N_Vector x, realtype b, N_Vector y, N_Vector z);
diff --git a/include/sundials/sundials_config.in b/include/sundials/sundials_config.in
index 604518eba7..8d90c32668 100644
--- a/include/sundials/sundials_config.in
+++ b/include/sundials/sundials_config.in
@@ -1,4 +1,4 @@
-/* ----------------------------------------------------------------- 
+/* -----------------------------------------------------------------
  * Programmer(s): Aaron Collier and Radu Serban @ LLNL
  * -----------------------------------------------------------------
  * LLNS/SMU Copyright Start
@@ -42,8 +42,8 @@
  */
 @F77_MANGLE_MACRO2@
 
-/* Define precision of SUNDIALS data type 'realtype' 
- * Depending on the precision level, one of the following 
+/* Define precision of SUNDIALS data type 'realtype'
+ * Depending on the precision level, one of the following
  * three macros will be defined:
  *     #define SUNDIALS_SINGLE_PRECISION 1
  *     #define SUNDIALS_DOUBLE_PRECISION 1
@@ -51,8 +51,8 @@
  */
 @PRECISION_LEVEL@
 
-/* Define type of vector indices in SUNDIALS 'sunindextype'. 
- * Depending on user choice of index type, one of the following 
+/* Define type of vector indices in SUNDIALS 'sunindextype'.
+ * Depending on user choice of index type, one of the following
  * two macros will be defined:
  *     #define SUNDIALS_INT64_T 1
  *     #define SUNDIALS_INT32_T 1
@@ -64,7 +64,7 @@
  */
 #define SUNDIALS_INDEX_TYPE @SUNDIALS_CINDEX_TYPE@
 
-/* Use generic math functions 
+/* Use generic math functions
  * If it was decided that generic math functions can be used, then
  *     #define SUNDIALS_USE_GENERIC_MATH
  */
@@ -75,6 +75,12 @@
  */
 #cmakedefine SUNDIALS_HAVE_POSIX_TIMERS
 
+/* Build monitoring code
+ * If it was decided that monitoring code should be built, then
+ *     #define SUNDIALS_BUILD_WITH_MONITORING
+ */
+#cmakedefine SUNDIALS_BUILD_WITH_MONITORING
+
 /* Blas/Lapack available
  * If working libraries for Blas/lapack support were found, then
  *     #define SUNDIALS_BLAS_LAPACK
@@ -83,7 +89,7 @@
 
 /* SUPERLUMT available
  * If working libraries for SUPERLUMT support were found, then
- *     #define SUNDIALS_SUPERLUMT 
+ *     #define SUNDIALS_SUPERLUMT
  */
 #cmakedefine SUNDIALS_SUPERLUMT
 #cmakedefine SUNDIALS_SUPERLUMT_THREAD_TYPE "@SUPERLUMT_THREAD_TYPE@"
@@ -96,7 +102,7 @@
 
 /* KLU available
  * If working libraries for KLU support were found, then
- *     #define SUNDIALS_KLU 
+ *     #define SUNDIALS_KLU
  */
 #cmakedefine SUNDIALS_KLU
 
@@ -112,10 +118,16 @@
 #cmakedefine SUNDIALS_TRILINOS_HAVE_MPI
 
 /* Set if SUNDIALS is built with MPI support.
- * 
+ *
  */
 @IS_MPI_ENABLED@
 
+
+ /* CVODE should use fused kernels if utilizing
+  * the CUDA NVector.
+  */
+#cmakedefine SUNDIALS_BUILD_PACKAGE_FUSED_KERNELS
+
 /* FNVECTOR: Allow user to specify different MPI communicator
  * If it was found that the MPI implementation supports MPI_Comm_f2c, then
  *      #define SUNDIALS_MPI_COMM_F2C 1
@@ -133,3 +145,7 @@
  * Windows), the SUNDIALS_EXPORT macro is empty
  */
 @SUNDIALS_EXPORT@
+
+/* Mark SUNDIALS API functions for deprecation.
+ */
+#define SUNDIALS_DEPRECATED SUNDIALS_EXPORT
\ No newline at end of file
diff --git a/include/sundials/sundials_cuda_policies.hpp b/include/sundials/sundials_cuda_policies.hpp
new file mode 100644
index 0000000000..c11843d9a0
--- /dev/null
+++ b/include/sundials/sundials_cuda_policies.hpp
@@ -0,0 +1,190 @@
+/*
+ * -----------------------------------------------------------------
+ * Programmer(s): Cody J. Balos @ LLNL
+ * -----------------------------------------------------------------
+ * SUNDIALS Copyright Start
+ * Copyright (c) 2002-2020, Lawrence Livermore National Security
+ * and Southern Methodist University.
+ * All rights reserved.
+ *
+ * See the top-level LICENSE and NOTICE files for details.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ * SUNDIALS Copyright End
+ * -----------------------------------------------------------------
+ * This header file defines the CudaExecPolicy classes which
+ * are utilized to determine CUDA kernel launch parameters.
+ * -----------------------------------------------------------------
+ */
+
+#ifndef _SUNDIALS_CUDAEXECPOLICIES_HPP
+#define _SUNDIALS_CUDAEXECPOLICIES_HPP
+
+#include <cstdio>
+#include <stdexcept>
+#include <cuda_runtime.h>
+
+namespace sundials
+{
+
+class CudaExecPolicy
+{
+public:
+  virtual size_t gridSize(size_t numWorkUnits = 0, size_t blockDim = 0) const = 0;
+  virtual size_t blockSize(size_t numWorkUnits = 0, size_t gridDim = 0) const = 0; 
+  virtual cudaStream_t stream() const = 0;
+  virtual CudaExecPolicy* clone() const = 0;
+  virtual ~CudaExecPolicy() {}
+};
+
+
+/* 
+ * A kernel execution policy that maps each thread to a work unit.
+ * The number of threads per block (blockSize) can be set to anything.
+ * The grid size will be chosen so that there are enough threads for one
+ * thread per element. If a stream is provided, it will be used to
+ * execute the kernel.
+ */
+class CudaThreadDirectExecPolicy : public CudaExecPolicy
+{
+public:
+  CudaThreadDirectExecPolicy(const size_t blockDim, const cudaStream_t stream = 0)
+    : blockDim_(blockDim), stream_(stream)
+  {}
+
+  CudaThreadDirectExecPolicy(const CudaThreadDirectExecPolicy& ex)
+    : blockDim_(ex.blockDim_), stream_(ex.stream_)
+  {} 
+
+  virtual size_t gridSize(size_t numWorkUnits = 0, size_t blockDim = 0) const
+  {
+    /* ceil(n/m) = floor((n + m - 1) / m) */
+    return (numWorkUnits + blockSize() - 1) / blockSize();
+  }
+
+  virtual size_t blockSize(size_t numWorkUnits = 0, size_t gridDim = 0) const
+  {
+    return blockDim_;
+  }
+
+  virtual cudaStream_t stream() const
+  {
+    return stream_;
+  }
+
+  virtual CudaExecPolicy* clone() const
+  {
+    return static_cast<CudaExecPolicy*>(new CudaThreadDirectExecPolicy(*this));
+  }
+
+private:
+  const cudaStream_t stream_;
+  const size_t blockDim_;
+};
+
+/* 
+ * A kernel execution policy for kernels that use grid stride loops.
+ * The number of threads per block (blockSize) can be set to anything.
+ * The number of blocks (gridSize) can be set to anything. If a stream
+ * is provided, it will be used to execute the kernel.
+ */
+class CudaGridStrideExecPolicy : public CudaExecPolicy
+{
+public:
+  CudaGridStrideExecPolicy(const size_t blockDim, const size_t gridDim, const cudaStream_t stream = 0)
+    : blockDim_(blockDim), gridDim_(gridDim), stream_(stream)
+  {}
+
+  CudaGridStrideExecPolicy(const CudaGridStrideExecPolicy& ex)
+    : blockDim_(ex.blockDim_), gridDim_(ex.gridDim_), stream_(ex.stream_)
+  {} 
+
+  virtual size_t gridSize(size_t numWorkUnits = 0, size_t blockDim = 0) const
+  {
+    return gridDim_;
+  }
+
+  virtual size_t blockSize(size_t numWorkUnits = 0, size_t gridDim = 0) const
+  {
+    return blockDim_;
+  }
+
+  virtual cudaStream_t stream() const
+  {
+    return stream_;
+  }
+
+  virtual CudaExecPolicy* clone() const
+  {
+    return static_cast<CudaExecPolicy*>(new CudaGridStrideExecPolicy(*this));
+  }
+
+private:
+  const cudaStream_t stream_;
+  const size_t blockDim_;
+  const size_t gridDim_;
+};
+
+
+/* 
+ * A kernel execution policy for performing a reduction across individual thread blocks.
+ * The number of threads per block (blockSize) can be set to any valid multiple of
+ * the CUDA warp size. The grid size (gridSize) can be set to any value greater than 0.
+ * If it is set to 0, then the grid size will be chosen so that there is one thread
+ * per two work units, i.e. ceil(numWorkUnits / (2 * blockSize)) blocks.
+ * If a stream is provided, it will be used to execute the kernel.
+ */
+class CudaBlockReduceExecPolicy : public CudaExecPolicy
+{
+public:
+  CudaBlockReduceExecPolicy(const size_t blockDim, const size_t gridDim = 0, const cudaStream_t stream = 0)
+    : blockDim_(blockDim), gridDim_(gridDim), stream_(stream)
+  {
+    if (blockDim < 1 || blockDim % 32)
+    {
+      throw std::invalid_argument("the block size must be a multiple of the CUDA warp size");
+    }
+  }
+
+  CudaBlockReduceExecPolicy(const CudaBlockReduceExecPolicy& ex)
+    : blockDim_(ex.blockDim_), gridDim_(ex.gridDim_), stream_(ex.stream_)
+  {}
+
+  virtual size_t gridSize(size_t numWorkUnits = 0, size_t blockDim = 0) const
+  {
+    if (gridDim_ == 0)
+    {
+      return (numWorkUnits + (blockSize() * 2 - 1)) / (blockSize() * 2);
+    }
+    return gridDim_;
+  }
+
+  virtual size_t blockSize(size_t numWorkUnits = 0, size_t gridDim = 0) const
+  {
+    return blockDim_;
+  }
+  
+  virtual cudaStream_t stream() const
+  {
+    return stream_;
+  }
+
+  virtual CudaExecPolicy* clone() const
+  {
+    return static_cast<CudaExecPolicy*>(new CudaBlockReduceExecPolicy(*this));
+  }
+
+private:
+  const cudaStream_t stream_;
+  const size_t blockDim_;
+  const size_t gridDim_;
+};
+
+} // namespace sundials
+
+typedef sundials::CudaExecPolicy SUNCudaExecPolicy;
+typedef sundials::CudaThreadDirectExecPolicy SUNCudaThreadDirectExecPolicy;
+typedef sundials::CudaGridStrideExecPolicy SUNCudaGridStrideExecPolicy;
+typedef sundials::CudaBlockReduceExecPolicy SUNCudaBlockReduceExecPolicy;
+
+#endif
\ No newline at end of file
diff --git a/include/sundials/sundials_linearsolver.h b/include/sundials/sundials_linearsolver.h
index d0e3badd1d..c5a5f46186 100644
--- a/include/sundials/sundials_linearsolver.h
+++ b/include/sundials/sundials_linearsolver.h
@@ -1,6 +1,7 @@
 /* -----------------------------------------------------------------
  * Programmer(s): Daniel Reynolds @ SMU
- *                David Gardner, Carol Woodward, Slaven Peles @ LLNL
+ *                David Gardner, Carol Woodward,
+ *                Slaven Peles, Cody Balos @ LLNL
  * -----------------------------------------------------------------
  * SUNDIALS Copyright Start
  * Copyright (c) 2002-2020, Lawrence Livermore National Security
@@ -181,13 +182,15 @@ SUNDIALS_EXPORT int SUNLinSolFree(SUNLinearSolver S);
 #define SUNLS_MEM_NULL           -801   /* mem argument is NULL          */
 #define SUNLS_ILL_INPUT          -802   /* illegal function input        */
 #define SUNLS_MEM_FAIL           -803   /* failed memory access          */
-#define SUNLS_ATIMES_FAIL_UNREC  -804   /* atimes unrecoverable failure  */
-#define SUNLS_PSET_FAIL_UNREC    -805   /* pset unrecoverable failure    */
-#define SUNLS_PSOLVE_FAIL_UNREC  -806   /* psolve unrecoverable failure  */
-#define SUNLS_PACKAGE_FAIL_UNREC -807   /* external package unrec. fail  */
-#define SUNLS_GS_FAIL            -808   /* Gram-Schmidt failure          */
-#define SUNLS_QRSOL_FAIL         -809   /* QRsol found singular R        */
-#define SUNLS_VECTOROP_ERR       -810   /* vector operation error        */
+#define SUNLS_ATIMES_NULL        -804   /* atimes function is NULL       */
+#define SUNLS_ATIMES_FAIL_UNREC  -805   /* atimes unrecoverable failure  */
+#define SUNLS_PSET_FAIL_UNREC    -806   /* pset unrecoverable failure    */
+#define SUNLS_PSOLVE_NULL        -807   /* psolve function is NULL       */
+#define SUNLS_PSOLVE_FAIL_UNREC  -808   /* psolve unrecoverable failure  */
+#define SUNLS_PACKAGE_FAIL_UNREC -809   /* external package unrec. fail  */
+#define SUNLS_GS_FAIL            -810   /* Gram-Schmidt failure          */
+#define SUNLS_QRSOL_FAIL         -811   /* QRsol found singular R        */
+#define SUNLS_VECTOROP_ERR       -812   /* vector operation error        */
 
 #define SUNLS_RES_REDUCED         801   /* nonconv. solve, resid reduced */
 #define SUNLS_CONV_FAIL           802   /* nonconvergent solve           */
@@ -198,6 +201,18 @@ SUNDIALS_EXPORT int SUNLinSolFree(SUNLinearSolver S);
 #define SUNLS_QRFACT_FAIL         807   /* QRfact found singular matrix  */
 #define SUNLS_LUFACT_FAIL         808   /* LUfact found singular matrix  */
 
+/* -----------------------------------------------------------------------------
+ * SUNLinearSolver messages
+ * ---------------------------------------------------------------------------*/
+
+#if defined(SUNDIALS_EXTENDED_PRECISION)
+#define SUNLS_MSG_RESIDUAL "\t\tlin. iteration %ld, lin. residual: %Lg\n"
+#elif defined(SUNDIALS_DOUBLE_PRECISION)
+#define SUNLS_MSG_RESIDUAL "\t\tlin. iteration %ld, lin. residual: %g\n"
+#else
+#define SUNLS_MSG_RESIDUAL "\t\tlin. iteration %ld, lin. residual: %g\n"
+#endif
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/include/sundials/sundials_nonlinearsolver.h b/include/sundials/sundials_nonlinearsolver.h
index 6fb18d7657..dda005bf3e 100644
--- a/include/sundials/sundials_nonlinearsolver.h
+++ b/include/sundials/sundials_nonlinearsolver.h
@@ -1,5 +1,5 @@
 /* -----------------------------------------------------------------------------
- * Programmer(s): David J. Gardner @ LLNL
+ * Programmer(s): David J. Gardner and Cody J. Balos @ LLNL
  * -----------------------------------------------------------------------------
  * SUNDIALS Copyright Start
  * Copyright (c) 2002-2020, Lawrence Livermore National Security
@@ -190,6 +190,19 @@ SUNDIALS_EXPORT int SUNNonlinSolGetNumConvFails(SUNNonlinearSolver NLS,
 #define SUN_NLS_VECTOROP_ERR  -904    /* failed NVector operation           */
 #define SUN_NLS_EXT_FAIL      -905    /* failed in external library call    */
 
+
+/* -----------------------------------------------------------------------------
+ * SUNNonlinearSolver messages
+ * ---------------------------------------------------------------------------*/
+
+#if defined(SUNDIALS_EXTENDED_PRECISION)
+#define SUN_NLS_MSG_RESIDUAL "\tnonlin. iteration %ld, nonlin. residual: %Lg\n"
+#elif defined(SUNDIALS_DOUBLE_PRECISION)
+#define SUN_NLS_MSG_RESIDUAL "\tnonlin. iteration %ld, nonlin. residual: %g\n"
+#else
+#define SUN_NLS_MSG_RESIDUAL "\tnonlin. iteration %ld, nonlin. residual: %g\n"
+#endif
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/include/sundials/sundials_nvector.h b/include/sundials/sundials_nvector.h
index e343fb56ae..d5afcbe8eb 100644
--- a/include/sundials/sundials_nvector.h
+++ b/include/sundials/sundials_nvector.h
@@ -45,6 +45,9 @@
 #ifndef _NVECTOR_H
 #define _NVECTOR_H
 
+#include <stdio.h>
+#include <stdlib.h>
+
 #include <sundials/sundials_types.h>
 
 #ifdef __cplusplus  /* wrapper to enable C++ usage */
@@ -145,6 +148,10 @@ struct _generic_N_Vector_Ops {
   realtype (*nvminquotientlocal)(N_Vector, N_Vector);
   realtype (*nvwsqrsumlocal)(N_Vector, N_Vector);
   realtype (*nvwsqrsummasklocal)(N_Vector, N_Vector, N_Vector);
+
+  /* debugging functions (called when SUNDIALS_DEBUG_PRINTVEC is defined) */
+  void (*nvprint)(N_Vector);
+  void (*nvprintfile)(N_Vector, FILE*);
 };
 
 /* A vector is a structure with an implementation-dependent
@@ -257,6 +264,15 @@ SUNDIALS_EXPORT void N_VDestroyVectorArray(N_Vector* vs, int count);
 SUNDIALS_EXPORT N_Vector N_VGetVecAtIndexVectorArray(N_Vector* vs, int index);
 SUNDIALS_EXPORT void N_VSetVecAtIndexVectorArray(N_Vector* vs, int index, N_Vector w);
 
+
+/* -----------------------------------------------------------------
+ * Debugging functions
+ * ----------------------------------------------------------------- */
+
+SUNDIALS_EXPORT void N_VPrint(N_Vector v);
+SUNDIALS_EXPORT void N_VPrintFile(N_Vector v, FILE* outfile);
+
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/include/sunlinsol/sunlinsol_pcg.h b/include/sunlinsol/sunlinsol_pcg.h
index 9476d555fc..bf361c024c 100644
--- a/include/sunlinsol/sunlinsol_pcg.h
+++ b/include/sunlinsol/sunlinsol_pcg.h
@@ -58,6 +58,9 @@ struct _SUNLinearSolverContent_PCG {
   N_Vector p;
   N_Vector z;
   N_Vector Ap;
+
+  int print_level;
+  FILE* info_file;
 };
 
 typedef struct _SUNLinearSolverContent_PCG *SUNLinearSolverContent_PCG;
@@ -75,12 +78,9 @@ SUNDIALS_EXPORT int SUNLinSol_PCGSetPrecType(SUNLinearSolver S,
 SUNDIALS_EXPORT int SUNLinSol_PCGSetMaxl(SUNLinearSolver S,
                                          int maxl);
 
-/* deprecated */
-SUNDIALS_EXPORT SUNLinearSolver SUNPCG(N_Vector y, int pretype, int maxl);
-/* deprecated */
-SUNDIALS_EXPORT int SUNPCGSetPrecType(SUNLinearSolver S, int pretype);
-/* deprecated */
-SUNDIALS_EXPORT int SUNPCGSetMaxl(SUNLinearSolver S, int maxl);
+SUNDIALS_DEPRECATED SUNLinearSolver SUNPCG(N_Vector y, int pretype, int maxl);
+SUNDIALS_DEPRECATED int SUNPCGSetPrecType(SUNLinearSolver S, int pretype);
+SUNDIALS_DEPRECATED int SUNPCGSetMaxl(SUNLinearSolver S, int maxl);
 
 SUNDIALS_EXPORT SUNLinearSolver_Type SUNLinSolGetType_PCG(SUNLinearSolver S);
 SUNDIALS_EXPORT SUNLinearSolver_ID SUNLinSolGetID_PCG(SUNLinearSolver S);
@@ -105,6 +105,10 @@ SUNDIALS_EXPORT int SUNLinSolSpace_PCG(SUNLinearSolver S,
                                        long int *lenrwLS,
                                        long int *leniwLS);
 SUNDIALS_EXPORT int SUNLinSolFree_PCG(SUNLinearSolver S);
+SUNDIALS_EXPORT int SUNLinSolSetInfoFile_PCG(SUNLinearSolver LS,
+                                             FILE* info_file);
+SUNDIALS_EXPORT int SUNLinSolSetPrintLevel_PCG(SUNLinearSolver LS,
+                                               int print_level);
 
 #ifdef __cplusplus
 }
diff --git a/include/sunlinsol/sunlinsol_spbcgs.h b/include/sunlinsol/sunlinsol_spbcgs.h
index 738f6b3a80..ab762daa47 100644
--- a/include/sunlinsol/sunlinsol_spbcgs.h
+++ b/include/sunlinsol/sunlinsol_spbcgs.h
@@ -27,6 +27,8 @@
 #ifndef _SUNLINSOL_SPBCGS_H
 #define _SUNLINSOL_SPBCGS_H
 
+#include <stdio.h>
+
 #include <sundials/sundials_linearsolver.h>
 #include <sundials/sundials_matrix.h>
 #include <sundials/sundials_nvector.h>
@@ -64,6 +66,9 @@ struct _SUNLinearSolverContent_SPBCGS {
   N_Vector u;
   N_Vector Ap;
   N_Vector vtemp;
+
+  int print_level;
+  FILE* info_file;
 };
 
 typedef struct _SUNLinearSolverContent_SPBCGS *SUNLinearSolverContent_SPBCGS;
@@ -81,12 +86,9 @@ SUNDIALS_EXPORT int SUNLinSol_SPBCGSSetPrecType(SUNLinearSolver S,
 SUNDIALS_EXPORT int SUNLinSol_SPBCGSSetMaxl(SUNLinearSolver S,
                                             int maxl);
 
-/* deprecated */
-SUNDIALS_EXPORT SUNLinearSolver SUNSPBCGS(N_Vector y, int pretype, int maxl);
-/* deprecated */
-SUNDIALS_EXPORT int SUNSPBCGSSetPrecType(SUNLinearSolver S, int pretype);
-/* deprecated */
-SUNDIALS_EXPORT int SUNSPBCGSSetMaxl(SUNLinearSolver S, int maxl);
+SUNDIALS_DEPRECATED SUNLinearSolver SUNSPBCGS(N_Vector y, int pretype, int maxl);
+SUNDIALS_DEPRECATED int SUNSPBCGSSetPrecType(SUNLinearSolver S, int pretype);
+SUNDIALS_DEPRECATED int SUNSPBCGSSetMaxl(SUNLinearSolver S, int maxl);
 
 SUNDIALS_EXPORT SUNLinearSolver_Type SUNLinSolGetType_SPBCGS(SUNLinearSolver S);
 SUNDIALS_EXPORT SUNLinearSolver_ID SUNLinSolGetID_SPBCGS(SUNLinearSolver S);
@@ -111,6 +113,11 @@ SUNDIALS_EXPORT int SUNLinSolSpace_SPBCGS(SUNLinearSolver S,
                                           long int *lenrwLS,
                                           long int *leniwLS);
 SUNDIALS_EXPORT int SUNLinSolFree_SPBCGS(SUNLinearSolver S);
+SUNDIALS_EXPORT int SUNLinSolSetInfoFile_SPBCGS(SUNLinearSolver S,
+                                                FILE* info_file);
+SUNDIALS_EXPORT int SUNLinSolSetPrintLevel_SPBCGS(SUNLinearSolver S,
+                                                  int print_level);
+
 
 
 #ifdef __cplusplus
diff --git a/include/sunlinsol/sunlinsol_spfgmr.h b/include/sunlinsol/sunlinsol_spfgmr.h
index 1fcc8caab4..a78f31cd19 100644
--- a/include/sunlinsol/sunlinsol_spfgmr.h
+++ b/include/sunlinsol/sunlinsol_spfgmr.h
@@ -72,6 +72,9 @@ struct _SUNLinearSolverContent_SPFGMR {
 
   realtype *cv;
   N_Vector *Xv;
+
+  int print_level;
+  FILE* info_file;
 };
 
 typedef struct _SUNLinearSolverContent_SPFGMR *SUNLinearSolverContent_SPFGMR;
@@ -90,14 +93,10 @@ SUNDIALS_EXPORT int SUNLinSol_SPFGMRSetGSType(SUNLinearSolver S,
 SUNDIALS_EXPORT int SUNLinSol_SPFGMRSetMaxRestarts(SUNLinearSolver S,
                                                    int maxrs);
 
-/* deprecated */
-SUNDIALS_EXPORT SUNLinearSolver SUNSPFGMR(N_Vector y, int pretype, int maxl);
-/* deprecated */
-SUNDIALS_EXPORT int SUNSPFGMRSetPrecType(SUNLinearSolver S, int pretype);
-/* deprecated */
-SUNDIALS_EXPORT int SUNSPFGMRSetGSType(SUNLinearSolver S, int gstype);
-/* deprecated */
-SUNDIALS_EXPORT int SUNSPFGMRSetMaxRestarts(SUNLinearSolver S, int maxrs);
+SUNDIALS_DEPRECATED SUNLinearSolver SUNSPFGMR(N_Vector y, int pretype, int maxl);
+SUNDIALS_DEPRECATED int SUNSPFGMRSetPrecType(SUNLinearSolver S, int pretype);
+SUNDIALS_DEPRECATED int SUNSPFGMRSetGSType(SUNLinearSolver S, int gstype);
+SUNDIALS_DEPRECATED int SUNSPFGMRSetMaxRestarts(SUNLinearSolver S, int maxrs);
 
 
 SUNDIALS_EXPORT SUNLinearSolver_Type SUNLinSolGetType_SPFGMR(SUNLinearSolver S);
@@ -123,6 +122,10 @@ SUNDIALS_EXPORT int SUNLinSolSpace_SPFGMR(SUNLinearSolver S,
                                           long int *lenrwLS,
                                           long int *leniwLS);
 SUNDIALS_EXPORT int SUNLinSolFree_SPFGMR(SUNLinearSolver S);
+SUNDIALS_EXPORT int SUNLinSolSetInfoFile_SPFGMR(SUNLinearSolver LS,
+                                                FILE* info_file);
+SUNDIALS_EXPORT int SUNLinSolSetPrintLevel_SPFGMR(SUNLinearSolver LS,
+                                                  int print_level);
 
 
 #ifdef __cplusplus
diff --git a/include/sunlinsol/sunlinsol_spgmr.h b/include/sunlinsol/sunlinsol_spgmr.h
index 6227c3ee65..ea92761985 100644
--- a/include/sunlinsol/sunlinsol_spgmr.h
+++ b/include/sunlinsol/sunlinsol_spgmr.h
@@ -28,6 +28,8 @@
 #ifndef _SUNLINSOL_SPGMR_H
 #define _SUNLINSOL_SPGMR_H
 
+#include <stdio.h>
+
 #include <sundials/sundials_linearsolver.h>
 #include <sundials/sundials_matrix.h>
 #include <sundials/sundials_nvector.h>
@@ -71,6 +73,9 @@ struct _SUNLinearSolverContent_SPGMR {
 
   realtype *cv;
   N_Vector *Xv;
+
+  int print_level;
+  FILE* info_file;
 };
 
 typedef struct _SUNLinearSolverContent_SPGMR *SUNLinearSolverContent_SPGMR;
@@ -90,14 +95,10 @@ SUNDIALS_EXPORT int SUNLinSol_SPGMRSetGSType(SUNLinearSolver S,
 SUNDIALS_EXPORT int SUNLinSol_SPGMRSetMaxRestarts(SUNLinearSolver S,
                                                   int maxrs);
 
-/* deprecated */
-SUNDIALS_EXPORT SUNLinearSolver SUNSPGMR(N_Vector y, int pretype, int maxl);
-/* deprecated */
-SUNDIALS_EXPORT int SUNSPGMRSetPrecType(SUNLinearSolver S, int pretype);
-/* deprecated */
-SUNDIALS_EXPORT int SUNSPGMRSetGSType(SUNLinearSolver S, int gstype);
-/* deprecated */
-SUNDIALS_EXPORT int SUNSPGMRSetMaxRestarts(SUNLinearSolver S, int maxrs);
+SUNDIALS_DEPRECATED SUNLinearSolver SUNSPGMR(N_Vector y, int pretype, int maxl);
+SUNDIALS_DEPRECATED int SUNSPGMRSetPrecType(SUNLinearSolver S, int pretype);
+SUNDIALS_DEPRECATED int SUNSPGMRSetGSType(SUNLinearSolver S, int gstype);
+SUNDIALS_DEPRECATED int SUNSPGMRSetMaxRestarts(SUNLinearSolver S, int maxrs);
 
 SUNDIALS_EXPORT SUNLinearSolver_Type SUNLinSolGetType_SPGMR(SUNLinearSolver S);
 SUNDIALS_EXPORT SUNLinearSolver_ID SUNLinSolGetID_SPGMR(SUNLinearSolver S);
@@ -122,6 +123,11 @@ SUNDIALS_EXPORT int SUNLinSolSpace_SPGMR(SUNLinearSolver S,
                                          long int *lenrwLS,
                                          long int *leniwLS);
 SUNDIALS_EXPORT int SUNLinSolFree_SPGMR(SUNLinearSolver S);
+SUNDIALS_EXPORT int SUNLinSolSetInfoFile_SPGMR(SUNLinearSolver LS,
+                                               FILE* info_file);
+SUNDIALS_EXPORT int SUNLinSolSetPrintLevel_SPGMR(SUNLinearSolver LS,
+                                                 int print_level);
+
 
 
 #ifdef __cplusplus
diff --git a/include/sunlinsol/sunlinsol_sptfqmr.h b/include/sunlinsol/sunlinsol_sptfqmr.h
index 8cd72380ff..beef13348e 100644
--- a/include/sunlinsol/sunlinsol_sptfqmr.h
+++ b/include/sunlinsol/sunlinsol_sptfqmr.h
@@ -67,6 +67,9 @@ struct _SUNLinearSolverContent_SPTFQMR {
   N_Vector vtemp1;
   N_Vector vtemp2;
   N_Vector vtemp3;
+
+  int print_level;
+  FILE* info_file;
 };
 
 typedef struct _SUNLinearSolverContent_SPTFQMR *SUNLinearSolverContent_SPTFQMR;
@@ -83,12 +86,9 @@ SUNDIALS_EXPORT int SUNLinSol_SPTFQMRSetPrecType(SUNLinearSolver S,
 SUNDIALS_EXPORT int SUNLinSol_SPTFQMRSetMaxl(SUNLinearSolver S,
                                              int maxl);
 
-/* deprecated */
-SUNDIALS_EXPORT SUNLinearSolver SUNSPTFQMR(N_Vector y, int pretype, int maxl);
-/* deprecated */
-SUNDIALS_EXPORT int SUNSPTFQMRSetPrecType(SUNLinearSolver S, int pretype);
-/* deprecated */
-SUNDIALS_EXPORT int SUNSPTFQMRSetMaxl(SUNLinearSolver S, int maxl);
+SUNDIALS_DEPRECATED SUNLinearSolver SUNSPTFQMR(N_Vector y, int pretype, int maxl);
+SUNDIALS_DEPRECATED int SUNSPTFQMRSetPrecType(SUNLinearSolver S, int pretype);
+SUNDIALS_DEPRECATED int SUNSPTFQMRSetMaxl(SUNLinearSolver S, int maxl);
 
 SUNDIALS_EXPORT SUNLinearSolver_Type SUNLinSolGetType_SPTFQMR(SUNLinearSolver S);
 SUNDIALS_EXPORT SUNLinearSolver_ID SUNLinSolGetID_SPTFQMR(SUNLinearSolver S);
@@ -113,6 +113,10 @@ SUNDIALS_EXPORT int SUNLinSolSpace_SPTFQMR(SUNLinearSolver S,
                                            long int *lenrwLS,
                                            long int *leniwLS);
 SUNDIALS_EXPORT int SUNLinSolFree_SPTFQMR(SUNLinearSolver S);
+SUNDIALS_EXPORT int SUNLinSolSetInfoFile_SPTFQMR(SUNLinearSolver LS,
+                                                 FILE* info_file);
+SUNDIALS_EXPORT int SUNLinSolSetPrintLevel_SPTFQMR(SUNLinearSolver LS,
+                                                   int print_level);
 
 
 #ifdef __cplusplus
diff --git a/include/sunmatrix/sunmatrix_cusparse.h b/include/sunmatrix/sunmatrix_cusparse.h
index 20cf5081d9..8452e053e1 100644
--- a/include/sunmatrix/sunmatrix_cusparse.h
+++ b/include/sunmatrix/sunmatrix_cusparse.h
@@ -25,6 +25,7 @@
 #include <cuda_runtime.h>
 #include <cusparse.h>
 
+#include <sundials/sundials_cuda_policies.hpp>
 #include <sundials/sundials_matrix.h>
 
 #ifdef __cplusplus  /* wrapper to enable C++ usage */
@@ -49,17 +50,18 @@ struct _SUNMatrix_Content_cuSparse {
   int blocknnz;
   int sparse_type;
   booleantype own_data;
+  booleantype own_exec;
   booleantype fixed_pattern;
   int* colind;
   int* rowptrs;
   realtype* data;
   cusparseMatDescr_t mat_descr;
   cusparseHandle_t cusp_handle;
+  SUNCudaExecPolicy* exec_policy;
 };
 
 typedef struct _SUNMatrix_Content_cuSparse *SUNMatrix_Content_cuSparse;
 
-
 /* ------------------------------------------------------------------
  * Constructors.
  * ------------------------------------------------------------------ */
@@ -88,6 +90,7 @@ SUNDIALS_EXPORT int* SUNMatrix_cuSparse_IndexValues(SUNMatrix A);
 SUNDIALS_EXPORT realtype* SUNMatrix_cuSparse_Data(SUNMatrix A);
 
 SUNDIALS_EXPORT int SUNMatrix_cuSparse_SetFixedPattern(SUNMatrix A, booleantype yesno);
+SUNDIALS_EXPORT int SUNMatrix_cuSparse_SetKernelExecPolicy(SUNMatrix A, SUNCudaExecPolicy* exec_policy);
 SUNDIALS_EXPORT int SUNMatrix_cuSparse_NumBlocks(SUNMatrix A);
 SUNDIALS_EXPORT int SUNMatrix_cuSparse_BlockRows(SUNMatrix A);
 SUNDIALS_EXPORT int SUNMatrix_cuSparse_BlockColumns(SUNMatrix A);
diff --git a/include/sunnonlinsol/sunnonlinsol_fixedpoint.h b/include/sunnonlinsol/sunnonlinsol_fixedpoint.h
index 046f2939a4..c2fdbcb976 100644
--- a/include/sunnonlinsol/sunnonlinsol_fixedpoint.h
+++ b/include/sunnonlinsol/sunnonlinsol_fixedpoint.h
@@ -62,6 +62,11 @@ struct _SUNNonlinearSolverContent_FixedPoint {
   long int     niters;     /* total number of iterations across all solves   */
   long int     nconvfails; /* total number of convergence failures           */
   void        *ctest_data; /* data to pass to convergence test function      */
+
+  /* if 0 (default) nothing is printed, if 1 the residual is printed every iteration */
+  int print_level;
+  /* if NULL nothing is printed, otherwise output is written to this file */
+  FILE* info_file;
 };
 
 typedef struct _SUNNonlinearSolverContent_FixedPoint *SUNNonlinearSolverContent_FixedPoint;
@@ -113,6 +118,13 @@ SUNDIALS_EXPORT int SUNNonlinSolGetNumConvFails_FixedPoint(SUNNonlinearSolver NL
 SUNDIALS_EXPORT int SUNNonlinSolGetSysFn_FixedPoint(SUNNonlinearSolver NLS,
                                                     SUNNonlinSolSysFn *SysFn);
 
+SUNDIALS_EXPORT int SUNNonlinSolSetInfoFile_FixedPoint(SUNNonlinearSolver NLS,
+                                                       FILE* info_file);
+
+SUNDIALS_EXPORT int SUNNonlinSolSetPrintLevel_FixedPoint(SUNNonlinearSolver NLS,
+                                                         int print_level);                                                   
+
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/include/sunnonlinsol/sunnonlinsol_newton.h b/include/sunnonlinsol/sunnonlinsol_newton.h
index f1e54bb673..47d8b69cc1 100644
--- a/include/sunnonlinsol/sunnonlinsol_newton.h
+++ b/include/sunnonlinsol/sunnonlinsol_newton.h
@@ -50,6 +50,11 @@ struct _SUNNonlinearSolverContent_Newton {
   long int    niters;     /* total number of nonlinear iterations across all solves */
   long int    nconvfails; /* total number of convergence failures across all solves */
   void*       ctest_data; /* data to pass to convergence test function              */
+
+  /* if 0 (default) nothing is printed, if 1 the residual is printed every iteration */
+  int print_level;
+  /* if NULL nothing is printed, otherwise output is written to this file */
+  FILE* info_file;
 };
 
 typedef struct _SUNNonlinearSolverContent_Newton *SUNNonlinearSolverContent_Newton;
@@ -104,6 +109,13 @@ SUNDIALS_EXPORT int SUNNonlinSolGetNumConvFails_Newton(SUNNonlinearSolver NLS,
 SUNDIALS_EXPORT int SUNNonlinSolGetSysFn_Newton(SUNNonlinearSolver NLS,
                                                 SUNNonlinSolSysFn *SysFn);
 
+SUNDIALS_EXPORT int SUNNonlinSolSetInfoFile_Newton(SUNNonlinearSolver NLS,
+                                                   FILE* info_file);
+
+SUNDIALS_EXPORT int SUNNonlinSolSetPrintLevel_Newton(SUNNonlinearSolver NLS,
+                                                     int print_level);
+
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/arkode/README.md b/src/arkode/README.md
index 741d1d371f..bd8587342f 100644
--- a/src/arkode/README.md
+++ b/src/arkode/README.md
@@ -1,5 +1,5 @@
 # ARKode
-### Version 4.2.0 (Mar 2020)
+### Version 4.3.0 (May 2020)
 
 **Daniel R. Reynolds and Jean M. Sexton  
   Department of Mathematics, SMU**
@@ -49,8 +49,8 @@ the "SUNDIALS Release History" appendix of the ARKode User Guide.
 ## References
 
 * D. R. Reynolds, D. J. Gardner, A. C. Hindmarsh, C. S. Woodward, and
-  J. M. Sexton, "User Documentation for ARKode v4.2.0," LLNL technical report
-  LLNL-SM-668082, Mar 2020.
+  J. M. Sexton, "User Documentation for ARKode v4.3.0," LLNL technical report
+  LLNL-SM-668082, May 2020.
 
-* D. R. Reynolds, "Example Programs for ARKode v4.2.0," Technical Report,
-  Southern Methodist University Center for Scientific Computation, Mar 2020.
+* D. R. Reynolds, "Example Programs for ARKode v4.3.0," Technical Report,
+  Southern Methodist University Center for Scientific Computation, May 2020.
diff --git a/src/arkode/arkode.c b/src/arkode/arkode.c
index 53c80e7447..77cf82e923 100644
--- a/src/arkode/arkode.c
+++ b/src/arkode/arkode.c
@@ -30,12 +30,6 @@
 #include <sundials/sundials_math.h>
 #include <sundials/sundials_types.h>
 
-#define NO_DEBUG_OUTPUT
-/* #define DEBUG_OUTPUT */
-#ifdef DEBUG_OUTPUT
-#include <nvector/nvector_serial.h>
-#endif
-
 #if defined(SUNDIALS_EXTENDED_PRECISION)
 #define RSYM ".32Lg"
 #else
@@ -109,7 +103,6 @@ ARKodeMem arkCreate()
   ark_mem->VabstolMallocDone     = SUNFALSE;
   ark_mem->VRabstolMallocDone    = SUNFALSE;
   ark_mem->MallocDone            = SUNFALSE;
-  ark_mem->ConstraintsMallocDone = SUNFALSE;
 
   /* No user-supplied step postprocessing function yet */
   ark_mem->ProcessStep  = NULL;
@@ -184,6 +177,7 @@ ARKodeMem arkCreate()
 int arkResize(ARKodeMem ark_mem, N_Vector y0, realtype hscale,
               realtype t0, ARKVecResizeFn resize, void *resize_data)
 {
+  booleantype resizeOK;
   sunindextype lrw1, liw1, lrw_diff, liw_diff;
   int retval;
 
@@ -240,48 +234,14 @@ int arkResize(ARKodeMem ark_mem, N_Vector y0, realtype hscale,
   ark_mem->lrw1 = lrw1;
   ark_mem->liw1 = liw1;
 
-  /* Resize the ARKode vectors */
-  /*     Vabstol */
-  retval = arkResizeVec(ark_mem, resize, resize_data, lrw_diff,
-                     liw_diff, y0, &ark_mem->Vabstol);
-  if (retval != ARK_SUCCESS)  return(retval);
-  /*     VRabstol */
-  retval = arkResizeVec(ark_mem, resize, resize_data, lrw_diff,
-                     liw_diff, y0, &ark_mem->VRabstol);
-  if (retval != ARK_SUCCESS)  return(retval);
-  /*     ewt */
-  retval = arkResizeVec(ark_mem, resize, resize_data, lrw_diff,
-                     liw_diff, y0, &ark_mem->ewt);
-  if (retval != ARK_SUCCESS)  return(retval);
-  /*     rwt  */
-  if (ark_mem->rwt_is_ewt) {      /* update pointer to ewt */
-    ark_mem->rwt = ark_mem->ewt;
-  } else {                        /* resize if distinct from ewt */
-    retval = arkResizeVec(ark_mem, resize, resize_data, lrw_diff,
-                       liw_diff, y0, &ark_mem->rwt);
-    if (retval != ARK_SUCCESS)  return(retval);
+  /* Resize the solver vectors (using y0 as a template) */
+  resizeOK = arkResizeVectors(ark_mem, resize, resize_data,
+                              lrw_diff, liw_diff, y0);
+  if (!resizeOK) {
+    arkProcessError(ark_mem, ARK_MEM_FAIL, "ARKode",
+                    "arkResize", "Unable to resize vector");
+    return(ARK_MEM_FAIL);
   }
-  /*     yn */
-  retval = arkResizeVec(ark_mem, resize, resize_data, lrw_diff,
-                     liw_diff, y0, &ark_mem->yn);
-  if (retval != ARK_SUCCESS)  return(retval);
-  /*     fn */
-  retval = arkResizeVec(ark_mem, resize, resize_data, lrw_diff,
-                     liw_diff, y0, &ark_mem->fn);
-  if (retval != ARK_SUCCESS)  return(retval);
-  /*     tempv* */
-  retval = arkResizeVec(ark_mem, resize, resize_data, lrw_diff,
-                     liw_diff, y0, &ark_mem->tempv1);
-  if (retval != ARK_SUCCESS)  return(retval);
-  retval = arkResizeVec(ark_mem, resize, resize_data, lrw_diff,
-                     liw_diff, y0, &ark_mem->tempv2);
-  if (retval != ARK_SUCCESS)  return(retval);
-  retval = arkResizeVec(ark_mem, resize, resize_data, lrw_diff,
-                     liw_diff, y0, &ark_mem->tempv3);
-  if (retval != ARK_SUCCESS)  return(retval);
-  retval = arkResizeVec(ark_mem, resize, resize_data, lrw_diff,
-                     liw_diff, y0, &ark_mem->tempv4);
-  if (retval != ARK_SUCCESS)  return(retval);
 
   /* Indicate that the fullrhs is not required after each step, this is updated
      by the interpolation module constructor and stepper init function */
@@ -301,6 +261,9 @@ int arkResize(ARKodeMem ark_mem, N_Vector y0, realtype hscale,
   /* Copy y0 into ark_yn to set the current solution */
   N_VScale(ONE, y0, ark_mem->yn);
 
+  /* Disable constraints */
+  ark_mem->constraintsSet = SUNFALSE;
+
   /* Indicate that problem size is new */
   ark_mem->resized    = SUNTRUE;
   ark_mem->firststage = SUNTRUE;
@@ -1263,6 +1226,7 @@ int arkInit(ARKodeMem ark_mem, realtype t0, N_Vector y0)
   ark_mem->nhnil        = 0;
   ark_mem->ncfn         = 0;
   ark_mem->netf         = 0;
+  ark_mem->nconstrfails = 0;
 
   /* Initialize other integrator optional outputs */
   ark_mem->h0u    = ZERO;
@@ -1343,6 +1307,7 @@ int arkReInit(ARKodeMem ark_mem, realtype t0, N_Vector y0)
   ark_mem->nhnil        = 0;
   ark_mem->ncfn         = 0;
   ark_mem->netf         = 0;
+  ark_mem->nconstrfails = 0;
 
   /* Indicate that problem size is new */
   ark_mem->resized     = SUNTRUE;
@@ -1419,7 +1384,6 @@ void arkPrintMem(ARKodeMem ark_mem, FILE *outfile)
 
   /* output inequality constraints quantities */
   fprintf(outfile, "constraintsSet = %i\n", ark_mem->constraintsSet);
-  fprintf(outfile, "ConstraintsDone = %i\n", ark_mem->ConstraintsMallocDone);
   fprintf(outfile, "maxconstrfails = %i\n", ark_mem->maxconstrfails);
 
   /* output root-finding quantities */
@@ -1429,52 +1393,32 @@ void arkPrintMem(ARKodeMem ark_mem, FILE *outfile)
   /* output interpolation quantities */
   arkInterpPrintMem(ark_mem->interp, outfile);
 
-#ifdef DEBUG_OUTPUT
+#ifdef SUNDIALS_DEBUG_PRINTVEC
   /* output vector quantities */
-  if (ark_mem->Vabstol != NULL) {
-    fprintf(outfile, "Vapbsol:\n");
-    N_VPrint_Serial(ark_mem->Vabstol);
-  }
-  if (ark_mem->ewt != NULL) {
-    fprintf(outfile, "ewt:\n");
-    N_VPrint_Serial(ark_mem->ewt);
-  }
-  if (!ark_mem->rwt_is_ewt && ark_mem->rwt != NULL) {
+  fprintf(outfile, "Vapbsol:\n");
+  N_VPrintFile(ark_mem->Vabstol, outfile);
+  fprintf(outfile, "ewt:\n");
+  N_VPrintFile(ark_mem->ewt, outfile);
+  if (!ark_mem->rwt_is_ewt) {
     fprintf(outfile, "rwt:\n");
-    N_VPrint_Serial(ark_mem->rwt);
-  }
-  if (ark_mem->ycur != NULL) {
-    fprintf(outfile, "ycur:\n");
-    N_VPrint_Serial(ark_mem->ycur);
-  }
-  if (ark_mem->yn != NULL) {
-    fprintf(outfile, "yn:\n");
-    N_VPrint_Serial(ark_mem->yn);
-  }
-  if (ark_mem->fn != NULL) {
-    fprintf(outfile, "fn:\n");
-    N_VPrint_Serial(ark_mem->fn);
-  }
-  if (ark_mem->tempv1 != NULL) {
-    fprintf(outfile, "tempv1:\n");
-    N_VPrint_Serial(ark_mem->tempv1);
-  }
-  if (ark_mem->tempv2 != NULL) {
-    fprintf(outfile, "tempv2:\n");
-    N_VPrint_Serial(ark_mem->tempv2);
-  }
-  if (ark_mem->tempv3 != NULL) {
-    fprintf(outfile, "tempv3:\n");
-    N_VPrint_Serial(ark_mem->tempv3);
-  }
-  if (ark_mem->tempv4 != NULL) {
-    fprintf(outfile, "tempv4:\n");
-    N_VPrint_Serial(ark_mem->tempv4);
-  }
-  if (ark_mem->constraints != NULL) {
-    fprintf(outfile, "constraints:\n");
-    N_VPrint_Serial(ark_mem->constraints);
-  }
+    N_VPrintFile(ark_mem->rwt, outfile);
+  }
+  fprintf(outfile, "ycur:\n");
+  N_VPrintFile(ark_mem->ycur, outfile);
+  fprintf(outfile, "yn:\n");
+  N_VPrintFile(ark_mem->yn, outfile);
+  fprintf(outfile, "fn:\n");
+  N_VPrintFile(ark_mem->fn, outfile);
+  fprintf(outfile, "tempv1:\n");
+  N_VPrintFile(ark_mem->tempv1, outfile);
+  fprintf(outfile, "tempv2:\n");
+  N_VPrintFile(ark_mem->tempv2, outfile);
+  fprintf(outfile, "tempv3:\n");
+  N_VPrintFile(ark_mem->tempv3, outfile);
+  fprintf(outfile, "tempv4:\n");
+  N_VPrintFile(ark_mem->tempv4, outfile);
+  fprintf(outfile, "constraints:\n");
+  N_VPrintFile(ark_mem->constraints, outfile);
 #endif
 
 }
@@ -1574,46 +1518,56 @@ void arkFreeVec(ARKodeMem ark_mem, N_Vector *v)
   arkResizeVec:
 
   This routine resizes a single vector based on a template
-  vector.  If the ARKVecResizeFn function is non-NULL, then it
+  vector. If the ARKVecResizeFn function is non-NULL, then it
   calls that routine to perform the single-vector resize;
   otherwise it deallocates and reallocates the target vector based
-  on the template vector.  If the resize is successful then this
-  returns SUNTRUE.  This routine also updates the optional outputs
-  lrw and liw, which are (respectively) the lengths of the overall
-  ARKode real and integer work spaces.
+  on the template vector. This routine also updates the optional
+  outputs lrw and liw, which are (respectively) the lengths of the
+  overall ARKode real and integer work spaces.
+
+  If the resize is successful then this returns SUNTRUE,
+  otherwise it returns SUNFALSE.
   ---------------------------------------------------------------*/
-int arkResizeVec(ARKodeMem ark_mem, ARKVecResizeFn resize,
-                 void *resize_data, sunindextype lrw_diff,
-                 sunindextype liw_diff, N_Vector tmpl, N_Vector *v)
+booleantype arkResizeVec(ARKodeMem ark_mem, ARKVecResizeFn resize,
+                         void *resize_data, sunindextype lrw_diff,
+                         sunindextype liw_diff, N_Vector tmpl, N_Vector *v)
 {
   if (*v != NULL) {
     if (resize == NULL) {
       N_VDestroy(*v);
+      *v = NULL;
       *v = N_VClone(tmpl);
+      if (*v == NULL) {
+        arkProcessError(ark_mem, ARK_MEM_FAIL, "ARKode",
+                        "arkResizeVec", "Unable to clone vector");
+        return(SUNFALSE);
+      }
     } else {
       if (resize(*v, tmpl, resize_data)) {
-        arkProcessError(ark_mem, ARK_ILL_INPUT, "ARKode",
+        arkProcessError(ark_mem, ARK_MEM_FAIL, "ARKode",
                         "arkResizeVec", MSG_ARK_RESIZE_FAIL);
-        return(ARK_ILL_INPUT);
+        return(SUNFALSE);
       }
     }
     ark_mem->lrw += lrw_diff;
     ark_mem->liw += liw_diff;
   }
-  return(ARK_SUCCESS);
+  return(SUNTRUE);
 }
 
+
 /*---------------------------------------------------------------
   arkAllocVectors:
 
   This routine allocates the ARKode vectors ewt, yn, tempv* and
-  ftemp.  If any of these vectors already exist, they are left
-  alone.  Otherwise, it will allocate each vector by cloning the
-  input vector. If all memory allocations are successful,
-  arkAllocVectors returns SUNTRUE. Otherwise all vector memory
-  is freed and arkAllocVectors returns SUNFALSE.  This routine
-  also updates the optional outputs lrw and liw, which are
-  (respectively) the lengths of the real and integer work spaces.
+  ftemp. If any of these vectors already exist, they are left
+  alone. Otherwise, it will allocate each vector by cloning the
+  input vector. This routine also updates the optional outputs
+  lrw and liw, which are (respectively) the lengths of the real
+  and integer work spaces.
+
+  If all memory allocations are successful, arkAllocVectors
+  returns SUNTRUE, otherwise it returns SUNFALSE.
   ---------------------------------------------------------------*/
 booleantype arkAllocVectors(ARKodeMem ark_mem, N_Vector tmpl)
 {
@@ -1652,6 +1606,83 @@ booleantype arkAllocVectors(ARKodeMem ark_mem, N_Vector tmpl)
   return(SUNTRUE);
 }
 
+/*---------------------------------------------------------------
+  arkResizeVectors:
+
+  This routine resizes all ARKode vectors if they exist,
+  otherwise they are left alone. If a resize function is provided
+  it is called to resize the vectors; otherwise each vector is
+  freed and a new vector is created by cloning the input vector.
+  This routine also updates the optional outputs lrw and liw,
+  which are (respectively) the lengths of the real and integer
+  work spaces.
+
+  If all memory allocations are successful, arkResizeVectors
+  returns SUNTRUE, otherwise it returns SUNFALSE.
+  ---------------------------------------------------------------*/
+booleantype arkResizeVectors(ARKodeMem ark_mem, ARKVecResizeFn resize,
+                             void *resize_data, sunindextype lrw_diff,
+                             sunindextype liw_diff, N_Vector tmpl)
+{
+  /* Vabstol */
+  if (!arkResizeVec(ark_mem, resize, resize_data, lrw_diff,
+                    liw_diff, tmpl, &ark_mem->Vabstol))
+    return(SUNFALSE);
+
+  /* VRabstol */
+  if (!arkResizeVec(ark_mem, resize, resize_data, lrw_diff,
+                    liw_diff, tmpl, &ark_mem->VRabstol))
+    return(SUNFALSE);
+
+  /* ewt */
+  if (!arkResizeVec(ark_mem, resize, resize_data, lrw_diff,
+                    liw_diff, tmpl, &ark_mem->ewt))
+    return(SUNFALSE);
+
+  /* rwt  */
+  if (ark_mem->rwt_is_ewt) {      /* update pointer to ewt */
+    ark_mem->rwt = ark_mem->ewt;
+  } else {                        /* resize if distinct from ewt */
+    if (!arkResizeVec(ark_mem, resize, resize_data, lrw_diff,
+                      liw_diff, tmpl, &ark_mem->rwt))
+      return(SUNFALSE);
+  }
+
+  /* yn */
+  if (!arkResizeVec(ark_mem, resize, resize_data, lrw_diff,
+                    liw_diff, tmpl, &ark_mem->yn))
+    return(SUNFALSE);
+
+  /* fn */
+  if (!arkResizeVec(ark_mem, resize, resize_data, lrw_diff,
+                    liw_diff, tmpl, &ark_mem->fn))
+    return(SUNFALSE);
+
+  /* tempv* */
+  if (!arkResizeVec(ark_mem, resize, resize_data, lrw_diff,
+                    liw_diff, tmpl, &ark_mem->tempv1))
+    return(SUNFALSE);
+
+  if (!arkResizeVec(ark_mem, resize, resize_data, lrw_diff,
+                    liw_diff, tmpl, &ark_mem->tempv2))
+    return(SUNFALSE);
+
+  if (!arkResizeVec(ark_mem, resize, resize_data, lrw_diff,
+                    liw_diff, tmpl, &ark_mem->tempv3))
+    return(SUNFALSE);
+
+  if (!arkResizeVec(ark_mem, resize, resize_data, lrw_diff,
+                    liw_diff, tmpl, &ark_mem->tempv4))
+    return(SUNFALSE);
+
+  /* constraints */
+  if (!arkResizeVec(ark_mem, resize, resize_data, lrw_diff,
+                    liw_diff, tmpl, &ark_mem->constraints))
+    return(SUNFALSE);
+
+  return(SUNTRUE);
+}
+
 
 /*---------------------------------------------------------------
   arkFreeVectors
@@ -1671,8 +1702,7 @@ void arkFreeVectors(ARKodeMem ark_mem)
   arkFreeVec(ark_mem, &ark_mem->yn);
   arkFreeVec(ark_mem, &ark_mem->fn);
   arkFreeVec(ark_mem, &ark_mem->Vabstol);
-  if (ark_mem->ConstraintsMallocDone)
-    arkFreeVec(ark_mem, &ark_mem->constraints);
+  arkFreeVec(ark_mem, &ark_mem->constraints);
 }
 
 
diff --git a/src/arkode/arkode_arkstep.c b/src/arkode/arkode_arkstep.c
index 2b4091438d..54ad01dd43 100644
--- a/src/arkode/arkode_arkstep.c
+++ b/src/arkode/arkode_arkstep.c
@@ -31,12 +31,6 @@
 #define RSYM ".16g"
 #endif
 
-#define NO_DEBUG_OUTPUT
-/* #define DEBUG_OUTPUT */
-#ifdef DEBUG_OUTPUT
-#include <nvector/nvector_serial.h>
-#endif
-
 /* constants */
 #define ZERO   RCONST(0.0)
 #define ONE    RCONST(1.0)
@@ -252,37 +246,48 @@ int ARKStepResize(void *arkode_mem, N_Vector y0, realtype hscale,
   }
 
   /* Resize the sdata, zpred and zcor vectors */
-  if (step_mem->sdata != NULL) {
-    retval = arkResizeVec(ark_mem, resize, resize_data, lrw_diff,
-                          liw_diff, y0, &step_mem->sdata);
-    if (retval != ARK_SUCCESS)  return(retval);
+  if (!arkResizeVec(ark_mem, resize, resize_data, lrw_diff,
+                    liw_diff, y0, &step_mem->sdata)) {
+    arkProcessError(ark_mem, ARK_MEM_FAIL, "ARKode::ARKStep", "ARKStepResize",
+                    "Unable to resize vector");
+    return(ARK_MEM_FAIL);
   }
-  if (step_mem->zpred != NULL) {
-    retval = arkResizeVec(ark_mem, resize, resize_data, lrw_diff,
-                          liw_diff, y0, &step_mem->zpred);
-    if (retval != ARK_SUCCESS)  return(retval);
+
+  if (!arkResizeVec(ark_mem, resize, resize_data, lrw_diff,
+                    liw_diff, y0, &step_mem->zpred)) {
+    arkProcessError(ark_mem, ARK_MEM_FAIL, "ARKode::ARKStep", "ARKStepResize",
+                    "Unable to resize vector");
+    return(ARK_MEM_FAIL);
   }
-  if (step_mem->zcor != NULL) {
-    retval = arkResizeVec(ark_mem, resize, resize_data, lrw_diff,
-                          liw_diff, y0, &step_mem->zcor);
-    if (retval != ARK_SUCCESS)  return(retval);
+
+  if (!arkResizeVec(ark_mem, resize, resize_data, lrw_diff,
+                    liw_diff, y0, &step_mem->zcor)) {
+    arkProcessError(ark_mem, ARK_MEM_FAIL, "ARKode::ARKStep", "ARKStepResize",
+                    "Unable to resize vector");
+    return(ARK_MEM_FAIL);
   }
 
   /* Resize the ARKStep vectors */
   /*     Fe */
   if (step_mem->Fe != NULL) {
     for (i=0; i<step_mem->stages; i++) {
-      retval = arkResizeVec(ark_mem, resize, resize_data, lrw_diff,
-                            liw_diff, y0, &step_mem->Fe[i]);
-      if (retval != ARK_SUCCESS)  return(retval);
+      if (!arkResizeVec(ark_mem, resize, resize_data, lrw_diff,
+                        liw_diff, y0, &step_mem->Fe[i])) {
+        arkProcessError(ark_mem, ARK_MEM_FAIL, "ARKode::ARKStep", "ARKStepResize",
+                        "Unable to resize vector");
+        return(ARK_MEM_FAIL);
+      }
     }
   }
   /*     Fi */
   if (step_mem->Fi != NULL) {
     for (i=0; i<step_mem->stages; i++) {
-      retval = arkResizeVec(ark_mem, resize, resize_data, lrw_diff,
-                            liw_diff, y0, &step_mem->Fi[i]);
-      if (retval != ARK_SUCCESS)  return(retval);
+      if (!arkResizeVec(ark_mem, resize, resize_data, lrw_diff,
+                        liw_diff, y0, &step_mem->Fi[i])) {
+        arkProcessError(ark_mem, ARK_MEM_FAIL, "ARKode::ARKStep", "ARKStepResize",
+                        "Unable to resize vector");
+        return(ARK_MEM_FAIL);
+      }
     }
   }
 
@@ -656,6 +661,10 @@ void ARKStepPrintMem(void* arkode_mem, FILE* outfile)
   ARKodeARKStepMem step_mem;
   int retval;
 
+#ifdef SUNDIALS_DEBUG_PRINTVEC
+  int i;
+#endif
+
   /* access ARKodeARKStepMem structure */
   retval = arkStep_AccessStepMem(arkode_mem, "ARKStepPrintMem",
                                  &ark_mem, &step_mem);
@@ -712,29 +721,23 @@ void ARKStepPrintMem(void* arkode_mem, FILE* outfile)
   fprintf(outfile,"ARKStep: rdiv = %"RSYM"\n", step_mem->rdiv);
   fprintf(outfile,"ARKStep: dgmax = %"RSYM"\n", step_mem->dgmax);
 
-#ifdef DEBUG_OUTPUT
+#ifdef SUNDIALS_DEBUG_PRINTVEC
   /* output vector quantities */
-  if (step_mem->sdata != NULL) {
-    fprintf(outfile, "ARKStep: sdata:\n");
-    N_VPrint_Serial(step_mem->sdata);
-  }
-  if (step_mem->zpred != NULL) {
-    fprintf(outfile, "ARKStep: zpred:\n");
-    N_VPrint_Serial(step_mem->zpred);
-  }
-  if (step_mem->zcor != NULL) {
-    fprintf(outfile, "ARKStep: zcor:\n");
-    N_VPrint_Serial(step_mem->zcor);
-  }
+  fprintf(outfile, "ARKStep: sdata:\n");
+  N_VPrintFile(step_mem->sdata, outfile);
+  fprintf(outfile, "ARKStep: zpred:\n");
+  N_VPrintFile(step_mem->zpred, outfile);
+  fprintf(outfile, "ARKStep: zcor:\n");
+  N_VPrintFile(step_mem->zcor, outfile);
   if (step_mem->Fe != NULL)
     for (i=0; i<step_mem->stages; i++) {
       fprintf(outfile,"ARKStep: Fe[%i]:\n", i);
-      N_VPrint_Serial(step_mem->Fe[i]);
+      N_VPrintFile(step_mem->Fe[i], outfile);
     }
   if (step_mem->Fi != NULL)
     for (i=0; i<step_mem->stages; i++) {
       fprintf(outfile,"ARKStep: Fi[%i]:\n", i);
-      N_VPrint_Serial(step_mem->Fi[i]);
+      N_VPrintFile(step_mem->Fi[i], outfile);
     }
 #endif
 }
@@ -1102,7 +1105,7 @@ int arkStep_Init(void* arkode_mem, int init_type)
       ark_mem->liw += step_mem->nfusedopvecs;   /* pointers */
     }
 
-    /* Limit interpolant degree based on method order (use negative 
+    /* Limit interpolant degree based on method order (use negative
        argument to specify update instead of overwrite) */
     if (ark_mem->interp != NULL) {
       retval = arkInterpSetDegree(ark_mem, ark_mem->interp, -(step_mem->q-1));
@@ -1115,10 +1118,10 @@ int arkStep_Init(void* arkode_mem, int init_type)
 
   } /* end (init_type == 0) */
 
-  /* If the bootstrap predictor is enabled, signal to shared arkode module that 
+  /* If the bootstrap predictor is enabled, signal to shared arkode module that
      fullrhs is required after each step */
   if (step_mem->predictor == 4)  ark_mem->call_fullrhs = SUNTRUE;
-  
+
   /* Check for consistency between linear system modules
        (e.g., if lsolve is direct, msolve needs to match) */
   if (step_mem->mass_mem != NULL) {  /* M != I */
@@ -1573,7 +1576,7 @@ int arkStep_TakeStep(void* arkode_mem, realtype *dsmPtr, int *nflagPtr)
     else
       ark_mem->tcur = ark_mem->tn + step_mem->Be->c[is]*ark_mem->h;
 
-#ifdef DEBUG_OUTPUT
+#ifdef SUNDIALS_DEBUG
     printf("step %li,  stage %i,  h = %"RSYM",  t_n = %"RSYM"\n",
            ark_mem->nst, is, ark_mem->h, ark_mem->tcur);
 #endif
@@ -1600,18 +1603,18 @@ int arkStep_TakeStep(void* arkode_mem, realtype *dsmPtr, int *nflagPtr)
       if (retval > 0)  return(TRY_AGAIN);
     }
 
-#ifdef DEBUG_OUTPUT
+#ifdef SUNDIALS_DEBUG_PRINTVEC
     printf("predictor:\n");
-    N_VPrint_Serial(step_mem->zpred);
+    N_VPrint(step_mem->zpred);
 #endif
 
     /* Set up data for evaluation of ARK stage residual (data stored in sdata) */
     retval = arkStep_StageSetup(ark_mem);
     if (retval != ARK_SUCCESS)  return (retval);
 
-#ifdef DEBUG_OUTPUT
+#ifdef SUNDIALS_DEBUG_PRINTVEC
     printf("rhs data:\n");
-    N_VPrint_Serial(step_mem->sdata);
+    N_VPrint(step_mem->sdata);
 #endif
 
     /* Solver diagnostics reporting */
@@ -1627,9 +1630,9 @@ int arkStep_TakeStep(void* arkode_mem, realtype *dsmPtr, int *nflagPtr)
       *nflagPtr = arkStep_Nls(ark_mem, *nflagPtr);
       if (*nflagPtr != ARK_SUCCESS)  return(TRY_AGAIN);
 
-#ifdef DEBUG_OUTPUT
+#ifdef SUNDIALS_DEBUG_PRINTVEC
       printf("nonlinear solution:\n");
-      N_VPrint_Serial(ark_mem->ycur);
+      N_VPrint(ark_mem->ycur);
 #endif
 
     /* otherwise no implicit solve is needed */
@@ -1649,9 +1652,9 @@ int arkStep_TakeStep(void* arkode_mem, realtype *dsmPtr, int *nflagPtr)
         N_VLinearSum(ONE, step_mem->sdata, ONE, step_mem->zpred, ark_mem->ycur);
       }
 
-#ifdef DEBUG_OUTPUT
+#ifdef SUNDIALS_DEBUG_PRINTVEC
       printf("explicit solution:\n");
-      N_VPrint_Serial(ark_mem->ycur);
+      N_VPrint(ark_mem->ycur);
 #endif
 
     }
@@ -1736,7 +1739,7 @@ int arkStep_TakeStep(void* arkode_mem, realtype *dsmPtr, int *nflagPtr)
     return(TRY_AGAIN);
   }
 
-#ifdef DEBUG_OUTPUT
+#ifdef SUNDIALS_DEBUG
   printf("error estimate = %"RSYM"\n", *dsmPtr);
 #endif
 
diff --git a/src/arkode/arkode_arkstep_io.c b/src/arkode/arkode_arkstep_io.c
index e1f29d259a..797b432434 100644
--- a/src/arkode/arkode_arkstep_io.c
+++ b/src/arkode/arkode_arkstep_io.c
@@ -210,6 +210,8 @@ int ARKStepSetMassPreconditioner(void *arkode_mem, ARKLsMassPrecSetupFn psetup,
 int ARKStepSetJacTimes(void *arkode_mem, ARKLsJacTimesSetupFn jtsetup,
                        ARKLsJacTimesVecFn jtimes) {
   return(arkLSSetJacTimes(arkode_mem, jtsetup, jtimes)); }
+int ARKStepSetJacTimesRhsFn(void *arkode_mem, ARKRhsFn jtimesRhsFn) {
+  return(arkLSSetJacTimesRhsFn(arkode_mem, jtimesRhsFn)); }
 int ARKStepSetMassTimes(void *arkode_mem, ARKLsMassTimesSetupFn msetup,
                         ARKLsMassTimesVecFn mtimes, void *mtimes_data) {
   return(arkLSSetMassTimes(arkode_mem, msetup, mtimes, mtimes_data)); }
diff --git a/src/arkode/arkode_erkstep.c b/src/arkode/arkode_erkstep.c
index bd3bbb2e04..f095532bc8 100644
--- a/src/arkode/arkode_erkstep.c
+++ b/src/arkode/arkode_erkstep.c
@@ -30,18 +30,11 @@
 #define RSYM ".16g"
 #endif
 
-#define NO_DEBUG_OUTPUT
-/* #define DEBUG_OUTPUT */
-#ifdef DEBUG_OUTPUT
-#include <nvector/nvector_serial.h>
-#endif
-
 /* constants */
 #define ZERO   RCONST(0.0)
 #define ONE    RCONST(1.0)
 
 
-
 /*===============================================================
   ERKStep Exported functions -- Required
   ===============================================================*/
@@ -172,9 +165,12 @@ int ERKStepResize(void *arkode_mem, N_Vector y0, realtype hscale,
 
   /* Resize the RHS vectors */
   for (i=0; i<step_mem->stages; i++) {
-    retval = arkResizeVec(ark_mem, resize, resize_data, lrw_diff,
-                       liw_diff, y0, &step_mem->F[i]);
-    if (retval != ARK_SUCCESS)  return(retval);
+    if (!arkResizeVec(ark_mem, resize, resize_data, lrw_diff,
+                      liw_diff, y0, &step_mem->F[i])) {
+      arkProcessError(ark_mem, ARK_MEM_FAIL, "ARKode::ERKStep", "ERKStepResize",
+                      "Unable to resize vector");
+      return(ARK_MEM_FAIL);
+    }
   }
 
   return(ARK_SUCCESS);
@@ -383,6 +379,10 @@ void ERKStepPrintMem(void* arkode_mem, FILE* outfile)
   ARKodeERKStepMem step_mem;
   int retval;
 
+#ifdef SUNDIALS_DEBUG_PRINTVEC
+  int i;
+#endif
+
   /* access ARKodeERKStepMem structure */
   retval = erkStep_AccessStepMem(arkode_mem, "ERKStepPrintMem",
                                  &ark_mem, &step_mem);
@@ -403,11 +403,11 @@ void ERKStepPrintMem(void* arkode_mem, FILE* outfile)
   fprintf(outfile,"ERKStep: Butcher table:\n");
   ARKodeButcherTable_Write(step_mem->B, outfile);
 
-#ifdef DEBUG_OUTPUT
+#ifdef SUNDIALS_DEBUG_PRINTVEC
   /* output vector quantities */
   for (i=0; i<step_mem->stages; i++) {
     fprintf(outfile,"ERKStep: F[%i]:\n", i);
-    N_VPrint_Serial(step_mem->F[i]);
+    N_VPrintFile(step_mem->F[i], outfile);
   }
 #endif
 }
@@ -688,9 +688,9 @@ int erkStep_TakeStep(void* arkode_mem, realtype *dsmPtr, int *nflagPtr)
   cvals = step_mem->cvals;
   Xvecs = step_mem->Xvecs;
 
-#ifdef DEBUG_OUTPUT
+#ifdef SUNDIALS_DEBUG_PRINTVEC
   printf("stage 0 RHS:\n");
-  N_VPrint_Serial(step_mem->F[0]);
+  N_VPrint(step_mem->F[0]);
 #endif
 
   /* Loop over internal stages to the step; since the method is explicit
@@ -700,7 +700,7 @@ int erkStep_TakeStep(void* arkode_mem, realtype *dsmPtr, int *nflagPtr)
     /* Set current stage time(s) */
     ark_mem->tcur = ark_mem->tn + step_mem->B->c[is]*ark_mem->h;
 
-#ifdef DEBUG_OUTPUT
+#ifdef SUNDIALS_DEBUG
     printf("step %li,  stage %i,  h = %"RSYM",  t_n = %"RSYM"\n",
            ark_mem->nst, is, ark_mem->h, ark_mem->tcur);
 #endif
@@ -740,9 +740,9 @@ int erkStep_TakeStep(void* arkode_mem, realtype *dsmPtr, int *nflagPtr)
     if (retval < 0)  return(ARK_RHSFUNC_FAIL);
     if (retval > 0)  return(ARK_UNREC_RHSFUNC_ERR);
 
-#ifdef DEBUG_OUTPUT
+#ifdef SUNDIALS_DEBUG_PRINTVEC
     printf("RHS:\n");
-    N_VPrint_Serial(step_mem->F[is]);
+    N_VPrint(step_mem->F[is]);
 #endif
 
   } /* loop over stages */
@@ -751,10 +751,12 @@ int erkStep_TakeStep(void* arkode_mem, realtype *dsmPtr, int *nflagPtr)
   retval = erkStep_ComputeSolutions(ark_mem, dsmPtr);
   if (retval < 0)  return(retval);
 
-#ifdef DEBUG_OUTPUT
+#ifdef SUNDIALS_DEBUG
   printf("error estimate = %"RSYM"\n", *dsmPtr);
+#endif
+#ifdef SUNDIALS_DEBUG_PRINTVEC
   printf("updated solution:\n");
-  N_VPrint_Serial(ark_mem->ycur);
+  N_VPrint(ark_mem->ycur);
 #endif
 
   /* Solver diagnostics reporting */
diff --git a/src/arkode/arkode_impl.h b/src/arkode/arkode_impl.h
index bda1282584..500cbbdb97 100644
--- a/src/arkode/arkode_impl.h
+++ b/src/arkode/arkode_impl.h
@@ -358,7 +358,6 @@ typedef struct ARKodeMemRec {
   realtype    tolsf;        /* tolerance scale factor (suggestion to user) */
   booleantype VabstolMallocDone;
   booleantype VRabstolMallocDone;
-  booleantype ConstraintsMallocDone;
   booleantype MallocDone;
   booleantype resized;      /* denotes first step after ARKodeResize      */
   booleantype firststage;   /* denotes first stage in simulation          */
@@ -838,18 +837,24 @@ booleantype arkAllocVec(ARKodeMem ark_mem,
                         N_Vector tmpl,
                         N_Vector *v);
 void arkFreeVec(ARKodeMem ark_mem, N_Vector *v);
-int arkResizeVec(ARKodeMem ark_mem,
-                 ARKVecResizeFn resize,
-                 void *resize_data,
-                 sunindextype lrw_diff,
-                 sunindextype liw_diff,
-                 N_Vector tmpl,
-                 N_Vector *v);
+booleantype arkResizeVec(ARKodeMem ark_mem,
+                         ARKVecResizeFn resize,
+                         void *resize_data,
+                         sunindextype lrw_diff,
+                         sunindextype liw_diff,
+                         N_Vector tmpl,
+                         N_Vector *v);
 void arkPrintMem(ARKodeMem ark_mem, FILE *outfile);
 booleantype arkCheckTimestepper(ARKodeMem ark_mem);
 booleantype arkCheckNvector(N_Vector tmpl);
 booleantype arkAllocVectors(ARKodeMem ark_mem,
                             N_Vector tmpl);
+booleantype arkResizeVectors(ARKodeMem ark_mem,
+                             ARKVecResizeFn resize,
+                             void *resize_data,
+                             sunindextype lrw_diff,
+                             sunindextype liw_diff,
+                             N_Vector tmpl);
 void arkFreeVectors(ARKodeMem ark_mem);
 
 int arkInitialSetup(ARKodeMem ark_mem, realtype tout);
diff --git a/src/arkode/arkode_interp.c b/src/arkode/arkode_interp.c
index ec01e9fb06..fd29ca73a3 100644
--- a/src/arkode/arkode_interp.c
+++ b/src/arkode/arkode_interp.c
@@ -32,11 +32,6 @@
 #define RSYM ".16g"
 #endif
 
-#define NO_DEBUG_OUTPUT
-#ifdef DEBUG_OUTPUT
-#include <nvector/nvector_serial.h>
-#endif
-
 
 /*---------------------------------------------------------------
   Section I: generic ARKInterp functions provided by all
@@ -191,7 +186,6 @@ int arkInterpResize_Hermite(void* arkode_mem, ARKInterp interp,
                             sunindextype lrw_diff, sunindextype liw_diff,
                             N_Vector y0)
 {
-  int ier;
   ARKodeMem ark_mem;
 
   /* access ARKodeMem structure */
@@ -200,26 +194,22 @@ int arkInterpResize_Hermite(void* arkode_mem, ARKInterp interp,
 
   /* resize vectors */
   if (interp == NULL)  return(ARK_SUCCESS);
-  if (HINT_FOLD(interp) != NULL) {
-    ier = arkResizeVec(ark_mem, resize, resize_data, lrw_diff,
-                       liw_diff, y0, &HINT_FOLD(interp));
-    if (ier != ARK_SUCCESS)  return(ier);
-  }
-  if (HINT_YOLD(interp) != NULL) {
-    ier = arkResizeVec(ark_mem, resize, resize_data, lrw_diff,
-                       liw_diff, y0, &HINT_YOLD(interp));
-    if (ier != ARK_SUCCESS)  return(ier);
-  }
-  if (HINT_FA(interp) != NULL) {
-    ier = arkResizeVec(ark_mem, resize, resize_data, lrw_diff,
-                       liw_diff, y0, &HINT_FA(interp));
-    if (ier != ARK_SUCCESS)  return(ier);
-  }
-  if (HINT_FB(interp) != NULL) {
-    ier = arkResizeVec(ark_mem, resize, resize_data, lrw_diff,
-                       liw_diff, y0, &HINT_FB(interp));
-    if (ier != ARK_SUCCESS)  return(ier);
-  }
+
+  if (!arkResizeVec(ark_mem, resize, resize_data, lrw_diff,
+                    liw_diff, y0, &HINT_FOLD(interp)))
+    return(ARK_MEM_FAIL);
+
+  if (!arkResizeVec(ark_mem, resize, resize_data, lrw_diff,
+                    liw_diff, y0, &HINT_YOLD(interp)))
+    return(ARK_MEM_FAIL);
+
+  if (!arkResizeVec(ark_mem, resize, resize_data, lrw_diff,
+                    liw_diff, y0, &HINT_FA(interp)))
+    return(ARK_MEM_FAIL);
+
+  if (!arkResizeVec(ark_mem, resize, resize_data, lrw_diff,
+                    liw_diff, y0, &HINT_FB(interp)))
+    return(ARK_MEM_FAIL);
 
   /* update ynew and fnew pointers */
   HINT_YNEW(interp) = ark_mem->yn;
@@ -302,31 +292,19 @@ void arkInterpPrintMem_Hermite(ARKInterp interp, FILE *outfile)
     fprintf(outfile, "arkode_interp (Hermite): told = %"RSYM"\n", HINT_TOLD(interp));
     fprintf(outfile, "arkode_interp (Hermite): tnew = %"RSYM"\n", HINT_TNEW(interp));
     fprintf(outfile, "arkode_interp (Hermite): h = %"RSYM"\n", HINT_H(interp));
-#ifdef DEBUG_OUTPUT
-    if (HINT_FOLD(interp) != NULL) {
-      fprintf(outfile, "arkode_interp (Hermite): fold:\n");
-      N_VPrint_Serial(HINT_FOLD(interp));
-    }
-    if (HINT_FNEW(interp) != NULL) {
-      fprintf(outfile, "arkode_interp (Hermite): fnew:\n");
-      N_VPrint_Serial(HINT_FNEW(interp));
-    }
-    if (HINT_YOLD(interp) != NULL) {
-      fprintf(outfile, "arkode_interp (Hermite): yold:\n");
-      N_VPrint_Serial(HINT_YOLD(interp));
-    }
-    if (HINT_YNEW(interp) != NULL) {
-      fprintf(outfile, "arkode_interp (Hermite): ynew:\n");
-      N_VPrint_Serial(HINT_YNEW(interp));
-    }
-    if (HINT_FA(interp) != NULL) {
-      fprintf(outfile, "arkode_interp (Hermite): fa:\n");
-      N_VPrint_Serial(HINT_FA(interp));
-    }
-    if (HINT_FB(interp) != NULL) {
-      fprintf(outfile, "arkode_interp (Hermite): fb:\n");
-      N_VPrint_Serial(HINT_FB(interp));
-    }
+#ifdef SUNDIALS_DEBUG_PRINTVEC
+    fprintf(outfile, "arkode_interp (Hermite): fold:\n");
+    N_VPrintFile(HINT_FOLD(interp), outfile);
+    fprintf(outfile, "arkode_interp (Hermite): fnew:\n");
+    N_VPrintFile(HINT_FNEW(interp), outfile);
+    fprintf(outfile, "arkode_interp (Hermite): yold:\n");
+    N_VPrintFile(HINT_YOLD(interp), outfile);
+    fprintf(outfile, "arkode_interp (Hermite): ynew:\n");
+    N_VPrintFile(HINT_YNEW(interp), outfile);
+    fprintf(outfile, "arkode_interp (Hermite): fa:\n");
+    N_VPrintFile(HINT_FA(interp), outfile);
+    fprintf(outfile, "arkode_interp (Hermite): fb:\n");
+    N_VPrintFile(HINT_FB(interp), outfile);
 #endif
   }
 }
@@ -844,7 +822,7 @@ int arkInterpResize_Lagrange(void* arkode_mem, ARKInterp I,
                              sunindextype lrw_diff, sunindextype liw_diff,
                              N_Vector y0)
 {
-  int ier, i;
+  int i;
   ARKodeMem ark_mem;
 
   /* access ARKodeMem structure */
@@ -855,9 +833,9 @@ int arkInterpResize_Lagrange(void* arkode_mem, ARKInterp I,
   if (I == NULL)  return(ARK_SUCCESS);
   if (LINT_YHIST(I) != NULL) {
     for (i=0; i<LINT_NMAXALLOC(I); i++) {
-      ier = arkResizeVec(ark_mem, resize, resize_data, lrw_diff,
-                         liw_diff, y0, &(LINT_YJ(I,i)));
-      if (ier != ARK_SUCCESS)  return(ier);
+      if (!arkResizeVec(ark_mem, resize, resize_data, lrw_diff,
+                        liw_diff, y0, &(LINT_YJ(I,i))))
+        return(ARK_MEM_FAIL);
     }
   }
 
@@ -946,13 +924,11 @@ void arkInterpPrintMem_Lagrange(ARKInterp I, FILE *outfile)
         fprintf(outfile, "  %p",(void*) LINT_YJ(I,i));
       fprintf(outfile, "\n");
     }
-#ifdef DEBUG_OUTPUT
+#ifdef SUNDIALS_DEBUG_PRINTVEC
     if (LINT_YHIST(I) != NULL) {
       for (i=0; i<LINT_NMAX(I); i++) {
-        if (LINT_YJ(I,i) != NULL) {
-          fprintf(outfile, "arkode_interp (Lagrange): yhist[%i]:\n",i);
-          N_VPrint_Serial(LINT_YJ(I,i));
-        }
+        fprintf(outfile, "arkode_interp (Lagrange): yhist[%i]:\n",i);
+        N_VPrintFile(LINT_YJ(I,i), outfile);
       }
     }
 #endif
diff --git a/src/arkode/arkode_io.c b/src/arkode/arkode_io.c
index 63d6762931..9be70f5b50 100644
--- a/src/arkode/arkode_io.c
+++ b/src/arkode/arkode_io.c
@@ -696,14 +696,9 @@ int arkSetConstraints(void *arkode_mem, N_Vector constraints)
   }
   ark_mem = (ARKodeMem) arkode_mem;
 
-  /* If there are no constarints, destroy data structures */
+  /* If there are no constraints, destroy data structures */
   if (constraints == NULL) {
-    if (ark_mem->ConstraintsMallocDone) {
-      N_VDestroy(ark_mem->constraints);
-      ark_mem->lrw -= ark_mem->lrw1;
-      ark_mem->liw -= ark_mem->liw1;
-    }
-    ark_mem->ConstraintsMallocDone = SUNFALSE;
+    arkFreeVec(ark_mem, &ark_mem->constraints);
     ark_mem->constraintsSet = SUNFALSE;
     return(ARK_SUCCESS);
   }
@@ -727,16 +722,12 @@ int arkSetConstraints(void *arkode_mem, N_Vector constraints)
     return(ARK_ILL_INPUT);
   }
 
-  if ( !(ark_mem->ConstraintsMallocDone) ) {
-    ark_mem->constraints = N_VClone(constraints);
-    ark_mem->lrw += ark_mem->lrw1;
-    ark_mem->liw += ark_mem->liw1;
-    ark_mem->ConstraintsMallocDone = SUNTRUE;
-  }
+  /* Allocate the internal constraints vector (if necessary) */
+  if (!arkAllocVec(ark_mem, constraints, &ark_mem->constraints))
+    return(ARK_MEM_FAIL);
 
   /* Load the constraints vector */
   N_VScale(ONE, constraints, ark_mem->constraints);
-
   ark_mem->constraintsSet = SUNTRUE;
 
   return(ARK_SUCCESS);
diff --git a/src/arkode/arkode_ls.c b/src/arkode/arkode_ls.c
index 28f7d04288..295ac95d51 100644
--- a/src/arkode/arkode_ls.c
+++ b/src/arkode/arkode_ls.c
@@ -169,6 +169,15 @@ int arkLSSetLinearSolver(void *arkode_mem, SUNLinearSolver LS,
   arkls_mem->jtsetup  = NULL;
   arkls_mem->jtimes   = arkLsDQJtimes;
   arkls_mem->Jt_data  = ark_mem;
+  arkls_mem->Jt_f     = ark_mem->step_getimplicitrhs(arkode_mem);
+
+  if (arkls_mem->Jt_f == NULL) {
+    arkProcessError(ark_mem, ARKLS_ILL_INPUT, "ARKLS",
+                    "arkLSSetLinearSolver",
+                    "Time step module is missing implicit RHS fcn");
+    free(arkls_mem); arkls_mem = NULL;
+    return(ARKLS_ILL_INPUT);
+  }
 
   arkls_mem->user_linsys = SUNFALSE;
   arkls_mem->linsys      = arkLsLinSys;
@@ -692,6 +701,54 @@ int arkLSSetJacTimes(void *arkode_mem,
     arkls_mem->jtsetup  = NULL;
     arkls_mem->jtimes   = arkLsDQJtimes;
     arkls_mem->Jt_data  = ark_mem;
+    arkls_mem->Jt_f     = ark_mem->step_getimplicitrhs(arkode_mem);
+
+    if (arkls_mem->Jt_f == NULL) {
+      arkProcessError(ark_mem, ARKLS_ILL_INPUT, "ARKLS",
+                      "arkLSSetJacTimes",
+                      "Time step module is missing implicit RHS fcn");
+      return(ARKLS_ILL_INPUT);
+    }
+  }
+
+  return(ARKLS_SUCCESS);
+}
+
+/*---------------------------------------------------------------
+  arkLSSetJacTimesRhsFn specifies an alternative user-supplied
+  ODE right-hand side function to use in the internal finite
+  difference Jacobian-vector product.
+  ---------------------------------------------------------------*/
+int arkLSSetJacTimesRhsFn(void *arkode_mem, ARKRhsFn jtimesRhsFn)
+{
+  ARKodeMem ark_mem;
+  ARKLsMem  arkls_mem;
+  int       retval;
+
+  /* access ARKLsMem structure */
+  retval = arkLs_AccessLMem(arkode_mem, "arkLSSetJacTimesRhsFn",
+                            &ark_mem, &arkls_mem);
+  if (retval != ARK_SUCCESS) return(retval);
+
+  /* check if using internal finite difference approximation */
+  if (!(arkls_mem->jtimesDQ)) {
+    arkProcessError(ark_mem, ARKLS_ILL_INPUT, "ARKLS", "arkLSSetJacTimesRhsFn",
+                    "Internal finite-difference Jacobian-vector product is disabled.");
+    return(ARKLS_ILL_INPUT);
+  }
+
+  /* store the DQ RHS function pointer (NULL selects the stepper's implicit RHS) */
+  if (jtimesRhsFn != NULL) {
+    arkls_mem->Jt_f = jtimesRhsFn;
+  } else {
+    arkls_mem->Jt_f = ark_mem->step_getimplicitrhs(arkode_mem);
+
+    if (arkls_mem->Jt_f == NULL) {
+      arkProcessError(ark_mem, ARKLS_ILL_INPUT, "ARKLS",
+                      "arkLSSetJacTimesRhsFn",
+                      "Time step module is missing implicit RHS fcn");
+      return(ARKLS_ILL_INPUT);
+    }
   }
 
   return(ARKLS_SUCCESS);
@@ -1949,7 +2006,6 @@ int arkLsDQJtimes(N_Vector v, N_Vector Jv, realtype t,
 {
   ARKodeMem ark_mem;
   ARKLsMem  arkls_mem;
-  ARKRhsFn  fi;
   realtype  sig, siginv;
   int       iter, retval;
 
@@ -1961,22 +2017,13 @@ int arkLsDQJtimes(N_Vector v, N_Vector Jv, realtype t,
   /* Initialize perturbation to 1/||v|| */
   sig = ONE/N_VWrmsNorm(v, ark_mem->ewt);
 
-  /* Access implicit RHS function */
-  fi = ark_mem->step_getimplicitrhs(arkode_mem);
-  if (fi == NULL) {
-    arkProcessError(ark_mem, ARKLS_ILL_INPUT, "ARKLS",
-                    "arkLsDQJtimes",
-                    "Time step module is missing implicit RHS fcn");
-    return(ARKLS_ILL_INPUT);
-  }
-
   for (iter=0; iter<MAX_DQITERS; iter++) {
 
     /* Set work = y + sig*v */
     N_VLinearSum(sig, v, ONE, y, work);
 
     /* Set Jv = f(tn, y+sig*v) */
-    retval = fi(t, work, Jv, ark_mem->user_data);
+    retval = arkls_mem->Jt_f(t, work, Jv, ark_mem->user_data);
     arkls_mem->nfeDQ++;
     if (retval == 0) break;
     if (retval < 0)  return(-1);
diff --git a/src/arkode/arkode_ls_impl.h b/src/arkode/arkode_ls_impl.h
index e5d11df2c0..69fa555ae4 100644
--- a/src/arkode/arkode_ls_impl.h
+++ b/src/arkode/arkode_ls_impl.h
@@ -107,8 +107,10 @@ typedef struct ARKLsMemRec {
   booleantype jtimesDQ;
   ARKLsJacTimesSetupFn jtsetup;
   ARKLsJacTimesVecFn jtimes;
+  ARKRhsFn Jt_f;
   void *Jt_data;
 
+
   /* Linear system setup function
    * (a) user-provided linsys function:
    *     - user_linsys = SUNTRUE
@@ -271,6 +273,7 @@ int arkLSSetMassPreconditioner(void* arkode_mem, ARKLsMassPrecSetupFn psetup,
                                ARKLsMassPrecSolveFn psolve);
 int arkLSSetJacTimes(void* arkode_mem, ARKLsJacTimesSetupFn jtsetup,
                      ARKLsJacTimesVecFn jtimes);
+int arkLSSetJacTimesRhsFn(void *arkode_mem, ARKRhsFn jtimesRhsFn);
 int arkLSSetMassTimes(void* arkode_mem, ARKLsMassTimesSetupFn msetup,
                       ARKLsMassTimesVecFn mtimes, void* mtimes_data);
 int arkLSSetLinSysFn(void* arkode_mem, ARKLsLinSysFn linsys);
diff --git a/src/arkode/arkode_mristep.c b/src/arkode/arkode_mristep.c
index ae568f9a49..72153427b0 100644
--- a/src/arkode/arkode_mristep.c
+++ b/src/arkode/arkode_mristep.c
@@ -30,18 +30,11 @@
 #define RSYM ".16g"
 #endif
 
-#define NO_DEBUG_OUTPUT
-/* #define DEBUG_OUTPUT */
-#ifdef DEBUG_OUTPUT
-#include <nvector/nvector_serial.h>
-#endif
-
 /* constants */
 #define ZERO   RCONST(0.0)
 #define ONE    RCONST(1.0)
 
 
-
 /*===============================================================
   MRIStep Exported functions -- Required
   ===============================================================*/
@@ -120,17 +113,23 @@ int MRIStepResize(void *arkode_mem, N_Vector y0, realtype t0,
   /* Resize the inner forcing vector */
   if (step_mem->inner_forcing != NULL) {
     for (i = 0; i < step_mem->inner_num_forcing; i++) {
-      retval = arkResizeVec(ark_mem, resize, resize_data, lrw_diff,
-                            liw_diff, y0, &(step_mem->inner_forcing[i]));
-      if (retval != ARK_SUCCESS) return(retval);
+      if (!arkResizeVec(ark_mem, resize, resize_data, lrw_diff,
+                        liw_diff, y0, &(step_mem->inner_forcing[i]))) {
+        arkProcessError(ark_mem, ARK_MEM_FAIL, "ARKode::MRIStep", "MRIStepResize",
+                        "Unable to resize vector");
+        return(ARK_MEM_FAIL);
+      }
     }
   }
 
   /* Resize the RHS vectors */
   for (i=0; i<step_mem->stages; i++) {
-    retval = arkResizeVec(ark_mem, resize, resize_data, lrw_diff,
-                          liw_diff, y0, &step_mem->F[i]);
-    if (retval != ARK_SUCCESS)  return(retval);
+    if (!arkResizeVec(ark_mem, resize, resize_data, lrw_diff,
+                      liw_diff, y0, &step_mem->F[i])) {
+      arkProcessError(ark_mem, ARK_MEM_FAIL, "ARKode::MRIStep", "MRIStepResize",
+                      "Unable to resize vector");
+      return(ARK_MEM_FAIL);
+    }  /* report under MRIStep, matching the inner-forcing resize above */
   }
 
   return(ARK_SUCCESS);
@@ -348,6 +347,10 @@ void MRIStepPrintMem(void* arkode_mem, FILE* outfile)
   ARKodeMRIStepMem step_mem;
   int retval;
 
+#ifdef SUNDIALS_DEBUG_PRINTVEC
+  int i;
+#endif
+
   /* access ARKodeMRIStepMem structure */
   retval = mriStep_AccessStepMem(arkode_mem, "MRIStepPrintMem",
                                  &ark_mem, &step_mem);
@@ -369,11 +372,11 @@ void MRIStepPrintMem(void* arkode_mem, FILE* outfile)
   fprintf(outfile,"MRIStep: Butcher table:\n");
   ARKodeButcherTable_Write(step_mem->B, outfile);
 
-#ifdef DEBUG_OUTPUT
+#ifdef SUNDIALS_DEBUG_PRINTVEC
   /* output vector quantities */
   for (i=0; i<step_mem->stages; i++) {
     fprintf(outfile,"MRIStep: F[%i]:\n", i);
-    N_VPrint_Serial(step_mem->F[i]);
+    N_VPrintFile(step_mem->F[i], outfile);
   }
 #endif
 
@@ -807,9 +810,9 @@ int mriStep_TakeStep(void* arkode_mem, realtype *dsmPtr, int *nflagPtr)
                                  &ark_mem, &step_mem);
   if (retval != ARK_SUCCESS) return(retval);
 
-#ifdef DEBUG_OUTPUT
+#ifdef SUNDIALS_DEBUG_PRINTVEC
     printf("stage 0 RHS:\n");
-    N_VPrint_Serial(step_mem->F[0]);
+    N_VPrint(step_mem->F[0]);
 #endif
 
   /* Loop over internal stages to the step; since the method is explicit
@@ -819,7 +822,7 @@ int mriStep_TakeStep(void* arkode_mem, realtype *dsmPtr, int *nflagPtr)
     /* Set current stage time */
     ark_mem->tcur = ark_mem->tn + step_mem->B->c[is]*ark_mem->h;
 
-#ifdef DEBUG_OUTPUT
+#ifdef SUNDIALS_DEBUG
     printf("step %li,  stage %i,  h = %"RSYM",  t_n = %"RSYM"\n",
            ark_mem->nst, is, ark_mem->h, ark_mem->tcur);
 #endif
@@ -882,9 +885,9 @@ int mriStep_TakeStep(void* arkode_mem, realtype *dsmPtr, int *nflagPtr)
     if (retval < 0)  return(ARK_RHSFUNC_FAIL);
     if (retval > 0)  return(ARK_UNREC_RHSFUNC_ERR);
 
-#ifdef DEBUG_OUTPUT
+#ifdef SUNDIALS_DEBUG_PRINTVEC
     printf("RHS:\n");
-    N_VPrint_Serial(step_mem->F[is]);
+    N_VPrint(step_mem->F[is]);
 #endif
 
   } /* loop over stages */
@@ -931,10 +934,9 @@ int mriStep_TakeStep(void* arkode_mem, realtype *dsmPtr, int *nflagPtr)
     if (retval != 0) return(ARK_INNERTOOUTER_FAIL);
   }
 
-#ifdef DEBUG_OUTPUT
-    printf("error estimate = %"RSYM"\n", dsm);
+#ifdef SUNDIALS_DEBUG_PRINTVEC
     printf("updated solution:\n");
-    N_VPrint_Serial(ark_mem->ycur);
+    N_VPrint(ark_mem->ycur);
 #endif
 
   /* Solver diagnostics reporting */
diff --git a/src/arkode/fmod/farkode_arkstep_mod.c b/src/arkode/fmod/farkode_arkstep_mod.c
index 4141aade3d..82a8f16f39 100644
--- a/src/arkode/fmod/farkode_arkstep_mod.c
+++ b/src/arkode/fmod/farkode_arkstep_mod.c
@@ -679,6 +679,20 @@ SWIGEXPORT int _wrap_FARKStepSetMaxGrowth(void *farg1, double const *farg2) {
 }
 
 
+SWIGEXPORT int _wrap_FARKStepSetMinReduction(void *farg1, double const *farg2) {
+  int fresult ;
+  void *arg1 = (void *) 0 ;
+  realtype arg2 ;
+  int result;
+  
+  arg1 = (void *)(farg1);
+  arg2 = (realtype)(*farg2);
+  result = (int)ARKStepSetMinReduction(arg1,arg2);
+  fresult = (int)(result);
+  return fresult;
+}
+
+
 SWIGEXPORT int _wrap_FARKStepSetFixedStepBounds(void *farg1, double const *farg2, double const *farg3) {
   int fresult ;
   void *arg1 = (void *) 0 ;
@@ -1313,6 +1327,20 @@ SWIGEXPORT int _wrap_FARKStepSetJacTimes(void *farg1, ARKLsJacTimesSetupFn farg2
 }
 
 
+SWIGEXPORT int _wrap_FARKStepSetJacTimesRhsFn(void *farg1, ARKRhsFn farg2) {
+  int fresult ;
+  void *arg1 = (void *) 0 ;
+  ARKRhsFn arg2 = (ARKRhsFn) 0 ;
+  int result;
+  
+  arg1 = (void *)(farg1);
+  arg2 = (ARKRhsFn)(farg2);
+  result = (int)ARKStepSetJacTimesRhsFn(arg1,arg2);
+  fresult = (int)(result);
+  return fresult;
+}
+
+
 SWIGEXPORT int _wrap_FARKStepSetMassTimes(void *farg1, ARKLsMassTimesSetupFn farg2, ARKLsMassTimesVecFn farg3, void *farg4) {
   int fresult ;
   void *arg1 = (void *) 0 ;
diff --git a/src/arkode/fmod/farkode_arkstep_mod.f90 b/src/arkode/fmod/farkode_arkstep_mod.f90
index 1316adae4e..4dffada757 100644
--- a/src/arkode/fmod/farkode_arkstep_mod.f90
+++ b/src/arkode/fmod/farkode_arkstep_mod.f90
@@ -82,6 +82,7 @@ module farkode_arkstep_mod
  public :: FARKStepSetSafetyFactor
  public :: FARKStepSetErrorBias
  public :: FARKStepSetMaxGrowth
+ public :: FARKStepSetMinReduction
  public :: FARKStepSetFixedStepBounds
  public :: FARKStepSetAdaptivityMethod
  public :: FARKStepSetAdaptivityFn
@@ -126,6 +127,7 @@ module farkode_arkstep_mod
  public :: FARKStepSetPreconditioner
  public :: FARKStepSetMassPreconditioner
  public :: FARKStepSetJacTimes
+ public :: FARKStepSetJacTimesRhsFn
  public :: FARKStepSetMassTimes
  public :: FARKStepSetLinSysFn
  public :: FARKStepEvolve
@@ -474,6 +476,15 @@ function swigc_FARKStepSetMaxGrowth(farg1, farg2) &
 integer(C_INT) :: fresult
 end function
 
+function swigc_FARKStepSetMinReduction(farg1, farg2) &
+bind(C, name="_wrap_FARKStepSetMinReduction") &
+result(fresult)
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR), value :: farg1
+real(C_DOUBLE), intent(in) :: farg2
+integer(C_INT) :: fresult
+end function
+
 function swigc_FARKStepSetFixedStepBounds(farg1, farg2, farg3) &
 bind(C, name="_wrap_FARKStepSetFixedStepBounds") &
 result(fresult)
@@ -879,6 +890,15 @@ function swigc_FARKStepSetJacTimes(farg1, farg2, farg3) &
 integer(C_INT) :: fresult
 end function
 
+function swigc_FARKStepSetJacTimesRhsFn(farg1, farg2) &
+bind(C, name="_wrap_FARKStepSetJacTimesRhsFn") &
+result(fresult)
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR), value :: farg1
+type(C_FUNPTR), value :: farg2
+integer(C_INT) :: fresult
+end function
+
 function swigc_FARKStepSetMassTimes(farg1, farg2, farg3, farg4) &
 bind(C, name="_wrap_FARKStepSetMassTimes") &
 result(fresult)
@@ -1948,6 +1968,22 @@ function FARKStepSetMaxGrowth(arkode_mem, mx_growth) &
 swig_result = fresult
 end function
 
+function FARKStepSetMinReduction(arkode_mem, eta_min) &
+result(swig_result)
+use, intrinsic :: ISO_C_BINDING
+integer(C_INT) :: swig_result
+type(C_PTR) :: arkode_mem
+real(C_DOUBLE), intent(in) :: eta_min
+integer(C_INT) :: fresult 
+type(C_PTR) :: farg1 
+real(C_DOUBLE) :: farg2 
+
+farg1 = arkode_mem
+farg2 = eta_min
+fresult = swigc_FARKStepSetMinReduction(farg1, farg2)
+swig_result = fresult
+end function
+
 function FARKStepSetFixedStepBounds(arkode_mem, lb, ub) &
 result(swig_result)
 use, intrinsic :: ISO_C_BINDING
@@ -2679,6 +2715,22 @@ function FARKStepSetJacTimes(arkode_mem, jtsetup, jtimes) &
 swig_result = fresult
 end function
 
+function FARKStepSetJacTimesRhsFn(arkode_mem, jtimesrhsfn) &
+result(swig_result)
+use, intrinsic :: ISO_C_BINDING
+integer(C_INT) :: swig_result
+type(C_PTR) :: arkode_mem
+type(C_FUNPTR), intent(in), value :: jtimesrhsfn
+integer(C_INT) :: fresult 
+type(C_PTR) :: farg1 
+type(C_FUNPTR) :: farg2 
+
+farg1 = arkode_mem
+farg2 = jtimesrhsfn
+fresult = swigc_FARKStepSetJacTimesRhsFn(farg1, farg2)
+swig_result = fresult
+end function
+
 function FARKStepSetMassTimes(arkode_mem, msetup, mtimes, mtimes_data) &
 result(swig_result)
 use, intrinsic :: ISO_C_BINDING
diff --git a/src/arkode/fmod/farkode_erkstep_mod.c b/src/arkode/fmod/farkode_erkstep_mod.c
index a9b0736852..7d22a1d8f6 100644
--- a/src/arkode/fmod/farkode_erkstep_mod.c
+++ b/src/arkode/fmod/farkode_erkstep_mod.c
@@ -503,6 +503,20 @@ SWIGEXPORT int _wrap_FERKStepSetMaxGrowth(void *farg1, double const *farg2) {
 }
 
 
+SWIGEXPORT int _wrap_FERKStepSetMinReduction(void *farg1, double const *farg2) {
+  int fresult ;
+  void *arg1 = (void *) 0 ;
+  realtype arg2 ;
+  int result;
+  
+  arg1 = (void *)(farg1);
+  arg2 = (realtype)(*farg2);
+  result = (int)ERKStepSetMinReduction(arg1,arg2);
+  fresult = (int)(result);
+  return fresult;
+}
+
+
 SWIGEXPORT int _wrap_FERKStepSetFixedStepBounds(void *farg1, double const *farg2, double const *farg3) {
   int fresult ;
   void *arg1 = (void *) 0 ;
diff --git a/src/arkode/fmod/farkode_erkstep_mod.f90 b/src/arkode/fmod/farkode_erkstep_mod.f90
index a48ad53b3c..09faa7e8b3 100644
--- a/src/arkode/fmod/farkode_erkstep_mod.f90
+++ b/src/arkode/fmod/farkode_erkstep_mod.f90
@@ -60,6 +60,7 @@ module farkode_erkstep_mod
  public :: FERKStepSetSafetyFactor
  public :: FERKStepSetErrorBias
  public :: FERKStepSetMaxGrowth
+ public :: FERKStepSetMinReduction
  public :: FERKStepSetFixedStepBounds
  public :: FERKStepSetAdaptivityMethod
  public :: FERKStepSetAdaptivityFn
@@ -290,6 +291,15 @@ function swigc_FERKStepSetMaxGrowth(farg1, farg2) &
 integer(C_INT) :: fresult
 end function
 
+function swigc_FERKStepSetMinReduction(farg1, farg2) &
+bind(C, name="_wrap_FERKStepSetMinReduction") &
+result(fresult)
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR), value :: farg1
+real(C_DOUBLE), intent(in) :: farg2
+integer(C_INT) :: fresult
+end function
+
 function swigc_FERKStepSetFixedStepBounds(farg1, farg2, farg3) &
 bind(C, name="_wrap_FERKStepSetFixedStepBounds") &
 result(fresult)
@@ -1098,6 +1108,22 @@ function FERKStepSetMaxGrowth(arkode_mem, mx_growth) &
 swig_result = fresult
 end function
 
+function FERKStepSetMinReduction(arkode_mem, eta_min) &
+result(swig_result)
+use, intrinsic :: ISO_C_BINDING
+integer(C_INT) :: swig_result
+type(C_PTR) :: arkode_mem
+real(C_DOUBLE), intent(in) :: eta_min
+integer(C_INT) :: fresult 
+type(C_PTR) :: farg1 
+real(C_DOUBLE) :: farg2 
+
+farg1 = arkode_mem
+farg2 = eta_min
+fresult = swigc_FERKStepSetMinReduction(farg1, farg2)
+swig_result = fresult
+end function
+
 function FERKStepSetFixedStepBounds(arkode_mem, lb, ub) &
 result(swig_result)
 use, intrinsic :: ISO_C_BINDING
diff --git a/src/cvode/CMakeLists.txt b/src/cvode/CMakeLists.txt
index a591122c62..1e11e3759c 100644
--- a/src/cvode/CMakeLists.txt
+++ b/src/cvode/CMakeLists.txt
@@ -32,9 +32,13 @@ set(cvode_SOURCES
   cvode_io.c
   cvode_ls.c
   cvode_nls.c
+  cvode_proj.c
   cvode_spils.c
   )
 
+set(cvode_cuda_SOURCES cvode_fused_cuda.cu)
+set(cvode_stub_SOURCES cvode_fused_stubs.c)
+
 # Add variable shared_SOURCES with the common SUNDIALS sources which will
 # also be included in the CVODE library
 set(shared_SOURCES
@@ -88,6 +92,7 @@ set(cvode_HEADERS
   cvode_diag.h
   cvode_direct.h
   cvode_ls.h
+  cvode_proj.h
   cvode_spils.h
   )
 
@@ -106,15 +111,36 @@ add_definitions(-DBUILD_SUNDIALS_LIBRARY)
 if(BUILD_STATIC_LIBS)
   # Add the build target for the static CVODE library
   add_library(sundials_cvode_static STATIC
-    ${cvode_SOURCES} ${shared_SOURCES} ${sunmatrix_SOURCES} ${sunlinsol_SOURCES} ${sunnonlinsol_SOURCES}
+    ${cvode_SOURCES} ${shared_SOURCES}
+    ${sunmatrix_SOURCES} ${sunlinsol_SOURCES}
+    ${sunnonlinsol_SOURCES}
   )
 
+  if(SUNDIALS_BUILD_PACKAGE_FUSED_KERNELS)
+    add_library(sundials_cvode_fused_cuda_static STATIC ${cvode_cuda_SOURCES})
+
+    target_link_libraries(sundials_cvode_fused_cuda_static
+      PUBLIC sundials_nveccuda_static)
+
+    set_target_properties(sundials_cvode_fused_cuda_static
+      PROPERTIES OUTPUT_NAME sundials_cvode_fused_cuda CLEAN_DIRECT_OUTPUT 1)
+
+    add_library(sundials_cvode_fused_stubs_static STATIC ${cvode_stub_SOURCES})
+
+    set_target_properties(sundials_cvode_fused_stubs_static
+      PROPERTIES OUTPUT_NAME sundials_cvode_fused_stubs CLEAN_DIRECT_OUTPUT 1)
+  endif()
+
   # Set the library name and make sure it is not deleted
   set_target_properties(sundials_cvode_static
     PROPERTIES OUTPUT_NAME sundials_cvode CLEAN_DIRECT_OUTPUT 1)
 
   # Install the CVODE library
   install(TARGETS sundials_cvode_static DESTINATION ${CMAKE_INSTALL_LIBDIR})
+  if(SUNDIALS_BUILD_PACKAGE_FUSED_KERNELS)
+    install(TARGETS sundials_cvode_fused_cuda_static DESTINATION ${CMAKE_INSTALL_LIBDIR})
+    install(TARGETS sundials_cvode_fused_stubs_static DESTINATION ${CMAKE_INSTALL_LIBDIR})
+  endif()
 
 endif(BUILD_STATIC_LIBS)
 
@@ -122,11 +148,28 @@ endif(BUILD_STATIC_LIBS)
 if(BUILD_SHARED_LIBS)
   # Add the build target for the shared CVODE library
   add_library(sundials_cvode_shared SHARED
-    ${cvode_SOURCES} ${shared_SOURCES} ${sunmatrix_SOURCES} ${sunlinsol_SOURCES} ${sunnonlinsol_SOURCES}
+    ${cvode_SOURCES} ${shared_SOURCES}
+    ${sunmatrix_SOURCES} ${sunlinsol_SOURCES}
+    ${sunnonlinsol_SOURCES}
   )
 
+  if(SUNDIALS_BUILD_PACKAGE_FUSED_KERNELS)
+    add_library(sundials_cvode_fused_cuda_shared SHARED ${cvode_cuda_SOURCES})
+
+    target_link_libraries(sundials_cvode_fused_cuda_shared
+      PUBLIC sundials_nveccuda_shared)
+
+    set_target_properties(sundials_cvode_fused_cuda_shared
+      PROPERTIES OUTPUT_NAME sundials_cvode_fused_cuda CLEAN_DIRECT_OUTPUT 1)
+
+    add_library(sundials_cvode_fused_stubs_shared SHARED ${cvode_stub_SOURCES})
+
+    set_target_properties(sundials_cvode_fused_stubs_shared
+      PROPERTIES OUTPUT_NAME sundials_cvode_fused_stubs CLEAN_DIRECT_OUTPUT 1)
+  endif()
+
   if(UNIX)
-    target_link_libraries(sundials_cvode_shared m)
+    target_link_libraries(sundials_cvode_shared PRIVATE m)
   endif()
 
   # Set the library name and make sure it is not deleted
@@ -139,6 +182,10 @@ if(BUILD_SHARED_LIBS)
 
   # Install the CVODE library
   install(TARGETS sundials_cvode_shared DESTINATION ${CMAKE_INSTALL_LIBDIR})
+  if(SUNDIALS_BUILD_PACKAGE_FUSED_KERNELS)
+    install(TARGETS sundials_cvode_fused_cuda_shared DESTINATION ${CMAKE_INSTALL_LIBDIR})
+    install(TARGETS sundials_cvode_fused_stubs_shared DESTINATION ${CMAKE_INSTALL_LIBDIR})
+  endif()
 
 endif(BUILD_SHARED_LIBS)
 
diff --git a/src/cvode/README.md b/src/cvode/README.md
index f6806bb7a4..065985bb08 100644
--- a/src/cvode/README.md
+++ b/src/cvode/README.md
@@ -1,5 +1,5 @@
 # CVODE
-### Version 5.2.0 (Mar 2020)
+### Version 5.3.0 (May 2020)
 
 **Alan C. Hindmarsh and Radu Serban  
   Center for Applied Scientific Computing, LLNL**
@@ -45,11 +45,11 @@ the "SUNDIALS Release History" appendix of the CVODE User Guide.
 
 ## References
 
-* A. C. Hindmarsh and R. Serban, "User Documentation for CVODE v5.2.0,"
-  LLNL technical report UCRL-SM-208108, Mar 2020.
+* A. C. Hindmarsh and R. Serban, "User Documentation for CVODE v5.3.0,"
+  LLNL technical report UCRL-SM-208108, May 2020.
 
-* A. C. Hindmarsh and R. Serban, "Example Programs for CVODE v5.2.0,"
-  LLNL technical report UCRL-SM-208110, Mar 2020.
+* A. C. Hindmarsh and R. Serban, "Example Programs for CVODE v5.3.0,"
+  LLNL technical report UCRL-SM-208110, May 2020.
 
 * S.D. Cohen and A.C. Hindmarsh, "CVODE, a Stiff/nonstiff ODE Solver in C,"
   Computers in Physics, 10(2), pp. 138-143, 1996.
diff --git a/src/cvode/cvode.c b/src/cvode/cvode.c
index c77d6b6364..953e0f2886 100644
--- a/src/cvode/cvode.c
+++ b/src/cvode/cvode.c
@@ -53,54 +53,6 @@
 /* CVODE Routine-Specific Constants                                */
 /*=================================================================*/
 
-/*
- * Control constants for lower-level functions used by cvStep
- * ----------------------------------------------------------
- *
- * cvHin return values:
- *    CV_SUCCESS
- *    CV_RHSFUNC_FAIL
- *    CV_TOO_CLOSE
- *
- * cvStep control constants:
- *    DO_ERROR_TEST
- *    PREDICT_AGAIN
- *
- * cvStep return values:
- *    CV_SUCCESS,
- *    CV_LSETUP_FAIL,  CV_LSOLVE_FAIL,
- *    CV_RHSFUNC_FAIL, CV_RTFUNC_FAIL
- *    CV_CONV_FAILURE, CV_ERR_FAILURE,
- *    CV_FIRST_RHSFUNC_ERR
- *
- * cvNls input nflag values:
- *    FIRST_CALL
- *    PREV_CONV_FAIL
- *    PREV_ERR_FAIL
- *
- * cvNls return values:
- *    CV_SUCCESS,
- *    CV_LSETUP_FAIL, CV_LSOLVE_FAIL, CV_RHSFUNC_FAIL,
- *    CONV_FAIL, RHSFUNC_RECVR
- *
- * cvNewtonIteration return values:
- *    CV_SUCCESS,
- *    CV_LSOLVE_FAIL, CV_RHSFUNC_FAIL
- *    CONV_FAIL, RHSFUNC_RECVR,
- *    TRY_AGAIN
- *
- */
-
-#define DO_ERROR_TEST    +2
-#define PREDICT_AGAIN    +3
-
-#define TRY_AGAIN        +5
-#define FIRST_CALL       +6
-#define PREV_CONV_FAIL   +7
-#define PREV_ERR_FAIL    +8
-
-#define CONSTR_RECVR     +10
-
 /*
  * Control constants for lower-level rootfinding functions
  * -------------------------------------------------------
@@ -237,6 +189,24 @@ static void cvFreeVectors(CVodeMem cv_mem);
 static int cvEwtSetSS(CVodeMem cv_mem, N_Vector ycur, N_Vector weight);
 static int cvEwtSetSV(CVodeMem cv_mem, N_Vector ycur, N_Vector weight);
 
+#ifdef SUNDIALS_BUILD_PACKAGE_FUSED_KERNELS
+extern
+int cvEwtSetSS_fused(const booleantype atolmin0,
+                     const realtype reltol,
+                     const realtype Sabstol,
+                     const N_Vector ycur,
+                     N_Vector tempv,
+                     N_Vector weight);
+
+extern
+int cvEwtSetSV_fused(const booleantype atolmin0,
+                     const realtype reltol,
+                     const N_Vector Vabstol,
+                     const N_Vector ycur,
+                     N_Vector tempv,
+                     N_Vector weight);
+#endif
+
 /* Initial stepsize calculation */
 
 static int cvHin(CVodeMem cv_mem, realtype tout);
@@ -255,7 +225,6 @@ static void cvAdjustAdams(CVodeMem cv_mem, int deltaq);
 static void cvAdjustBDF(CVodeMem cv_mem, int deltaq);
 static void cvIncreaseBDF(CVodeMem cv_mem);
 static void cvDecreaseBDF(CVodeMem cv_mem);
-static void cvRescale(CVodeMem cv_mem);
 static void cvPredict(CVodeMem cv_mem);
 static void cvSet(CVodeMem cv_mem);
 static void cvSetAdams(CVodeMem cv_mem);
@@ -271,12 +240,18 @@ static void cvSetTqBDF(CVodeMem cv_mem, realtype hsum, realtype alpha0,
 static int cvNls(CVodeMem cv_mem, int nflag);
 
 static int cvCheckConstraints(CVodeMem cv_mem);
+#ifdef SUNDIALS_BUILD_PACKAGE_FUSED_KERNELS
+extern
+int cvCheckConstraints_fused(const N_Vector c,
+                             const N_Vector ewt,
+                             const N_Vector y,
+                             const N_Vector mm,
+                             N_Vector tempv);
+#endif
 
 static int cvHandleNFlag(CVodeMem cv_mem, int *nflagPtr, realtype saved_t,
                          int *ncfPtr);
 
-static void cvRestore(CVodeMem cv_mem, realtype saved_t);
-
 /* Error Test */
 
 static int cvDoErrorTest(CVodeMem cv_mem, int *nflagPtr,
@@ -307,6 +282,7 @@ static int cvRcheck2(CVodeMem cv_mem);
 static int cvRcheck3(CVodeMem cv_mem);
 static int cvRootfind(CVodeMem cv_mem);
 
+
 /*
  * =================================================================
  * Exported Functions Implementation
@@ -361,30 +337,32 @@ void *CVodeCreate(int lmm)
   cv_mem->cv_uround = UNIT_ROUNDOFF;
 
   /* Set default values for integrator optional inputs */
-  cv_mem->cv_f           = NULL;
-  cv_mem->cv_user_data   = NULL;
-  cv_mem->cv_itol        = CV_NN;
-  cv_mem->cv_atolmin0    = SUNTRUE;
-  cv_mem->cv_user_efun   = SUNFALSE;
-  cv_mem->cv_efun        = NULL;
-  cv_mem->cv_e_data      = NULL;
-  cv_mem->cv_ehfun       = cvErrHandler;
-  cv_mem->cv_eh_data     = cv_mem;
-  cv_mem->cv_errfp       = stderr;
-  cv_mem->cv_qmax        = maxord;
-  cv_mem->cv_mxstep      = MXSTEP_DEFAULT;
-  cv_mem->cv_mxhnil      = MXHNIL_DEFAULT;
-  cv_mem->cv_sldeton     = SUNFALSE;
-  cv_mem->cv_hin         = ZERO;
-  cv_mem->cv_hmin        = HMIN_DEFAULT;
-  cv_mem->cv_hmax_inv    = HMAX_INV_DEFAULT;
-  cv_mem->cv_tstopset    = SUNFALSE;
-  cv_mem->cv_maxnef      = MXNEF;
-  cv_mem->cv_maxncf      = MXNCF;
-  cv_mem->cv_nlscoef     = CORTES;
-  cv_mem->convfail       = CV_NO_FAILURES;
-  cv_mem->cv_constraints = NULL;
-  cv_mem->cv_constraintsSet = SUNFALSE;
+  cv_mem->cv_f                = NULL;
+  cv_mem->cv_user_data        = NULL;
+  cv_mem->cv_itol             = CV_NN;
+  cv_mem->cv_atolmin0         = SUNTRUE;
+  cv_mem->cv_user_efun        = SUNFALSE;
+  cv_mem->cv_efun             = NULL;
+  cv_mem->cv_e_data           = NULL;
+  cv_mem->cv_ehfun            = cvErrHandler;
+  cv_mem->cv_eh_data          = cv_mem;
+  cv_mem->cv_monitorfun       = NULL;
+  cv_mem->cv_monitor_interval = 0;
+  cv_mem->cv_errfp            = stderr;
+  cv_mem->cv_qmax             = maxord;
+  cv_mem->cv_mxstep           = MXSTEP_DEFAULT;
+  cv_mem->cv_mxhnil           = MXHNIL_DEFAULT;
+  cv_mem->cv_sldeton          = SUNFALSE;
+  cv_mem->cv_hin              = ZERO;
+  cv_mem->cv_hmin             = HMIN_DEFAULT;
+  cv_mem->cv_hmax_inv         = HMAX_INV_DEFAULT;
+  cv_mem->cv_tstopset         = SUNFALSE;
+  cv_mem->cv_maxnef           = MXNEF;
+  cv_mem->cv_maxncf           = MXNCF;
+  cv_mem->cv_nlscoef          = CORTES;
+  cv_mem->convfail            = CV_NO_FAILURES;
+  cv_mem->cv_constraints      = NULL;
+  cv_mem->cv_constraintsSet   = SUNFALSE;
 
   /* Initialize root finding variables */
 
@@ -398,6 +376,11 @@ void *CVodeCreate(int lmm)
   cv_mem->cv_gactive    = NULL;
   cv_mem->cv_mxgnull    = 1;
 
+  /* Initialize projection variables */
+  cv_mem->proj_mem     = NULL;
+  cv_mem->proj_enabled = SUNFALSE;
+  cv_mem->proj_applied = SUNFALSE;
+
   /* Set the saved value for qmax_alloc */
 
   cv_mem->cv_qmax_alloc = maxord;
@@ -417,6 +400,9 @@ void *CVodeCreate(int lmm)
   cv_mem->NLS    = NULL;
   cv_mem->ownNLS = SUNFALSE;
 
+  /* Initialize fused operations variable */
+  cv_mem->cv_usefused = SUNFALSE;
+
   /* Return pointer to CVODE memory block */
 
   return((void *)cv_mem);
@@ -1832,6 +1818,24 @@ static int cvInitialSetup(CVodeMem cv_mem)
     return(CV_NLS_INIT_FAIL);
   }
 
+  /* Initialize projection data */
+  if (cv_mem->proj_enabled && cv_mem->proj_mem == NULL) {
+    cvProcessError(cv_mem, CV_PROJ_MEM_NULL, "CVODE",
+                   "cvInitialSetup", MSG_CV_PROJ_MEM_NULL);
+    return(CV_PROJ_MEM_NULL);
+  }
+
+  if (cv_mem->proj_mem != NULL) {
+    ier = cvProjInit(cv_mem->proj_mem);
+    if (ier != CV_SUCCESS) {
+      cvProcessError(cv_mem, CV_MEM_FAIL, "CVODE", "cvInitialSetup",
+                     MSGCV_MEM_FAIL);
+      return(CV_MEM_FAIL);
+    }
+    cv_mem->proj_applied = SUNFALSE;
+  }
+
+  /* Initial setup complete */
   return(CV_SUCCESS);
 }
 
@@ -2076,16 +2080,31 @@ static int cvYddNorm(CVodeMem cv_mem, realtype hg, realtype *yddnrm)
 
 static int cvStep(CVodeMem cv_mem)
 {
-  realtype saved_t, dsm;
-  int ncf, nef;
-  int nflag, kflag, eflag;
+  realtype saved_t;          /* time to restore to if a failure occurs   */
+  realtype dsm;              /* local truncation error estimate          */
+  int ncf;                   /* corrector failures in this step attempt  */
+  int npf;                   /* projection failures in this step attempt */
+  int nef;                   /* error test failures in this step attempt */
+  int nflag, kflag;          /* nonlinear solver flags                   */
+  int pflag;                 /* projection return flag                   */
+  int eflag;                 /* error test return flag                   */
+  booleantype doProjection;  /* flag to apply projection in this step    */
 
   saved_t = cv_mem->cv_tn;
-  ncf = nef = 0;
+  ncf = npf = nef = 0;
   nflag = FIRST_CALL;
+  doProjection = SUNFALSE;
 
-  if ((cv_mem->cv_nst > 0) && (cv_mem->cv_hprime != cv_mem->cv_h))
+  /* If the step size has changed, update the history array */
+  if ((cv_mem->cv_nst > 0) && (cv_mem->cv_hprime != cv_mem->cv_h)) {
     cvAdjustParams(cv_mem);
+  }
+
+  /* Check if this step should be projected */
+  if (cv_mem->proj_enabled)
+    doProjection = cv_mem->proj_mem->freq > 0 &&
+      (cv_mem->cv_nst == 0 || (cv_mem->cv_nst >= cv_mem->proj_mem->nstlprj
+                               + cv_mem->proj_mem->freq));
 
   /* Looping point for attempts to take a step */
   for(;;) {
@@ -2099,9 +2118,24 @@ static int cvStep(CVodeMem cv_mem)
     /* Go back in loop if we need to predict again (nflag=PREV_CONV_FAIL) */
     if (kflag == PREDICT_AGAIN) continue;
 
-    /* Return if nonlinear solve failed and recovery not possible. */
+    /* Return if nonlinear solve failed and recovery is not possible. */
     if (kflag != DO_ERROR_TEST) return(kflag);
 
+    /* Check if a projection needs to be performed */
+    cv_mem->proj_applied = SUNFALSE;
+
+    if (doProjection) {
+
+      /* Perform projection (nflag=CV_SUCCESS) */
+      pflag = cvDoProjection(cv_mem, &nflag, saved_t, &npf);
+
+      /* Go back in loop if we need to predict again (nflag=PREV_PROJ_FAIL) */
+      if (pflag == PREDICT_AGAIN) continue;
+
+      /* Return if projection failed and recovery is not possible */
+      if (pflag != CV_SUCCESS) return(pflag);
+    }
+
     /* Perform error test (nflag=CV_SUCCESS) */
     eflag = cvDoErrorTest(cv_mem, &nflag, saved_t, &nef, &dsm);
 
@@ -2346,7 +2380,7 @@ static void cvDecreaseBDF(CVodeMem cv_mem)
  * h is rescaled by eta, and hscale is reset to h.
  */
 
-static void cvRescale(CVodeMem cv_mem)
+void cvRescale(CVodeMem cv_mem)
 {
   int j;
 
@@ -2560,7 +2594,15 @@ static realtype cvAltSum(int iend, realtype a[], int k)
  *                                 q-1
  * Lambda(x) = (1 + x / xi*_q) * PRODUCT (1 + x / xi_i) , where
  *                                 i=1
- *  xi_i = [t_n - t_(n-i)] / h.
+ *
+ * The components of the array p (for projections) are the
+ * coefficients of a polynomial Phi(x) = p_0 + p_1 x + ... + p_q x^q,
+ * given by
+ *             q
+ * Phi(x) = PRODUCT (1 + x / xi_i)
+ *            i=1
+ *
+ * Here xi_i = [t_n - t_(n-i)] / h.
  *
  * The array tq is set to test quantities used in the convergence
  * test, the error test, and the selection of h at a new order.
@@ -2575,6 +2617,11 @@ static void cvSetBDF(CVodeMem cv_mem)
   for (i=2; i <= cv_mem->cv_q; i++) cv_mem->cv_l[i] = ZERO;
   alpha0 = alpha0_hat = -ONE;
   hsum = cv_mem->cv_h;
+
+  if (cv_mem->proj_enabled)
+    for (i=0; i <= cv_mem->cv_q; i++)
+      cv_mem->cv_p[i] = cv_mem->cv_l[i];
+
   if (cv_mem->cv_q > 1) {
     for (j=2; j < cv_mem->cv_q; j++) {
       hsum += cv_mem->cv_tau[j-1];
@@ -2590,6 +2637,11 @@ static void cvSetBDF(CVodeMem cv_mem)
     hsum += cv_mem->cv_tau[cv_mem->cv_q-1];
     xi_inv = cv_mem->cv_h / hsum;
     alpha0_hat = -cv_mem->cv_l[1] - xi_inv;
+
+    if (cv_mem->proj_enabled) /* Phi = (l polynomial so far) * (1 + x*xi_inv) */
+      for (i = cv_mem->cv_q; i >= 1; i--)
+        cv_mem->cv_p[i] = cv_mem->cv_l[i] + cv_mem->cv_l[i-1] * xi_inv;
+
     for (i=cv_mem->cv_q; i >= 1; i--)
       cv_mem->cv_l[i] += cv_mem->cv_l[i-1]*xistar_inv;
   }
@@ -2731,11 +2783,24 @@ static int cvCheckConstraints(CVodeMem cv_mem)
   /* Constraints not met */
 
   /* Compute correction to satisfy constraints */
-  N_VCompare(ONEPT5, cv_mem->cv_constraints, tmp); /* a[i]=1 when |c[i]|=2  */
-  N_VProd(tmp, cv_mem->cv_constraints, tmp);       /* a * c                 */
-  N_VDiv(tmp, cv_mem->cv_ewt, tmp);                /* a * c * wt            */
-  N_VLinearSum(ONE, cv_mem->cv_y, -PT1, tmp, tmp); /* y - 0.1 * a * c * wt  */
-  N_VProd(tmp, mm, tmp);                           /* v = mm*(y-0.1*a*c*wt) */
+#ifdef SUNDIALS_BUILD_PACKAGE_FUSED_KERNELS
+  if (cv_mem->cv_usefused)
+  {
+    cvCheckConstraints_fused(cv_mem->cv_constraints,
+                             cv_mem->cv_ewt,
+                             cv_mem->cv_y,
+                             mm,
+                             tmp);
+  }
+  else
+#endif
+  {
+    N_VCompare(ONEPT5, cv_mem->cv_constraints, tmp); /* a[i]=1 when |c[i]|=2  */
+    N_VProd(tmp, cv_mem->cv_constraints, tmp);       /* a * c                 */
+    N_VDiv(tmp, cv_mem->cv_ewt, tmp);                /* a * c * wt            */
+    N_VLinearSum(ONE, cv_mem->cv_y, -PT1, tmp, tmp); /* y - 0.1 * a * c * wt  */
+    N_VProd(tmp, mm, tmp);                           /* v = mm*(y-0.1*a*c*wt) */
+  }
 
   vnorm = N_VWrmsNorm(tmp, cv_mem->cv_ewt);        /* ||v|| */
 
@@ -2850,7 +2915,7 @@ static int cvHandleNFlag(CVodeMem cv_mem, int *nflagPtr, realtype saved_t,
  * the same values as before the call to cvPredict.
  */
 
-static void cvRestore(CVodeMem cv_mem, realtype saved_t)
+void cvRestore(CVodeMem cv_mem, realtype saved_t)
 {
   int j, k;
 
@@ -2992,12 +3057,28 @@ static void cvCompleteStep(CVodeMem cv_mem)
   /* Apply correction to column j of zn: l_j * Delta_n */
   (void) N_VScaleAddMulti(cv_mem->cv_q+1, cv_mem->cv_l, cv_mem->cv_acor,
                           cv_mem->cv_zn, cv_mem->cv_zn);
+
+  /* Apply the projection correction to column j of zn: p_j * Delta_n */
+  if (cv_mem->proj_applied) {
+    (void) N_VScaleAddMulti(cv_mem->cv_q+1,
+                            cv_mem->cv_p, cv_mem->cv_tempv, /* tempv = acorP */
+                            cv_mem->cv_zn, cv_mem->cv_zn);
+  }
+
   cv_mem->cv_qwait--;
   if ((cv_mem->cv_qwait == 1) && (cv_mem->cv_q != cv_mem->cv_qmax)) {
     N_VScale(ONE, cv_mem->cv_acor, cv_mem->cv_zn[cv_mem->cv_qmax]);
     cv_mem->cv_saved_tq5 = cv_mem->cv_tq[5];
     cv_mem->cv_indx_acor = cv_mem->cv_qmax;
   }
+
+#ifdef SUNDIALS_BUILD_WITH_MONITORING
+  /* Call the user monitor function, if set, every cv_monitor_interval steps */
+  if (cv_mem->cv_monitorfun != NULL && cv_mem->cv_monitor_interval > 0 &&
+      !(cv_mem->cv_nst % cv_mem->cv_monitor_interval)) { /* interval 0 = off */
+    cv_mem->cv_monitorfun((void*) cv_mem, cv_mem->cv_user_data);
+  }
+#endif
 }
 
 /*
@@ -3242,6 +3323,18 @@ static int cvHandleFailure(CVodeMem cv_mem, int flag)
     cvProcessError(cv_mem, CV_NLS_FAIL, "CVODE", "CVode",
                    MSGCV_NLS_FAIL, cv_mem->cv_tn);
     break;
+  case CV_PROJ_MEM_NULL:
+    cvProcessError(cv_mem, CV_PROJ_MEM_NULL, "CVODE", "CVode",
+                   MSG_CV_PROJ_MEM_NULL);
+    break;
+  case CV_PROJFUNC_FAIL:
+    cvProcessError(cv_mem, CV_PROJFUNC_FAIL, "CVODE", "CVode",
+                   MSG_CV_PROJFUNC_FAIL, cv_mem->cv_tn);
+    break;
+  case CV_REPTD_PROJFUNC_ERR:
+    cvProcessError(cv_mem, CV_REPTD_PROJFUNC_ERR, "CVODE", "CVode",
+                   MSG_CV_REPTD_PROJFUNC_ERR, cv_mem->cv_tn);
+    break;
   default:
     /* This return should never happen */
     cvProcessError(cv_mem, CV_UNRECOGNIZED_ERR, "CVODE", "CVode",
@@ -4135,13 +4228,30 @@ int cvEwtSet(N_Vector ycur, N_Vector weight, void *data)
 
 static int cvEwtSetSS(CVodeMem cv_mem, N_Vector ycur, N_Vector weight)
 {
-  N_VAbs(ycur, cv_mem->cv_tempv);
-  N_VScale(cv_mem->cv_reltol, cv_mem->cv_tempv, cv_mem->cv_tempv);
-  N_VAddConst(cv_mem->cv_tempv, cv_mem->cv_Sabstol, cv_mem->cv_tempv);
-  if (cv_mem->cv_atolmin0) {
-    if (N_VMin(cv_mem->cv_tempv) <= ZERO) return(-1);
+#ifdef SUNDIALS_BUILD_PACKAGE_FUSED_KERNELS
+  if (cv_mem->cv_usefused)
+  {
+    /* Weight (the inverse of tempv) is computed regardless of the component
+       test; if the test below fails, weight is discarded anyway. */
+    cvEwtSetSS_fused(cv_mem->cv_atolmin0, cv_mem->cv_reltol,
+                     cv_mem->cv_Sabstol, ycur, cv_mem->cv_tempv,
+                     weight);
+    if (cv_mem->cv_atolmin0) {
+      if (N_VMin(cv_mem->cv_tempv) <= ZERO) return(-1);
+    }
+  }
+  else
+#endif
+  {
+    N_VAbs(ycur, cv_mem->cv_tempv);
+    N_VScale(cv_mem->cv_reltol, cv_mem->cv_tempv, cv_mem->cv_tempv);
+    N_VAddConst(cv_mem->cv_tempv, cv_mem->cv_Sabstol, cv_mem->cv_tempv);
+    if (cv_mem->cv_atolmin0) {
+      if (N_VMin(cv_mem->cv_tempv) <= ZERO) return(-1);
+    }
+    N_VInv(cv_mem->cv_tempv, weight);
   }
-  N_VInv(cv_mem->cv_tempv, weight);
+
   return(0);
 }
 
@@ -4157,13 +4267,30 @@ static int cvEwtSetSS(CVodeMem cv_mem, N_Vector ycur, N_Vector weight)
 
 static int cvEwtSetSV(CVodeMem cv_mem, N_Vector ycur, N_Vector weight)
 {
-  N_VAbs(ycur, cv_mem->cv_tempv);
-  N_VLinearSum(cv_mem->cv_reltol, cv_mem->cv_tempv, ONE,
-               cv_mem->cv_Vabstol, cv_mem->cv_tempv);
-  if (cv_mem->cv_atolmin0) {
-    if (N_VMin(cv_mem->cv_tempv) <= ZERO) return(-1);
+#ifdef SUNDIALS_BUILD_PACKAGE_FUSED_KERNELS
+  if (cv_mem->cv_usefused)
+  {
+    /* Weight (the inverse of tempv) is computed regardless of the component
+       test; if the test below fails, weight is discarded anyway. */
+    cvEwtSetSV_fused(cv_mem->cv_atolmin0, cv_mem->cv_reltol,
+                     cv_mem->cv_Vabstol, ycur, cv_mem->cv_tempv,
+                     weight);
+    if (cv_mem->cv_atolmin0) {
+      if (N_VMin(cv_mem->cv_tempv) <= ZERO) return(-1);
+    }
+  }
+  else
+#endif
+  {
+    N_VAbs(ycur, cv_mem->cv_tempv);
+    N_VLinearSum(cv_mem->cv_reltol, cv_mem->cv_tempv, ONE,
+                 cv_mem->cv_Vabstol, cv_mem->cv_tempv);
+    if (cv_mem->cv_atolmin0) {
+      if (N_VMin(cv_mem->cv_tempv) <= ZERO) return(-1);
+    }
+    N_VInv(cv_mem->cv_tempv, weight);
   }
-  N_VInv(cv_mem->cv_tempv, weight);
+
   return(0);
 }
 
diff --git a/src/cvode/cvode_diag.c b/src/cvode/cvode_diag.c
index fd1ff7d9e5..94352ecca9 100644
--- a/src/cvode/cvode_diag.c
+++ b/src/cvode/cvode_diag.c
@@ -1,7 +1,7 @@
 /*
- * ----------------------------------------------------------------- 
+ * -----------------------------------------------------------------
  * Programmer(s): Scott D. Cohen, Alan C. Hindmarsh and
- *                Radu Serban @ LLNL
+ *                Radu Serban, Cody J. Balos @ LLNL
  * -----------------------------------------------------------------
  * SUNDIALS Copyright Start
  * Copyright (c) 2002-2020, Lawrence Livermore National Security
@@ -23,8 +23,33 @@
 #include "cvode_diag_impl.h"
 #include "cvode_impl.h"
 
+#ifdef SUNDIALS_BUILD_PACKAGE_FUSED_KERNELS
+extern
+int cvDiagSetup_formY(const realtype h,
+                      const realtype r,
+                      const N_Vector fpred,
+                      const N_Vector zn1,
+                      const N_Vector ypred,
+                      N_Vector ftemp,
+                      N_Vector y);
+
+extern
+int cvDiagSetup_buildM(const realtype fract,
+                       const realtype uround,
+                       const realtype h,
+                       const N_Vector ftemp,
+                       const N_Vector fpred,
+                       const N_Vector ewt,
+                       N_Vector bit,
+                       N_Vector bitcomp,
+                       N_Vector y,
+                       N_Vector M);
+
+extern int cvDiagSolve_updateM(const realtype r, N_Vector M);
+#endif
+
 /* Other Constants */
-  
+
 #define FRACT RCONST(0.1)
 #define ONE   RCONST(1.0)
 
@@ -71,7 +96,7 @@ static int CVDiagFree(CVodeMem cv_mem);
 
 /*
  * -----------------------------------------------------------------
- * CVDiag 
+ * CVDiag
  * -----------------------------------------------------------------
  * This routine initializes the memory record and sets various function
  * fields specific to the diagonal linear solver module.  CVDense first
@@ -82,11 +107,11 @@ static int CVDiagFree(CVodeMem cv_mem);
  * CVDiagMemRec and sets the cv_lmem field in (*cvode_mem) to the
  * address of this structure.  It sets setupNonNull in (*cvode_mem) to
  * SUNTRUE.  Finally, it allocates memory for M, bit, and bitcomp.
- * The CVDiag return value is SUCCESS = 0, LMEM_FAIL = -1, or 
+ * The CVDiag return value is SUCCESS = 0, LMEM_FAIL = -1, or
  * LIN_ILL_INPUT=-2.
  * -----------------------------------------------------------------
  */
-  
+
 int CVDiag(void *cvode_mem)
 {
   CVodeMem cv_mem;
@@ -107,7 +132,7 @@ int CVDiag(void *cvode_mem)
   }
 
   if (lfree != NULL) lfree(cv_mem);
-  
+
   /* Set four main function fields in cv_mem */
   linit  = CVDiagInit;
   lsetup = CVDiagSetup;
@@ -124,9 +149,9 @@ int CVDiag(void *cvode_mem)
 
   last_flag = CVDIAG_SUCCESS;
 
-  
+
   /* Allocate memory for M, bit, and bitcomp */
-   
+
   M = N_VClone(vec_tmpl);
   if (M == NULL) {
     cvProcessError(cv_mem, CVDIAG_MEM_FAIL, "CVDIAG", "CVDiag", MSGDG_MEM_FAIL);
@@ -253,7 +278,7 @@ char *CVDiagGetReturnFlagName(long int flag)
   switch(flag) {
   case CVDIAG_SUCCESS:
     sprintf(name,"CVDIAG_SUCCESS");
-    break;   
+    break;
   case CVDIAG_MEM_NULL:
     sprintf(name,"CVDIAG_MEM_NULL");
     break;
@@ -307,8 +332,8 @@ static int CVDiagInit(CVodeMem cv_mem)
  * -----------------------------------------------------------------
  * CVDiagSetup
  * -----------------------------------------------------------------
- * This routine does the setup operations for the diagonal linear 
- * solver.  It constructs a diagonal approximation to the Newton matrix 
+ * This routine does the setup operations for the diagonal linear
+ * solver.  It constructs a diagonal approximation to the Newton matrix
  * M = I - gamma*J, updates counters, and inverts M.
  * -----------------------------------------------------------------
  */
@@ -331,8 +356,17 @@ static int CVDiagSetup(CVodeMem cv_mem, int convfail, N_Vector ypred,
 
   /* Form y with perturbation = FRACT*(func. iter. correction) */
   r = FRACT * rl1;
-  N_VLinearSum(h, fpred, -ONE, zn[1], ftemp);
-  N_VLinearSum(r, ftemp, ONE, ypred, y);
+#ifdef SUNDIALS_BUILD_PACKAGE_FUSED_KERNELS
+  if (cv_mem->cv_usefused)
+  {
+    cvDiagSetup_formY(h, r, fpred, zn[1], ypred, ftemp, y);
+  }
+  else
+#endif
+  {
+    N_VLinearSum(h, fpred, -ONE, zn[1], ftemp);
+    N_VLinearSum(r, ftemp, ONE, ypred, y);
+  }
 
   /* Evaluate f at perturbed y */
   retval = f(tn, y, M, cv_mem->cv_user_data);
@@ -348,17 +382,26 @@ static int CVDiagSetup(CVodeMem cv_mem, int convfail, N_Vector ypred,
   }
 
   /* Construct M = I - gamma*J with J = diag(deltaf_i/deltay_i) */
-  N_VLinearSum(ONE, M, -ONE, fpred, M);
-  N_VLinearSum(FRACT, ftemp, -h, M, M);
-  N_VProd(ftemp, ewt, y);
-  /* Protect against deltay_i being at roundoff level */
-  N_VCompare(uround, y, bit);
-  N_VAddConst(bit, -ONE, bitcomp);
-  N_VProd(ftemp, bit, y);
-  N_VLinearSum(FRACT, y, -ONE, bitcomp, y);
-  N_VDiv(M, y, M);
-  N_VProd(M, bit, M);
-  N_VLinearSum(ONE, M, -ONE, bitcomp, M);
+#ifdef SUNDIALS_BUILD_PACKAGE_FUSED_KERNELS
+  if (cv_mem->cv_usefused)
+  {
+    cvDiagSetup_buildM(FRACT, uround, h, ftemp, fpred, ewt, bit, bitcomp, y, M);
+  }
+  else
+#endif
+  {
+    N_VLinearSum(ONE, M, -ONE, fpred, M);
+    N_VLinearSum(FRACT, ftemp, -h, M, M);
+    N_VProd(ftemp, ewt, y);
+    /* Protect against deltay_i being at roundoff level */
+    N_VCompare(uround, y, bit);
+    N_VAddConst(bit, -ONE, bitcomp);
+    N_VProd(ftemp, bit, y);
+    N_VLinearSum(FRACT, y, -ONE, bitcomp, y);
+    N_VDiv(M, y, M);
+    N_VProd(M, bit, M);
+    N_VLinearSum(ONE, M, -ONE, bitcomp, M);
+  }
 
   /* Invert M with test for zero components */
   invOK = N_VInvTest(M, M);
@@ -391,15 +434,24 @@ static int CVDiagSolve(CVodeMem cv_mem, N_Vector b, N_Vector weight,
   CVDiagMem cvdiag_mem;
 
   cvdiag_mem = (CVDiagMem) lmem;
-  
+
   /* If gamma has changed, update factor in M, and save gamma value */
 
   if (gammasv != gamma) {
     r = gamma / gammasv;
-    N_VInv(M, M);
-    N_VAddConst(M, -ONE, M);
-    N_VScale(r, M, M);
-    N_VAddConst(M, ONE, M);
+#ifdef SUNDIALS_BUILD_PACKAGE_FUSED_KERNELS
+    if (cv_mem->cv_usefused)
+    {
+      cvDiagSolve_updateM(r, M);
+    }
+    else
+#endif
+    {
+      N_VInv(M, M);
+      N_VAddConst(M, -ONE, M);
+      N_VScale(r, M, M);
+      N_VAddConst(M, ONE, M);
+    }
     invOK = N_VInvTest(M, M);
     if (!invOK) {
       last_flag = CVDIAG_INV_FAIL;
@@ -426,7 +478,7 @@ static int CVDiagSolve(CVodeMem cv_mem, N_Vector b, N_Vector weight,
 static int CVDiagFree(CVodeMem cv_mem)
 {
   CVDiagMem cvdiag_mem;
-  
+
   cvdiag_mem = (CVDiagMem) lmem;
 
   N_VDestroy(M);
diff --git a/src/cvode/cvode_fused_cuda.cu b/src/cvode/cvode_fused_cuda.cu
new file mode 100644
index 0000000000..3b7e191211
--- /dev/null
+++ b/src/cvode/cvode_fused_cuda.cu
@@ -0,0 +1,458 @@
+/*
+ * -----------------------------------------------------------------
+ * Programmer(s): Cody J. Balos @ LLNL
+ * -----------------------------------------------------------------
+ * SUNDIALS Copyright Start
+ * Copyright (c) 2002-2020, Lawrence Livermore National Security
+ * and Southern Methodist University.
+ * All rights reserved.
+ *
+ * See the top-level LICENSE and NOTICE files for details.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ * SUNDIALS Copyright End
+ * -----------------------------------------------------------------
+ * This file implements fused CUDA kernels for CVODE.
+ * -----------------------------------------------------------------
+ */
+
+ #include <cuda_runtime.h>
+
+ #include <nvector/nvector_cuda.h>
+ #include "sundials_cuda_kernels.cuh"
+
+
+/*
+ * -----------------------------------------------------------------
+ * Compute the ewt vector when the tol type is CV_SS.
+ * -----------------------------------------------------------------
+ */
+
+
+__global__  // kernel: fills tempv and weight from ycur, one element per loop iteration
+void cvEwtSetSS_cukernel(const sunindextype length,  // element count handed to the loop macro
+                         const realtype reltol,      // scalar multiplied with |ycur[i]|
+                         const realtype Sabstol,     // scalar added to every element
+                         const realtype* ycur,       // read only
+                         realtype* tempv,            // written: reltol*|ycur[i]| + Sabstol
+                         realtype* weight)           // written: 1/tempv[i]
+{
+  const realtype one = 1.0;
+  GRID_STRIDE_XLOOP(sunindextype, i, length)  // macro defined outside this file; presumably visits every i in [0, length)
+  {  // the commented-out calls below are the N_Vector sequence that this body fuses
+    // N_VAbs(ycur, cv_mem->cv_tempv);
+    // N_VScale(cv_mem->cv_reltol, cv_mem->cv_tempv, cv_mem->cv_tempv);
+    // N_VAddConst(cv_mem->cv_tempv, cv_mem->cv_Sabstol, cv_mem->cv_tempv);
+    // N_VInv(cv_mem->cv_tempv, weight);
+    realtype tmp = abs(ycur[i]);
+    tempv[i] = reltol*tmp + Sabstol;
+    weight[i] = one/tempv[i];  // no check for tempv[i] == 0 before dividing
+  }
+}
+
+extern "C"  // host launcher for cvEwtSetSS_cukernel
+int cvEwtSetSS_fused(const booleantype atolMin0,  // NOTE(review): never read in this body -- confirm that is intended
+                     const realtype reltol,
+                     const realtype Sabstol,
+                     const N_Vector ycur,
+                     N_Vector tempv,
+                     N_Vector weight)
+{
+  const SUNCudaExecPolicy* exec_policy = ((N_VectorContent_Cuda)weight->content)->stream_exec_policy;
+  const sunindextype N = N_VGetLength(weight);  // length and exec policy are both taken from weight
+  size_t block = exec_policy->blockSize(N);     // launch dimensions are chosen by that policy
+  size_t grid  = exec_policy->gridSize(N);
+
+  cvEwtSetSS_cukernel<<<grid, block, 0, exec_policy->stream()>>>
+  (
+    N,
+    reltol,
+    Sabstol,
+    N_VGetDeviceArrayPointer_Cuda(ycur),
+    N_VGetDeviceArrayPointer_Cuda(tempv),
+    N_VGetDeviceArrayPointer_Cuda(weight)
+  );
+
+#ifdef SUNDIALS_DEBUG_CUDA_LASTERROR
+  cudaDeviceSynchronize();  // debug builds only: wait for the kernel so its errors surface here
+  if (!SUNDIALS_CUDA_VERIFY(cudaGetLastError())) return -1;
+#endif
+
+  return 0;  // outside the debug block nothing is awaited or checked before returning
+}
+
+
+/*
+ * -----------------------------------------------------------------
+ * Compute the ewt vector when the tol type is CV_SV.
+ * -----------------------------------------------------------------
+ */
+
+
+__global__  // kernel: same as the scalar-tolerance variant but with a per-element absolute tolerance
+void cvEwtSetSV_cukernel(const sunindextype length,  // element count handed to the loop macro
+                         const realtype reltol,      // scalar multiplied with |ycur[i]|
+                         const realtype* Vabstol,    // read only, indexed per element
+                         const realtype* ycur,       // read only
+                         realtype* tempv,            // written: reltol*|ycur[i]| + Vabstol[i]
+                         realtype* weight)           // written: 1/tempv[i]
+{
+  const realtype one = 1.0;
+  GRID_STRIDE_XLOOP(sunindextype, i, length)  // macro defined outside this file; presumably visits every i in [0, length)
+  {  // the commented-out calls below are the N_Vector sequence that this body fuses
+    // N_VAbs(ycur, cv_mem->cv_tempv);
+    // N_VLinearSum(cv_mem->cv_reltol, cv_mem->cv_tempv, ONE,
+    //             cv_mem->cv_Vabstol, cv_mem->cv_tempv);
+    // N_VInv(cv_mem->cv_tempv, weight);
+    realtype tmp = abs(ycur[i]);
+    tempv[i] = reltol*tmp + Vabstol[i];
+    weight[i] = one/tempv[i];  // no check for tempv[i] == 0 before dividing
+  }
+}
+
+extern "C"  // host launcher for cvEwtSetSV_cukernel
+int cvEwtSetSV_fused(const booleantype atolMin0,  // NOTE(review): never read in this body -- confirm that is intended
+                     const realtype reltol,
+                     const N_Vector Vabstol,
+                     const N_Vector ycur,
+                     N_Vector tempv,
+                     N_Vector weight)
+{
+  const SUNCudaExecPolicy* exec_policy = ((N_VectorContent_Cuda)weight->content)->stream_exec_policy;
+  const sunindextype N = N_VGetLength(weight);  // length and exec policy are both taken from weight
+  size_t block = exec_policy->blockSize(N);     // launch dimensions are chosen by that policy
+  size_t grid  = exec_policy->gridSize(N);
+
+  cvEwtSetSV_cukernel<<<grid, block, 0, exec_policy->stream()>>>
+  (
+    N,
+    reltol,
+    N_VGetDeviceArrayPointer_Cuda(Vabstol),
+    N_VGetDeviceArrayPointer_Cuda(ycur),
+    N_VGetDeviceArrayPointer_Cuda(tempv),
+    N_VGetDeviceArrayPointer_Cuda(weight)
+  );
+
+#ifdef SUNDIALS_DEBUG_CUDA_LASTERROR
+  cudaDeviceSynchronize();  // debug builds only: wait for the kernel so its errors surface here
+  if (!SUNDIALS_CUDA_VERIFY(cudaGetLastError())) return -1;
+#endif
+
+  return 0;  // outside the debug block nothing is awaited or checked before returning
+}
+
+
+/*
+ * -----------------------------------------------------------------
+ * Determine if the constraints of the problem are satisfied by
+ * the proposed step.
+ * -----------------------------------------------------------------
+ */
+
+
+__global__  // kernel: writes tempv[i] = mm[i]*(y[i] - 0.1*a*c[i]/ewt[i]) with a = (|c[i]| >= 1.5)
+void cvCheckConstraints_cukernel(const sunindextype length,
+                                 const realtype* c,
+                                 const realtype* ewt,
+                                 const realtype* y,
+                                 const realtype* mm,
+                                 realtype* tempv)  // the only array written
+{
+  static const realtype zero = 0.0;
+  static const realtype pt1 = 0.1;
+  static const realtype one = 1.0;
+  static const realtype onept5 = 1.5;
+  GRID_STRIDE_XLOOP(sunindextype, i, length)  // macro defined outside this file; presumably visits every i in [0, length)
+  {  // the commented-out calls below are the N_Vector sequence that this body fuses
+    // N_VCompare(ONEPT5, cv_mem->cv_constraints, tmp); /* a[i]=1 when |c[i]|=2  */
+    // N_VProd(tmp, cv_mem->cv_constraints, tmp);       /* a * c                 */
+    // N_VDiv(tmp, cv_mem->cv_ewt, tmp);                /* a * c * wt            */
+    // N_VLinearSum(ONE, cv_mem->cv_y, -PT1, tmp, tmp); /* y - 0.1 * a * c * wt  */
+    // N_VProd(tmp, mm, tmp);                           /* v = mm*(y-0.1*a*c*wt) */
+    realtype tmp = (abs(c[i]) >= onept5) ? one : zero;  // a: 1 when |c[i]| >= 1.5, else 0
+    tmp = tmp*c[i];                                     // a*c
+    tmp = tmp/ewt[i];                                   // a*c/ewt (a division; no zero check on ewt[i])
+    tmp = y[i] - pt1*tmp;                               // y - 0.1*a*c/ewt
+    tempv[i] = tmp*mm[i];
+  }
+}
+
+extern "C"  // host launcher for cvCheckConstraints_cukernel
+int cvCheckConstraints_fused(const N_Vector c,
+                             const N_Vector ewt,
+                             const N_Vector y,
+                             const N_Vector mm,
+                             N_Vector tempv)  // receives the kernel's result
+{
+  const SUNCudaExecPolicy* exec_policy = ((N_VectorContent_Cuda)c->content)->stream_exec_policy;
+  const sunindextype N = N_VGetLength(c);    // length and exec policy are both taken from c
+  size_t block = exec_policy->blockSize(N);  // launch dimensions are chosen by that policy
+  size_t grid  = exec_policy->gridSize(N);
+
+  cvCheckConstraints_cukernel<<<grid, block, 0, exec_policy->stream()>>>
+  (
+    N,
+    N_VGetDeviceArrayPointer_Cuda(c),
+    N_VGetDeviceArrayPointer_Cuda(ewt),
+    N_VGetDeviceArrayPointer_Cuda(y),
+    N_VGetDeviceArrayPointer_Cuda(mm),
+    N_VGetDeviceArrayPointer_Cuda(tempv)
+  );
+
+#ifdef SUNDIALS_DEBUG_CUDA_LASTERROR
+  cudaDeviceSynchronize();  // debug builds only: wait for the kernel so its errors surface here
+  if (!SUNDIALS_CUDA_VERIFY(cudaGetLastError())) return -1;
+#endif
+
+  return 0;  // outside the debug block nothing is awaited or checked before returning
+}
+
+
+/*
+ * -----------------------------------------------------------------
+ * Compute the nonlinear residual.
+ * -----------------------------------------------------------------
+ */
+
+
+__global__  // kernel: res[i] = rl1*zn1[i] + ycor[i] + ngamma*ftemp[i]
+void cvNlsResid_cukernel(const sunindextype length,
+                         const realtype rl1,
+                         const realtype ngamma,  // added as-is; the quoted call passes -cv_gamma, so presumably already negated -- confirm
+                         const realtype* zn1,
+                         const realtype* ycor,
+                         const realtype* ftemp,
+                         realtype* res)  // the only array written
+{
+  GRID_STRIDE_XLOOP(sunindextype, i, length)  // macro defined outside this file; presumably visits every i in [0, length)
+  {  // the commented-out calls below are the N_Vector sequence that this body fuses
+    // N_VLinearSum(cv_mem->cv_rl1, cv_mem->cv_zn[1], ONE, ycor, res);
+    // N_VLinearSum(-cv_mem->cv_gamma, cv_mem->cv_ftemp, ONE, res, res);
+    realtype tmp = rl1*zn1[i] + ycor[i];
+    res[i] = ngamma*ftemp[i] + tmp;
+  }
+}
+
+extern "C"  // host launcher for cvNlsResid_cukernel
+int cvNlsResid_fused(const realtype rl1,
+                     const realtype ngamma,  // forwarded unchanged to the kernel
+                     const N_Vector zn1,
+                     const N_Vector ycor,
+                     const N_Vector ftemp,
+                     N_Vector res)  // receives the kernel's result
+{
+  const SUNCudaExecPolicy* exec_policy = ((N_VectorContent_Cuda)res->content)->stream_exec_policy;
+  const sunindextype N = N_VGetLength(res);  // length and exec policy are both taken from res
+  size_t block = exec_policy->blockSize(N);  // launch dimensions are chosen by that policy
+  size_t grid  = exec_policy->gridSize(N);
+
+  cvNlsResid_cukernel<<<grid, block, 0, exec_policy->stream()>>>
+  (
+    N,
+    rl1,
+    ngamma,
+    N_VGetDeviceArrayPointer_Cuda(zn1),
+    N_VGetDeviceArrayPointer_Cuda(ycor),
+    N_VGetDeviceArrayPointer_Cuda(ftemp),
+    N_VGetDeviceArrayPointer_Cuda(res)
+  );
+
+#ifdef SUNDIALS_DEBUG_CUDA_LASTERROR
+  cudaDeviceSynchronize();  // debug builds only: wait for the kernel so its errors surface here
+  if (!SUNDIALS_CUDA_VERIFY(cudaGetLastError())) return -1;
+#endif
+
+  return 0;  // outside the debug block nothing is awaited or checked before returning
+}
+
+/*
+ * -----------------------------------------------------------------
+ * Form y with perturbation = FRACT*(func. iter. correction)
+ * -----------------------------------------------------------------
+ */
+
+__global__  // kernel: writes ftemp and then y, element by element
+void cvDiagSetup_formY_kernel(const sunindextype length,
+                              const realtype h,
+                              const realtype r,
+                              const realtype* fpred,
+                              const realtype* zn1,
+                              const realtype* ypred,
+                              realtype* ftemp,  // written: h*fpred[i] - zn1[i]
+                              realtype* y)      // written: r*ftemp[i] + ypred[i]
+{
+  // N_VLinearSum(h, fpred, -ONE, zn[1], ftemp);
+  // N_VLinearSum(r, ftemp, ONE, ypred, y);
+  GRID_STRIDE_XLOOP(sunindextype, i, length)  // macro defined outside this file; presumably visits every i in [0, length)
+  {  // fused form of the two commented-out calls above
+    ftemp[i] = h*fpred[i] - zn1[i];
+    y[i] = r*ftemp[i] + ypred[i];  // reads the ftemp[i] value stored on the previous line
+  }
+}
+
+extern "C"  // host launcher for cvDiagSetup_formY_kernel
+int cvDiagSetup_formY(const realtype h,
+                      const realtype r,
+                      const N_Vector fpred,
+                      const N_Vector zn1,
+                      const N_Vector ypred,
+                      N_Vector ftemp,  // overwritten by the kernel
+                      N_Vector y)      // overwritten by the kernel
+{
+  const SUNCudaExecPolicy* exec_policy = ((N_VectorContent_Cuda)y->content)->stream_exec_policy;
+  const sunindextype N = N_VGetLength(y);    // length and exec policy are both taken from y
+  size_t block = exec_policy->blockSize(N);  // launch dimensions are chosen by that policy
+  size_t grid  = exec_policy->gridSize(N);
+
+  cvDiagSetup_formY_kernel<<<grid, block, 0, exec_policy->stream()>>>
+  (
+    N,
+    h,
+    r,
+    N_VGetDeviceArrayPointer_Cuda(fpred),
+    N_VGetDeviceArrayPointer_Cuda(zn1),
+    N_VGetDeviceArrayPointer_Cuda(ypred),
+    N_VGetDeviceArrayPointer_Cuda(ftemp),
+    N_VGetDeviceArrayPointer_Cuda(y)
+  );
+
+#ifdef SUNDIALS_DEBUG_CUDA_LASTERROR
+  cudaDeviceSynchronize();  // debug builds only: wait for the kernel so its errors surface here
+  if (!SUNDIALS_CUDA_VERIFY(cudaGetLastError())) return -1;
+#endif
+
+  return 0;  // outside the debug block nothing is awaited or checked before returning
+}
+
+/*
+ * -----------------------------------------------------------------
+ * Construct M = I - gamma*J with J = diag(deltaf_i/deltay_i)
+ * protecting against deltay_i being at roundoff level.
+ * -----------------------------------------------------------------
+ */
+
+__global__  // kernel: rebuilds M in place; bit, bitcomp and y are overwritten as scratch
+void cvDiagSetup_buildM_kernel(const sunindextype length,
+                               const realtype fract,
+                               const realtype uround,  // threshold for the roundoff test on |ftemp*ewt|
+                               const realtype h,
+                               const realtype* ftemp,
+                               const realtype* fpred,
+                               const realtype* ewt,
+                               realtype* bit,      // written: 1 where the test passes, else 0
+                               realtype* bitcomp,  // written: bit - 1, i.e. 0 or -1
+                               realtype* y,
+                               realtype* M)
+{
+  static const realtype zero = 0.0;
+  static const realtype one = 1.0;
+  GRID_STRIDE_XLOOP(sunindextype, i, length)  // macro defined outside this file; presumably visits every i in [0, length)
+  {  // each group of commented-out calls is the N_Vector sequence fused by the code after it
+    // N_VLinearSum(ONE, M, -ONE, fpred, M);
+    // N_VLinearSum(FRACT, ftemp, -h, M, M);
+    // N_VProd(ftemp, ewt, y);
+    M[i] = fract*ftemp[i] - h*(M[i] - fpred[i]);
+    y[i] = ftemp[i] * ewt[i];
+
+    // N_VCompare(uround, y, bit);
+    // N_VAddConst(bit, -ONE, bitcomp);
+    bool test = (abs(y[i]) >= uround);  // inclusive, as N_VCompare sets 1 when |y_i| >= uround
+    bit[i] = test ? one : zero;
+    bitcomp[i] = test ? zero : -one;
+
+    // N_VProd(ftemp, bit, y);
+    // N_VLinearSum(FRACT, y, -ONE, bitcomp, y);
+    // N_VDiv(M, y, M);
+    // N_VProd(M, bit, M);
+    // N_VLinearSum(ONE, M, -ONE, bitcomp, M);
+    y[i] = fract*ftemp[i]*bit[i] - bitcomp[i];  // equals 1 where the test failed, so the division below is by 1 there
+    M[i] = M[i]/y[i] * bit[i] - bitcomp[i];     // equals 1 where the test failed
+  }
+}
+
+extern "C"  // host launcher for cvDiagSetup_buildM_kernel
+int cvDiagSetup_buildM(const realtype fract,
+                       const realtype uround,
+                       const realtype h,
+                       const N_Vector ftemp,
+                       const N_Vector fpred,
+                       const N_Vector ewt,
+                       N_Vector bit,      // overwritten by the kernel
+                       N_Vector bitcomp,  // overwritten by the kernel
+                       N_Vector y,        // overwritten by the kernel
+                       N_Vector M)        // read and overwritten by the kernel
+{
+  const SUNCudaExecPolicy* exec_policy = ((N_VectorContent_Cuda)M->content)->stream_exec_policy;
+  const sunindextype N = N_VGetLength(M);    // length and exec policy are both taken from M
+  size_t block = exec_policy->blockSize(N);  // launch dimensions are chosen by that policy
+  size_t grid  = exec_policy->gridSize(N);
+
+  cvDiagSetup_buildM_kernel<<<grid, block, 0, exec_policy->stream()>>>
+  (
+    N,
+    fract,
+    uround,
+    h,
+    N_VGetDeviceArrayPointer_Cuda(ftemp),
+    N_VGetDeviceArrayPointer_Cuda(fpred),
+    N_VGetDeviceArrayPointer_Cuda(ewt),
+    N_VGetDeviceArrayPointer_Cuda(bit),
+    N_VGetDeviceArrayPointer_Cuda(bitcomp),
+    N_VGetDeviceArrayPointer_Cuda(y),
+    N_VGetDeviceArrayPointer_Cuda(M)
+  );
+
+#ifdef SUNDIALS_DEBUG_CUDA_LASTERROR
+  cudaDeviceSynchronize();  // debug builds only: wait for the kernel so its errors surface here
+  if (!SUNDIALS_CUDA_VERIFY(cudaGetLastError())) return -1;
+#endif
+
+  return 0;  // outside the debug block nothing is awaited or checked before returning
+}
+
+
+/*
+ * -----------------------------------------------------------------
+ *  Update M with changed gamma so that M = I - gamma*J.
+ * -----------------------------------------------------------------
+ */
+
+
+ __global__  // kernel: in-place update M[i] = r*(1/M[i] - 1) + 1
+void cvDiagSolve_updateM_kernel(const sunindextype length, const realtype r, realtype* M)
+{
+  static const realtype one = 1.0;
+  GRID_STRIDE_XLOOP(sunindextype, i, length)  // macro defined outside this file; presumably visits every i in [0, length)
+  {  // the commented-out calls below are the N_Vector sequence that this body fuses
+    // N_VInv(M, M);
+    // N_VAddConst(M, -ONE, M);
+    // N_VScale(r, M, M);
+    // N_VAddConst(M, ONE, M);
+    realtype a = one/M[i] - one;  // no check for M[i] == 0 before dividing
+    M[i] = r*a + one;
+  }
+}
+
+
+extern "C"  // host launcher for cvDiagSolve_updateM_kernel
+int cvDiagSolve_updateM(const realtype r, N_Vector M)  // M is read and overwritten by the kernel
+{
+  const SUNCudaExecPolicy* exec_policy = ((N_VectorContent_Cuda)M->content)->stream_exec_policy;
+  const sunindextype N = N_VGetLength(M);    // length and exec policy are both taken from M
+  size_t block = exec_policy->blockSize(N);  // launch dimensions are chosen by that policy
+  size_t grid  = exec_policy->gridSize(N);
+
+  cvDiagSolve_updateM_kernel<<<grid, block, 0, exec_policy->stream()>>>
+  (
+    N,
+    r,
+    N_VGetDeviceArrayPointer_Cuda(M)
+  );
+
+#ifdef SUNDIALS_DEBUG_CUDA_LASTERROR
+  cudaDeviceSynchronize();  // debug builds only: wait for the kernel so its errors surface here
+  if (!SUNDIALS_CUDA_VERIFY(cudaGetLastError())) return -1;
+#endif
+
+  return 0;  // outside the debug block nothing is awaited or checked before returning
+}
\ No newline at end of file
diff --git a/src/cvode/cvode_fused_stubs.c b/src/cvode/cvode_fused_stubs.c
new file mode 100644
index 0000000000..bc2caa2f98
--- /dev/null
+++ b/src/cvode/cvode_fused_stubs.c
@@ -0,0 +1,184 @@
+/*
+ * -----------------------------------------------------------------
+ * Programmer(s): Cody J. Balos @ LLNL
+ * -----------------------------------------------------------------
+ * SUNDIALS Copyright Start
+ * Copyright (c) 2002-2020, Lawrence Livermore National Security
+ * and Southern Methodist University.
+ * All rights reserved.
+ *
+ * See the top-level LICENSE and NOTICE files for details.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ * SUNDIALS Copyright End
+ * -----------------------------------------------------------------
+ * This file implements fused stub kernels for CVODE.
+ * -----------------------------------------------------------------
+ */
+
+
+#include "cvode_diag_impl.h"
+#include "cvode_impl.h"
+
+#define ZERO   RCONST(0.0)
+#define PT1    RCONST(0.1)
+#define FRACT  RCONST(0.1)
+#define ONEPT5 RCONST(1.50)
+#define ONE    RCONST(1.0)
+
+/*
+ * -----------------------------------------------------------------
+ * Compute the ewt vector when the tol type is CV_SS.
+ * -----------------------------------------------------------------
+ */
+
+int cvEwtSetSS_fused(const booleantype atolmin0,  /* when true, reject min(tempv) <= 0 */
+                     const realtype reltol,
+                     const realtype Sabstol,
+                     const N_Vector ycur,
+                     N_Vector tempv,              /* scratch: reltol*|ycur| + Sabstol */
+                     N_Vector weight)             /* out: 1/tempv (only on success)    */
+{
+  N_VAbs(ycur, tempv);                            /* tempv = |ycur|                    */
+  N_VScale(reltol, tempv, tempv);                 /* tempv = reltol*|ycur|             */
+  N_VAddConst(tempv, Sabstol, tempv);             /* tempv = reltol*|ycur| + Sabstol   */
+  if (atolmin0) {
+    if (N_VMin(tempv) <= ZERO) return(-1);        /* weight is left unwritten here     */
+  }
+  N_VInv(tempv, weight);
+  return 0;
+}
+
+/*
+ * -----------------------------------------------------------------
+ * Compute the ewt vector when the tol type is CV_SV.
+ * -----------------------------------------------------------------
+ */
+
+
+int cvEwtSetSV_fused(const booleantype atolmin0,  /* when true, reject min(tempv) <= 0 */
+                     const realtype reltol,
+                     const N_Vector Vabstol,
+                     const N_Vector ycur,
+                     N_Vector tempv,              /* scratch: reltol*|ycur| + Vabstol */
+                     N_Vector weight)             /* out: 1/tempv (only on success)    */
+{
+  N_VAbs(ycur, tempv);                            /* tempv = |ycur|                    */
+  N_VLinearSum(reltol, tempv, ONE,
+               Vabstol, tempv);                   /* tempv = reltol*|ycur| + Vabstol   */
+  if (atolmin0) {
+    if (N_VMin(tempv) <= ZERO) return(-1);        /* weight is left unwritten here     */
+  }
+  N_VInv(tempv, weight);
+  return 0;
+}
+
+
+/*
+ * -----------------------------------------------------------------
+ * Determine if the constraints of the problem are satisfied by
+ * the proposed step.
+ * -----------------------------------------------------------------
+ */
+
+
+int cvCheckConstraints_fused(const N_Vector c,
+                             const N_Vector ewt,
+                             const N_Vector y,
+                             const N_Vector mm,
+                             N_Vector tmp)  /* every step below writes into tmp */
+{
+  N_VCompare(ONEPT5, c, tmp);           /* a[i]=1 when |c[i]|>=1.5 */
+  N_VProd(tmp, c, tmp);                 /* a * c                   */
+  N_VDiv(tmp, ewt, tmp);                /* a * c / ewt             */
+  N_VLinearSum(ONE, y, -PT1, tmp, tmp); /* y - 0.1 * a * c / ewt   */
+  N_VProd(tmp, mm, tmp);                /* mm*(y - 0.1*a*c/ewt)    */
+  return 0;                             /* always reports success  */
+}
+
+
+/*
+ * -----------------------------------------------------------------
+ * Compute the nonlinear residual.
+ * -----------------------------------------------------------------
+ */
+
+
+int cvNlsResid_fused(const realtype rl1,
+                     const realtype ngamma,  /* added as-is: callers supply the sign */
+                     const N_Vector zn1,
+                     const N_Vector ycor,
+                     const N_Vector ftemp,
+                     N_Vector res)           /* out: rl1*zn1 + ycor + ngamma*ftemp   */
+{
+  N_VLinearSum(rl1, zn1, ONE, ycor, res);      /* res = rl1*zn1 + ycor       */
+  N_VLinearSum(ngamma, ftemp, ONE, res, res);  /* res = ngamma*ftemp + res   */
+  return 0;                                    /* always reports success     */
+}
+
+/*
+ * -----------------------------------------------------------------
+ * Form y with perturbation = FRACT*(func. iter. correction)
+ * -----------------------------------------------------------------
+ */
+
+int cvDiagSetup_formY(const realtype h,
+                      const realtype r,
+                      const N_Vector fpred,
+                      const N_Vector zn1,
+                      const N_Vector ypred,
+                      N_Vector ftemp,  /* out: h*fpred - zn1   */
+                      N_Vector y)      /* out: r*ftemp + ypred */
+{
+  N_VLinearSum(h, fpred, -ONE, zn1, ftemp);  /* ftemp = h*fpred - zn1              */
+  N_VLinearSum(r, ftemp, ONE, ypred, y);     /* y = r*ftemp + ypred (new ftemp)    */
+  return 0;                                  /* always reports success             */
+}
+
+/*
+ * -----------------------------------------------------------------
+ * Construct M = I - gamma*J with J = diag(deltaf_i/deltay_i)
+ * protecting against deltay_i being at roundoff level.
+ * -----------------------------------------------------------------
+ */
+
+int cvDiagSetup_buildM(const realtype fract,   /* perturbation fraction; used instead of a file-local constant */
+                       const realtype uround,  /* threshold for the roundoff test      */
+                       const realtype h,
+                       const N_Vector ftemp,
+                       const N_Vector fpred,
+                       const N_Vector ewt,
+                       N_Vector bit,           /* scratch: 1 where |ftemp*ewt| >= uround, else 0 */
+                       N_Vector bitcomp,       /* scratch: bit - 1, i.e. 0 or -1        */
+                       N_Vector y,             /* scratch: overwritten                  */
+                       N_Vector M)             /* in/out: rebuilt in place              */
+{
+  N_VLinearSum(ONE, M, -ONE, fpred, M);        /* M = M - fpred                         */
+  N_VLinearSum(fract, ftemp, -h, M, M);        /* M = fract*ftemp - h*(M - fpred)       */
+  N_VProd(ftemp, ewt, y);
+  /* Protect against deltay_i being at roundoff level */
+  N_VCompare(uround, y, bit);
+  N_VAddConst(bit, -ONE, bitcomp);
+  N_VProd(ftemp, bit, y);
+  N_VLinearSum(fract, y, -ONE, bitcomp, y);    /* y = fract*ftemp*bit - bitcomp         */
+  N_VDiv(M, y, M);
+  N_VProd(M, bit, M);
+  N_VLinearSum(ONE, M, -ONE, bitcomp, M);      /* M = (M/y)*bit - bitcomp               */
+  return 0;
+}
+
+
+/*
+ * -----------------------------------------------------------------
+ *  Update M with changed gamma so that M = I - gamma*J.
+ * -----------------------------------------------------------------
+ */
+
+int cvDiagSolve_updateM(const realtype r, N_Vector M)  /* M is updated in place */
+{
+  N_VInv(M, M);             /* M = 1/M             */
+  N_VAddConst(M, -ONE, M);  /* M = 1/M - 1         */
+  N_VScale(r, M, M);        /* M = r*(1/M - 1)     */
+  N_VAddConst(M, ONE, M);   /* M = r*(1/M - 1) + 1 */
+  return 0;                 /* always reports success */
+}
\ No newline at end of file
diff --git a/src/cvode/cvode_impl.h b/src/cvode/cvode_impl.h
index 5aed37e648..b76a75fdfa 100644
--- a/src/cvode/cvode_impl.h
+++ b/src/cvode/cvode_impl.h
@@ -24,6 +24,8 @@
 #include <stdarg.h>
 #include <cvode/cvode.h>
 
+#include "cvode_proj_impl.h"
+
 #ifdef __cplusplus  /* wrapper to enable C++ usage */
 extern "C" {
 #endif
@@ -47,10 +49,57 @@ extern "C" {
 #define MXHNIL_DEFAULT   10             /* mxhnil default value   */
 #define MXSTEP_DEFAULT   500            /* mxstep default value   */
 
-/* Return values for lower level routines used by CVode and functions
-   provided to the nonlinear solver */
+/* Control constants for lower-level functions used by cvStep
+ * ----------------------------------------------------------
+ *
+ * cvHin return values:
+ *    CV_SUCCESS
+ *    CV_RHSFUNC_FAIL
+ *    CV_TOO_CLOSE
+ *
+ * cvStep control constants:
+ *    DO_ERROR_TEST
+ *    PREDICT_AGAIN
+ *
+ * cvStep return values:
+ *    CV_SUCCESS,
+ *    CV_LSETUP_FAIL,  CV_LSOLVE_FAIL,
+ *    CV_RHSFUNC_FAIL, CV_RTFUNC_FAIL
+ *    CV_CONV_FAILURE, CV_ERR_FAILURE,
+ *    CV_FIRST_RHSFUNC_ERR
+ *
+ * cvNls input nflag values:
+ *    FIRST_CALL
+ *    PREV_CONV_FAIL
+ *    PREV_PROJ_FAIL
+ *    PREV_ERR_FAIL
+ *
+ * cvNls return values:
+ *    CV_SUCCESS,
+ *    CV_LSETUP_FAIL, CV_LSOLVE_FAIL, CV_RHSFUNC_FAIL,
+ *    CONV_FAIL, RHSFUNC_RECVR
+ *
+ * cvNewtonIteration return values:
+ *    CV_SUCCESS,
+ *    CV_LSOLVE_FAIL, CV_RHSFUNC_FAIL
+ *    CONV_FAIL, RHSFUNC_RECVR,
+ *    TRY_AGAIN
+ *
+ */
+
+#define DO_ERROR_TEST    +2
+#define PREDICT_AGAIN    +3
+
+#define TRY_AGAIN        +5
+#define FIRST_CALL       +6
+#define PREV_CONV_FAIL   +7
+#define PREV_PROJ_FAIL   +8
+#define PREV_ERR_FAIL    +9
 
-#define RHSFUNC_RECVR +9
+#define RHSFUNC_RECVR    +10
+#define CONSTR_RECVR     +11
+#define CONSTRFUNC_RECVR +12
+#define PROJFUNC_RECVR   +13
 
 /*
  * -----------------------------------------------------------------
@@ -258,6 +307,12 @@ typedef struct CVodeMemRec {
   void *cv_eh_data;           /* data pointer passed to ehfun                 */
   FILE *cv_errfp;             /* CVODE error messages are sent to errfp       */
 
+  /*-------------------------------------------
+    User access function
+  -------------------------------------------*/
+  CVMonitorFn cv_monitorfun;     /* func called with CVODE mem and user data  */
+  long int cv_monitor_interval;  /* step interval to call cv_monitorfun       */
+
   /*-------------------------
     Stability Limit Detection
     -------------------------*/
@@ -289,6 +344,15 @@ typedef struct CVodeMemRec {
   booleantype *cv_gactive; /* array with active/inactive event functions      */
   int cv_mxgnull;          /* number of warning messages about possible g==0  */
 
+  /*----------------
+    Projection Data
+    ----------------*/
+
+  CVodeProjMem proj_mem;      /* projection memory structure               */
+  booleantype  proj_enabled;  /* flag indicating if projection is enabled  */
+  booleantype  proj_applied;  /* flag indicating if projection was applied */
+  realtype     cv_p[L_MAX];   /* coefficients of p(x) (degree q poly)      */
+
   /*-----------------------
     Fused Vector Operations
     -----------------------*/
@@ -296,6 +360,8 @@ typedef struct CVodeMemRec {
   realtype cv_cvals[L_MAX]; /* array of scalars */
   N_Vector cv_Xvecs[L_MAX]; /* array of vectors */
 
+  booleantype cv_usefused;  /* flag indicating if CVODE specific fused kernels should be used */
+
 } *CVodeMem;
 
 /*
@@ -455,6 +521,21 @@ void cvErrHandler(int error_code, const char *module, const char *function,
 
 int cvNlsInit(CVodeMem cv_mem);
 
+/* Projection functions */
+
+int cvDoProjection(CVodeMem cv_mem, int *nflagPtr, realtype saved_t,
+                   int *npfPtr);
+int cvProjInit(CVodeProjMem proj_mem);
+int cvProjFree(CVodeProjMem *proj_mem);
+
+/* Restore tn and undo prediction to reattempt a step */
+
+void cvRestore(CVodeMem cv_mem, realtype saved_t);
+
+/* Reset h and rescale history array to prepare for a step */
+
+void cvRescale(CVodeMem cv_mem);
+
 /*
  * =================================================================
  *   C V O D E    E R R O R    M E S S A G E S
@@ -551,6 +632,15 @@ int cvNlsInit(CVodeMem cv_mem);
 #define MSGCV_NLS_INPUT_NULL "At " MSG_TIME ", the nonlinear solver was passed a NULL input."
 #define MSGCV_NLS_FAIL "At " MSG_TIME ", the nonlinear solver failed in an unrecoverable manner."
 
+/* CVode Projection Error Messages */
+
+#define MSG_CV_MEM_NULL  "cvode_mem = NULL illegal."
+#define MSG_CV_MEM_FAIL  "A memory request failed."
+
+#define MSG_CV_PROJ_MEM_NULL       "proj_mem = NULL illegal."
+#define MSG_CV_PROJFUNC_FAIL       "At " MSG_TIME " the projection function failed with an unrecoverable error."
+#define MSG_CV_REPTD_PROJFUNC_ERR  "At " MSG_TIME " the projection function had repeated recoverable errors."
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/cvode/cvode_io.c b/src/cvode/cvode_io.c
index 0ce639398d..d691bc35b1 100644
--- a/src/cvode/cvode_io.c
+++ b/src/cvode/cvode_io.c
@@ -104,6 +104,64 @@ int CVodeSetUserData(void *cvode_mem, void *user_data)
   return(CV_SUCCESS);
 }
 
+/*
+ * CVodeSetMonitorFn
+ *
+ * Specifies the user function to call for monitoring
+ * the solution and/or integrator statistics.
+ */
+
+int CVodeSetMonitorFn(void *cvode_mem, CVMonitorFn fn)
+{
+  CVodeMem cv_mem;
+
+  if (cvode_mem==NULL) {  /* no integrator memory: report and return CV_MEM_NULL */
+    cvProcessError(NULL, CV_MEM_NULL, "CVODE", "CVodeSetMonitorFn", MSGCV_NO_MEM);
+    return(CV_MEM_NULL);
+  }
+
+  cv_mem = (CVodeMem) cvode_mem;
+
+#ifdef SUNDIALS_BUILD_WITH_MONITORING
+  cv_mem->cv_monitorfun = fn;  /* stored as given; fn is not checked for NULL here */
+  return(CV_SUCCESS);
+#else  /* monitoring compiled out: fn is not stored and the call fails */
+  cvProcessError(cv_mem, CV_ILL_INPUT, "CVODE", "CVodeSetMonitorFn", "SUNDIALS was not built with monitoring enabled.");
+  return(CV_ILL_INPUT);
+#endif
+}
+
+/*
+ * CVodeSetMonitorFrequency
+ *
+ * Specifies the frequency with which to call the user function.
+ */
+
+int CVodeSetMonitorFrequency(void *cvode_mem, long int nst)
+{
+  CVodeMem cv_mem;
+
+  if (cvode_mem==NULL) {  /* no integrator memory: report and return CV_MEM_NULL */
+    cvProcessError(NULL, CV_MEM_NULL, "CVODE", "CVodeSetMonitorFrequency", MSGCV_NO_MEM);
+    return(CV_MEM_NULL);
+  }
+
+  cv_mem = (CVodeMem) cvode_mem;  /* set before validating nst so errors are reported against this integrator */
+
+  if (nst < 0) {  /* negative intervals are rejected; zero is accepted */
+    cvProcessError(cv_mem, CV_ILL_INPUT, "CVODE", "CVodeSetMonitorFrequency", "step interval must be >= 0\n");
+    return(CV_ILL_INPUT);
+  }
+
+#ifdef SUNDIALS_BUILD_WITH_MONITORING
+  cv_mem->cv_monitor_interval = nst;
+  return(CV_SUCCESS);
+#else  /* monitoring compiled out: nst is not stored and the call fails */
+  cvProcessError(cv_mem, CV_ILL_INPUT, "CVODE", "CVodeSetMonitorFrequency", "SUNDIALS was not built with monitoring enabled.");
+  return(CV_ILL_INPUT);
+#endif
+}
+
 /*
  * CVodeSetMaxOrd
  *
@@ -563,6 +621,31 @@ int CVodeSetConstraints(void *cvode_mem, N_Vector constraints)
   return(CV_SUCCESS);
 }
 
+int CVodeSetUseIntegratorFusedKernels(void *cvode_mem, booleantype onoff)  /* toggles cv_usefused */
+{
+  CVodeMem cv_mem;
+
+  if (cvode_mem==NULL) {  /* no integrator memory: report and return CV_MEM_NULL */
+    cvProcessError(NULL, CV_MEM_NULL, "CVODE", "CVodeSetUseIntegratorFusedKernels", MSGCV_NO_MEM);
+    return(CV_MEM_NULL);
+  }
+
+  cv_mem = (CVodeMem) cvode_mem;
+
+#ifdef SUNDIALS_BUILD_PACKAGE_FUSED_KERNELS
+  if (!cv_mem->cv_MallocDone ||  /* cv_ewt is only inspected once cv_MallocDone is set */
+      N_VGetVectorID(cv_mem->cv_ewt) != SUNDIALS_NVEC_CUDA) {
+    cvProcessError(cv_mem, CV_ILL_INPUT, "CVODE", "CVodeSetUseIntegratorFusedKernels", MSGCV_BAD_NVECTOR);
+    return(CV_ILL_INPUT);  /* same code as reported above; cvode_mem is known non-NULL here */
+  }
+  cv_mem->cv_usefused = onoff;
+  return(CV_SUCCESS);
+#else  /* fused kernels compiled out: the flag is not changed and the call fails */
+  cvProcessError(cv_mem, CV_ILL_INPUT, "CVODE", "CVodeSetUseIntegratorFusedKernels", "CVODE was not built with fused integrator kernels enabled");
+  return(CV_ILL_INPUT);
+#endif
+}
+
 /*
  * =================================================================
  * CVODE optional output functions
@@ -1195,6 +1278,15 @@ char *CVodeGetReturnFlagName(long int flag)
   case CV_NLS_FAIL:
     sprintf(name,"CV_NLS_FAIL");
     break;
+  case CV_PROJ_MEM_NULL:
+    sprintf(name,"CV_PROJ_MEM_NULL");
+    break;
+  case CV_PROJFUNC_FAIL:
+    sprintf(name,"CV_PROJFUNC_FAIL");
+    break;
+  case CV_REPTD_PROJFUNC_ERR:
+    sprintf(name,"CV_REPTD_PROJFUNC_ERR");
+    break;
   default:
     sprintf(name,"NONE");
   }
diff --git a/src/cvode/cvode_ls.c b/src/cvode/cvode_ls.c
index ebf0efeecd..aacd66aa41 100644
--- a/src/cvode/cvode_ls.c
+++ b/src/cvode/cvode_ls.c
@@ -165,6 +165,7 @@ int CVodeSetLinearSolver(void *cvode_mem, SUNLinearSolver LS,
   cvls_mem->jtimesDQ = SUNTRUE;
   cvls_mem->jtsetup  = NULL;
   cvls_mem->jtimes   = cvLsDQJtimes;
+  cvls_mem->jt_f     = cv_mem->cv_f;
   cvls_mem->jt_data  = cv_mem;
 
   cvls_mem->user_linsys = SUNFALSE;
@@ -440,6 +441,7 @@ int CVodeSetJacTimes(void *cvode_mem, CVLsJacTimesSetupFn jtsetup,
     cvls_mem->jtimesDQ = SUNTRUE;
     cvls_mem->jtsetup  = NULL;
     cvls_mem->jtimes   = cvLsDQJtimes;
+    cvls_mem->jt_f     = cv_mem->cv_f;
     cvls_mem->jt_data  = cv_mem;
   }
 
@@ -447,6 +449,37 @@ int CVodeSetJacTimes(void *cvode_mem, CVLsJacTimesSetupFn jtsetup,
 }
 
 
+/* CVodeSetJacTimesRhsFn specifies an alternative user-supplied ODE right-hand
+   side function to use in the internal finite difference Jacobian-vector
+   product */
+int CVodeSetJacTimesRhsFn(void *cvode_mem, CVRhsFn jtimesRhsFn)
+{
+  CVodeMem cv_mem;
+  CVLsMem  cvls_mem;
+  int      retval;
+
+  /* access CVLsMem structure */
+  retval = cvLs_AccessLMem(cvode_mem, "CVodeSetJacTimesRhsFn",
+                           &cv_mem, &cvls_mem);
+  if (retval != CVLS_SUCCESS) return(retval);  /* pass the access failure code straight back */
+
+  /* check if using internal finite difference approximation */
+  if (!(cvls_mem->jtimesDQ)) {  /* rejected when jtimesDQ is false */
+    cvProcessError(cv_mem, CVLS_ILL_INPUT, "CVLS", "CVodeSetJacTimesRhsFn",
+                   "Internal finite-difference Jacobian-vector product is disabled.");
+    return(CVLS_ILL_INPUT);
+  }
+
+  /* store function pointers for RHS function (NULL implies use ODE RHS) */
+  if (jtimesRhsFn != NULL)
+    cvls_mem->jt_f = jtimesRhsFn;
+  else
+    cvls_mem->jt_f = cv_mem->cv_f;  /* NULL restores the integrator's own RHS function */
+
+  return(CVLS_SUCCESS);
+}
+
+
 /* CVodeSetLinSysFn specifies the linear system setup function. */
 int CVodeSetLinSysFn(void *cvode_mem, CVLsLinSysFn linsys)
 {
@@ -667,6 +700,33 @@ int CVodeGetNumJtimesEvals(void *cvode_mem, long int *njvevals)
 }
 
 
+/* CVodeGetLinSolveStats returns statistics related to the linear solve. */
+int CVodeGetLinSolveStats(void* cvode_mem, long int* njevals, long int* nfevalsLS,
+                          long int* nliters, long int* nlcfails, long int* npevals,
+                          long int* npsolves, long int* njtsetups, long int* njtimes)
+{
+  CVodeMem cv_mem;
+  CVLsMem  cvls_mem;
+  int      retval;
+
+  /* access CVLsMem structure; set output value and return */
+  retval = cvLs_AccessLMem(cvode_mem, "CVodeGetLinSolveStats",
+                           &cv_mem, &cvls_mem);
+  if (retval != CVLS_SUCCESS)  return(retval);  /* no output is written on this path */
+
+  *njevals   = cvls_mem->nje;  /* all eight outputs are written unconditionally; none is checked for NULL */
+  *nfevalsLS = cvls_mem->nfeDQ;
+  *nliters   = cvls_mem->nli;
+  *nlcfails  = cvls_mem->ncfl;
+  *npevals   = cvls_mem->npe;
+  *npsolves  = cvls_mem->nps;
+  *njtsetups = cvls_mem->njtsetup;
+  *njtimes   = cvls_mem->njtimes;
+
+  return(CVLS_SUCCESS);
+}
+
+
 /* CVodeGetLastLinFlag returns the last flag set in a CVLS function */
 int CVodeGetLastLinFlag(void *cvode_mem, long int *flag)
 {
@@ -1113,7 +1173,7 @@ int cvLsDQJtimes(N_Vector v, N_Vector Jv, realtype t,
     N_VLinearSum(sig, v, ONE, y, work);
 
     /* Set Jv = f(tn, y+sig*v) */
-    retval = cv_mem->cv_f(t, work, Jv, cv_mem->cv_user_data);
+    retval = cvls_mem->jt_f(t, work, Jv, cv_mem->cv_user_data);
     cvls_mem->nfeDQ++;
     if (retval == 0) break;
     if (retval < 0)  return(-1);
diff --git a/src/cvode/cvode_ls_impl.h b/src/cvode/cvode_ls_impl.h
index 808d110fd9..d407fe8e78 100644
--- a/src/cvode/cvode_ls_impl.h
+++ b/src/cvode/cvode_ls_impl.h
@@ -110,6 +110,7 @@ typedef struct CVLsMemRec {
   booleantype jtimesDQ;
   CVLsJacTimesSetupFn jtsetup;
   CVLsJacTimesVecFn jtimes;
+  CVRhsFn jt_f;
   void *jt_data;
 
   /* Linear system setup function
diff --git a/src/cvode/cvode_nls.c b/src/cvode/cvode_nls.c
index 117258429a..39e967c7c5 100644
--- a/src/cvode/cvode_nls.c
+++ b/src/cvode/cvode_nls.c
@@ -39,6 +39,15 @@ static int cvNlsLSolve(N_Vector delta, void* cvode_mem);
 static int cvNlsConvTest(SUNNonlinearSolver NLS, N_Vector ycor, N_Vector del,
                          realtype tol, N_Vector ewt, void* cvode_mem);
 
+#ifdef SUNDIALS_BUILD_PACKAGE_FUSED_KERNELS
+int cvNlsResid_fused(const realtype rl1,
+                     const realtype ngamma,
+                     const N_Vector zn1,
+                     const N_Vector ycor,
+                     const N_Vector ftemp,
+                     N_Vector res);
+#endif
+
 /* -----------------------------------------------------------------------------
  * Exported functions
  * ---------------------------------------------------------------------------*/
@@ -290,8 +299,18 @@ static int cvNlsResidual(N_Vector ycor, N_Vector res, void* cvode_mem)
   if (retval < 0) return(CV_RHSFUNC_FAIL);
   if (retval > 0) return(RHSFUNC_RECVR);
 
-  N_VLinearSum(cv_mem->cv_rl1, cv_mem->cv_zn[1], ONE, ycor, res);
-  N_VLinearSum(-cv_mem->cv_gamma, cv_mem->cv_ftemp, ONE, res, res);
+#ifdef SUNDIALS_BUILD_PACKAGE_FUSED_KERNELS
+  if (cv_mem->cv_usefused)
+  {
+    cvNlsResid_fused(cv_mem->cv_rl1, -cv_mem->cv_gamma, cv_mem->cv_zn[1],
+                     ycor, cv_mem->cv_ftemp, res);
+  }
+  else
+#endif
+  {
+    N_VLinearSum(cv_mem->cv_rl1, cv_mem->cv_zn[1], ONE, ycor, res);
+    N_VLinearSum(-cv_mem->cv_gamma, cv_mem->cv_ftemp, ONE, res, res);
+  }
 
   return(CV_SUCCESS);
 }
diff --git a/src/cvode/cvode_proj.c b/src/cvode/cvode_proj.c
new file mode 100644
index 0000000000..fbae9068fa
--- /dev/null
+++ b/src/cvode/cvode_proj.c
@@ -0,0 +1,477 @@
+/* ---------------------------------------------------------------------------
+ * Programmer(s): David J. Gardner @ LLNL
+ * ---------------------------------------------------------------------------
+ * Based on CPODES by Radu Serban @ LLNL
+ * ---------------------------------------------------------------------------
+ * SUNDIALS Copyright Start
+ * Copyright (c) 2002-2020, Lawrence Livermore National Security
+ * and Southern Methodist University.
+ * All rights reserved.
+ *
+ * See the top-level LICENSE and NOTICE files for details.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ * SUNDIALS Copyright End
+ * ---------------------------------------------------------------------------
+ * Implementation file for projections in CVODE.
+ * ---------------------------------------------------------------------------*/
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "sundials/sundials_math.h"
+#include "cvode_impl.h"
+
+/* Private constants */
+#define ZERO  RCONST(0.0)  /* real 0.0 */
+#define ONE   RCONST(1.0)  /* real 1.0 */
+
+#define ONEPSM RCONST(1.000001)
+
+/* Private utility function prototypes */
+static int cvProjCreate(CVodeProjMem *proj_mem);
+static int cvProjSetDefaults(CVodeProjMem proj_mem);
+static int cvAccessProjMem(void* cvode_mem, const char *fname,
+                           CVodeMem *cv_mem, CVodeProjMem *proj_mem);
+
+
+/* ===========================================================================
+ * Exported Functions - projection initialization
+ * ===========================================================================*/
+
+/* -----------------------------------------------------------------------------
+ * CVodeSetProjFn sets a user defined projection function
+ * ---------------------------------------------------------------------------*/
+int CVodeSetProjFn(void *cvode_mem, CVProjFn pfun)
+{
+  int          retval;
+  CVodeMem     cv_mem;
+  CVodeProjMem proj_mem;
+
+  /* Check the CVODE memory pointer */
+  if (cvode_mem == NULL)
+  {
+    cvProcessError(NULL, CV_MEM_NULL, "CVODE", "CVodeSetProjFn",
+                   MSG_CV_MEM_NULL);
+    return(CV_MEM_NULL);
+  }
+  cv_mem = (CVodeMem) cvode_mem;
+
+  /* Check if the projection function is NULL */
+  if (pfun == NULL)
+  {
+    cvProcessError(cv_mem, CV_ILL_INPUT, "CVODE", "CVodeSetProjFn",
+                   "The projection function is NULL.");
+    return(CV_ILL_INPUT);
+  }
+
+  /* Check for compatible method */
+  if (cv_mem->cv_lmm != CV_BDF)
+  {
+    cvProcessError(cv_mem, CV_ILL_INPUT, "CVODE", "CVodeSetProjFn",
+                   "Projection is only supported with BDF methods.");
+    return(CV_ILL_INPUT);
+  }
+
+  /* Create the projection memory (if necessary) */
+  retval = cvProjCreate(&(cv_mem->proj_mem));
+  if (retval != CV_SUCCESS)
+  {
+    cvProcessError(cv_mem, CV_MEM_FAIL, "CVODE", "CVodeSetProjFn",
+                   MSG_CV_MEM_FAIL);
+    return(CV_MEM_FAIL);
+  }
+
+  /* Shortcut to projection memory */
+  proj_mem = cv_mem->proj_mem;
+
+  /* User-defined projection */
+  proj_mem->internal_proj = SUNFALSE;
+
+  /* Set the projection function */
+  proj_mem->pfun = pfun;
+
+  /* Enable projection */
+  cv_mem->proj_enabled = SUNTRUE;
+
+  return(CV_SUCCESS);
+}
+
+
+/* ===========================================================================
+ * Exported Functions - projection set function
+ * ===========================================================================*/
+
+
+int CVodeSetProjErrEst(void *cvode_mem, booleantype onoff)
+{
+  int          retval;
+  CVodeMem     cv_mem;
+  CVodeProjMem proj_mem;
+
+  /* Access memory structures */
+  retval = cvAccessProjMem(cvode_mem, "CVodeSetProjErrEst",
+                           &cv_mem, &proj_mem);
+  if (retval != CV_SUCCESS) return(retval);
+
+  /* Set projection error flag */
+  proj_mem->err_proj = onoff;
+
+  return(CV_SUCCESS);
+}
+
+
+int CVodeSetProjFrequency(void *cvode_mem, long int freq)
+{
+  int          retval;
+  CVodeMem     cv_mem;
+  CVodeProjMem proj_mem;
+
+  /* Access memory structures */
+  retval = cvAccessProjMem(cvode_mem, "CVodeSetProjFrequency",
+                           &cv_mem, &proj_mem);
+  if (retval != CV_SUCCESS) return(retval);
+
+  /* Set projection frequency */
+  if (freq < 0)
+  {
+    /* Restore default */
+    proj_mem->freq  = 1;
+    cv_mem->proj_enabled = SUNTRUE;
+  }
+  else if (freq == 0)
+  {
+    /* Disable projection */
+    proj_mem->freq = 0;
+    cv_mem->proj_enabled = SUNFALSE;
+  }
+  else
+  {
+    /* Enable projection at given frequency */
+    proj_mem->freq = freq;
+    cv_mem->proj_enabled = SUNTRUE;
+  }
+
+  return(CV_SUCCESS);
+}
+
+
+int CVodeSetMaxNumProjFails(void *cvode_mem, int max_fails)
+{
+  int          retval;
+  CVodeMem     cv_mem;
+  CVodeProjMem proj_mem;
+
+  /* Access memory structures */
+  retval = cvAccessProjMem(cvode_mem, "CVodeSetMaxNumProjFails",
+                           &cv_mem, &proj_mem);
+  if (retval != CV_SUCCESS) return(retval);
+
+  /* Set maximum number of projection failures in a step attempt */
+  if (max_fails < 1)
+  {
+    /* Restore default */
+    proj_mem->max_fails = PROJ_MAX_FAILS;
+  }
+  else
+  {
+    /* Update max number of fails */
+    proj_mem->max_fails = max_fails;
+  }
+
+  return(CV_SUCCESS);
+}
+
+
+int CVodeSetEpsProj(void *cvode_mem, realtype eps)
+{
+  int          retval;
+  CVodeMem     cv_mem;
+  CVodeProjMem proj_mem;
+
+  /* Access memory structures */
+  retval = cvAccessProjMem(cvode_mem, "CVodeSetEpsProj",
+                           &cv_mem, &proj_mem);
+  if (retval != CV_SUCCESS) return(retval);
+
+  /* Set the projection tolerance */
+  if (eps <= ZERO)
+  {
+    /* Restore default */
+    proj_mem->eps_proj = PROJ_EPS;
+  }
+  else
+  {
+    /* Update projection tolerance */
+    proj_mem->eps_proj = eps;
+  }
+
+  return(CV_SUCCESS);
+}
+
+
+int CVodeSetProjFailEta(void *cvode_mem, realtype eta)
+{
+  int          retval;
+  CVodeMem     cv_mem;
+  CVodeProjMem proj_mem;
+
+  /* Access memory structures (returns an error flag if either is NULL) */
+  retval = cvAccessProjMem(cvode_mem, "CVodeSetProjFailEta",
+                           &cv_mem, &proj_mem);
+  if (retval != CV_SUCCESS) return(retval);
+
+  /* Set the projection-failure step size reduction factor; valid in (0,1] */
+  if ((eta <= ZERO) || (eta > ONE))
+  {
+    /* Out-of-range input: restore default */
+    proj_mem->eta_pfail = PROJ_FAIL_ETA;
+  }
+  else
+  {
+    /* Update the eta value with the user-supplied factor */
+    proj_mem->eta_pfail = eta;
+  }
+
+  return(CV_SUCCESS);
+}
+
+
+/* ===========================================================================
+ * Exported Functions - projection get functions
+ * ===========================================================================*/
+
+
+int CVodeGetNumProjEvals(void *cvode_mem, long int *nproj)
+{
+  int          retval;
+  CVodeMem     cv_mem;
+  CVodeProjMem proj_mem;
+
+  /* Access memory structures; pass this function's name for error reports */
+  retval = cvAccessProjMem(cvode_mem, "CVodeGetNumProjEvals",
+                           &cv_mem, &proj_mem);
+  if (retval != CV_SUCCESS) return(retval);
+
+  /* Get number of projection evaluations */
+  *nproj = proj_mem->nproj;
+
+  return(CV_SUCCESS);
+}
+
+
+int CVodeGetNumProjFails(void *cvode_mem, long int *npfails)
+{
+  int          retval;
+  CVodeMem     cv_mem;
+  CVodeProjMem proj_mem;
+
+  /* Access memory structures */
+  retval = cvAccessProjMem(cvode_mem, "CVodeGetNumProjFails",
+                           &cv_mem, &proj_mem);
+  if (retval != CV_SUCCESS) return(retval);
+
+  /* Get number of projection fails */
+  *npfails = proj_mem->npfails;
+
+  return(CV_SUCCESS);
+}
+
+
+/* ===========================================================================
+ * Internal Functions
+ * ===========================================================================*/
+
+
+/*
+ * cvDoProjection
+ *
+ * For a user-supplied projection function, use ftemp as temporary storage
+ * for the current error estimate (acor) and use tempv to store the
+ * accumulated correction due to projection, acorP (tempv is not touched
+ * until it is potentially used in cvCompleteStep).
+ */
+
+int cvDoProjection(CVodeMem cv_mem, int *nflagPtr, realtype saved_t,
+                   int *npfailPtr)
+{
+  int          retval;
+  N_Vector     errP;
+  N_Vector     acorP;
+  CVodeProjMem proj_mem;
+
+  /* Access projection memory */
+  if (cv_mem->proj_mem == NULL) {
+    cvProcessError(cv_mem, CV_PROJ_MEM_NULL, "CVODE",
+                   "cvDoProjection", MSG_CV_PROJ_MEM_NULL);
+    return(CV_PROJ_MEM_NULL);
+  }
+  proj_mem = cv_mem->proj_mem;
+
+  /* Initialize return flag to success */
+  retval = CV_SUCCESS;
+
+  /* Use tempv to store acorP and, if projecting the error, ftemp to store
+     errP (recall that in this case we did not allocate vectors for
+     acorP and errP). */
+  acorP = cv_mem->cv_tempv;
+  if (proj_mem->err_proj)
+    errP = cv_mem->cv_ftemp;
+  else
+    errP = NULL;
+
+  /* Copy acor into errP (if projecting the error) */
+  if (proj_mem->err_proj) N_VScale(ONE, cv_mem->cv_acor, errP);
+
+  /* Call the user projection function */
+  retval = proj_mem->pfun(cv_mem->cv_tn, cv_mem->cv_y, acorP,
+                          proj_mem->eps_proj, errP, cv_mem->cv_user_data);
+  proj_mem->nproj++;
+
+  /* This is not the first projection anymore */
+  proj_mem->first_proj = SUNFALSE;
+
+  /* Check the return value */
+  if (retval == CV_SUCCESS)
+  {
+    /* Recompute acnrm to be used in error test (if projecting the error) */
+    if (proj_mem->err_proj)
+      cv_mem->cv_acnrm = N_VWrmsNorm(errP, cv_mem->cv_ewt);
+
+    /* The projection was successful, return now */
+    cv_mem->proj_applied = SUNTRUE;
+    return(CV_SUCCESS);
+  }
+
+  /* The projection failed: negative -> unrecoverable, positive -> recoverable */
+  if (retval < 0) retval = CV_PROJFUNC_FAIL;
+  if (retval > 0) retval = PROJFUNC_RECVR;
+
+  /* Increment cumulative failure count and restore zn */
+  proj_mem->npfails++;
+  cvRestore(cv_mem, saved_t);
+
+  /* Return if failed unrecoverably */
+  if (retval == CV_PROJFUNC_FAIL) return(CV_PROJFUNC_FAIL);
+
+  /* Recoverable failure, increment failure count for this step attempt */
+  (*npfailPtr)++;
+  cv_mem->cv_etamax = ONE;
+
+  /* Check for maximum number of failures or |h| = hmin */
+  if ((SUNRabs(cv_mem->cv_h) <= cv_mem->cv_hmin * ONEPSM) ||
+      (*npfailPtr == proj_mem->max_fails))
+  {
+    if (retval == PROJFUNC_RECVR) return(CV_REPTD_PROJFUNC_ERR);
+  }
+
+  /* Reduce step size; return to reattempt the step */
+  cv_mem->cv_eta = SUNMAX(proj_mem->eta_pfail,
+                          cv_mem->cv_hmin / SUNRabs(cv_mem->cv_h));
+  *nflagPtr = PREV_PROJ_FAIL;
+  cvRescale(cv_mem);
+
+  return(PREDICT_AGAIN);
+}
+
+
+int cvProjInit(CVodeProjMem proj_mem)
+{
+  /* check if projection memory exists */
+  if (proj_mem == NULL) return(CV_PROJ_MEM_NULL);
+
+  /* reset flags and counters */
+  proj_mem->first_proj = SUNTRUE;
+  proj_mem->nstlprj    = 0;
+  proj_mem->nproj      = 0;
+  proj_mem->npfails    = 0;
+
+  return(CV_SUCCESS);
+}
+
+
+int cvProjFree(CVodeProjMem *proj_mem)
+{
+  if (*proj_mem == NULL) return(CV_SUCCESS);
+
+  free(*proj_mem);
+  *proj_mem = NULL;
+
+  return(CV_SUCCESS);
+}
+
+
+/* ===========================================================================
+ * Utility Functions
+ * ===========================================================================*/
+
+static int cvProjCreate(CVodeProjMem *proj_mem)
+{
+  int retval;
+
+  /* Allocate projection memory if necessary, otherwise return success */
+  if (*proj_mem == NULL)
+  {
+    *proj_mem = (CVodeProjMem) malloc(sizeof(struct CVodeProjMemRec));
+    if (*proj_mem == NULL) return(CV_MEM_FAIL);
+
+    /* Zero out proj_mem */
+    memset(*proj_mem, 0, sizeof(struct CVodeProjMemRec));
+
+    /* Initialize projection variables */
+    retval = cvProjSetDefaults(*proj_mem);
+    if (retval != CV_SUCCESS) return(retval);
+  }
+
+  return(CV_SUCCESS);
+}
+
+
+static int cvProjSetDefaults(CVodeProjMem proj_mem)
+{
+  if (proj_mem == NULL) return(CV_MEM_FAIL);
+
+  proj_mem->internal_proj = SUNTRUE;
+  proj_mem->err_proj      = SUNTRUE;
+  proj_mem->first_proj    = SUNTRUE;
+
+  proj_mem->freq    = 1;
+  proj_mem->nstlprj = 0;
+
+  proj_mem->max_fails = PROJ_MAX_FAILS;
+
+  proj_mem->pfun = NULL;
+
+  proj_mem->eps_proj  = PROJ_EPS;
+  proj_mem->eta_pfail = PROJ_FAIL_ETA;
+
+  proj_mem->nproj   = 0;
+  proj_mem->npfails = 0;
+
+  return(CV_SUCCESS);
+}
+
+
+static int cvAccessProjMem(void* cvode_mem, const char *fname,
+                           CVodeMem *cv_mem, CVodeProjMem *proj_mem)
+{
+  /* Access cvode memory */
+  if (cvode_mem == NULL)
+  {
+    cvProcessError(NULL, CV_MEM_NULL, "CVODE",
+                   fname, MSG_CV_MEM_NULL);
+    return(CV_MEM_NULL);
+  }
+  *cv_mem = (CVodeMem) cvode_mem;
+
+  /* Access projection memory */
+  if ((*cv_mem)->proj_mem == NULL)
+  {
+    cvProcessError(*cv_mem, CV_PROJ_MEM_NULL, "CVODE",
+                   fname, MSG_CV_PROJ_MEM_NULL);
+    return(CV_PROJ_MEM_NULL);
+  }
+  *proj_mem = (CVodeProjMem) (*cv_mem)->proj_mem;
+
+  return(CV_SUCCESS);
+}
diff --git a/src/cvode/cvode_proj_impl.h b/src/cvode/cvode_proj_impl.h
new file mode 100644
index 0000000000..43a33ab51d
--- /dev/null
+++ b/src/cvode/cvode_proj_impl.h
@@ -0,0 +1,75 @@
+/* -----------------------------------------------------------------------------
+ * Programmer(s): David J. Gardner @ LLNL
+ * -----------------------------------------------------------------------------
+ * Based on CPODES by Radu Serban @ LLNL
+ * -----------------------------------------------------------------------------
+ * SUNDIALS Copyright Start
+ * Copyright (c) 2002-2020, Lawrence Livermore National Security
+ * and Southern Methodist University.
+ * All rights reserved.
+ *
+ * See the top-level LICENSE and NOTICE files for details.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ * SUNDIALS Copyright End
+ * -----------------------------------------------------------------------------
+ * Implementation header file for projections in CVODE.
+ * ---------------------------------------------------------------------------*/
+
+#ifndef _CVODE_PROJ_IMPL_H
+#define _CVODE_PROJ_IMPL_H
+
+#include "cvode/cvode.h"
+
+#ifdef __cplusplus  /* wrapper to enable C++ usage */
+extern "C" {
+#endif
+
+/* =============================================================================
+ * Default Projection Constants
+ *
+ * PROJ_MAX_FAILS  max number of projection failures in one step attempt
+ * PROJ_EPS        projection solve tolerance
+ * PROJ_FAIL_ETA   maximum step size decrease on projection failure
+ * ===========================================================================*/
+
+#define PROJ_MAX_FAILS 10
+#define PROJ_EPS       RCONST(0.1)
+#define PROJ_FAIL_ETA  RCONST(0.25)
+
+/* =============================================================================
+ * Projection Data Structure
+ * ===========================================================================*/
+
+/* -----------------------------------------------------------------------------
+ * Types : struct CVodeProjMemRec, CVodeProjMem
+ * -----------------------------------------------------------------------------
+ * The type CVodeProjMem is type pointer to struct CVodeProjMemRec. This
+ * structure contains data pertaining to the use of projection capabilities.
+ * ---------------------------------------------------------------------------*/
+typedef struct CVodeProjMemRec {
+
+  booleantype internal_proj;  /* use the internal projection algorithm?      */
+  booleantype err_proj;       /* is error projection enabled?                */
+  booleantype first_proj;     /* is this the first time we project?          */
+
+  long int freq;              /* projection frequency                        */
+  long int nstlprj;           /* step number of last projection              */
+
+  int max_fails;              /* maximum number of projection failures       */
+
+  CVProjFn pfun;              /* function to perform projection              */
+
+  realtype eps_proj;          /* projection solve tolerance                  */
+  realtype eta_pfail;         /* projection failure step reduction factor    */
+
+  long int nproj;             /* number of projections performed             */
+  long int npfails;           /* number of projection failures               */
+
+} *CVodeProjMem;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/cvode/fmod/fcvode_mod.c b/src/cvode/fmod/fcvode_mod.c
index fd70cd90e1..1d97f9f60e 100644
--- a/src/cvode/fmod/fcvode_mod.c
+++ b/src/cvode/fmod/fcvode_mod.c
@@ -208,6 +208,7 @@
 #include "cvode/cvode_bbdpre.h"
 #include "cvode/cvode_diag.h"
 #include "cvode/cvode_ls.h"
+#include "cvode/cvode_proj.h"
 
 
 #include <stdlib.h>
@@ -373,6 +374,34 @@ SWIGEXPORT int _wrap_FCVodeSetUserData(void *farg1, void *farg2) {
 }
 
 
+SWIGEXPORT int _wrap_FCVodeSetMonitorFn(void *farg1, CVMonitorFn farg2) {
+  int fresult ;
+  void *arg1 = (void *) 0 ;
+  CVMonitorFn arg2 = (CVMonitorFn) 0 ;
+  int result;
+  
+  arg1 = (void *)(farg1);
+  arg2 = (CVMonitorFn)(farg2);
+  result = (int)CVodeSetMonitorFn(arg1,arg2);
+  fresult = (int)(result);
+  return fresult;
+}
+
+
+SWIGEXPORT int _wrap_FCVodeSetMonitorFrequency(void *farg1, long const *farg2) {
+  int fresult ;
+  void *arg1 = (void *) 0 ;
+  long arg2 ;
+  int result;
+  
+  arg1 = (void *)(farg1);
+  arg2 = (long)(*farg2);
+  result = (int)CVodeSetMonitorFrequency(arg1,arg2);
+  fresult = (int)(result);
+  return fresult;
+}
+
+
 SWIGEXPORT int _wrap_FCVodeSetMaxOrd(void *farg1, int const *farg2) {
   int fresult ;
   void *arg1 = (void *) 0 ;
@@ -569,6 +598,20 @@ SWIGEXPORT int _wrap_FCVodeSetNonlinearSolver(void *farg1, SUNNonlinearSolver fa
 }
 
 
+SWIGEXPORT int _wrap_FCVodeSetUseIntegratorFusedKernels(void *farg1, int const *farg2) {
+  int fresult ;
+  void *arg1 = (void *) 0 ;
+  int arg2 ;
+  int result;
+  
+  arg1 = (void *)(farg1);
+  arg2 = (int)(*farg2);
+  result = (int)CVodeSetUseIntegratorFusedKernels(arg1,arg2);
+  fresult = (int)(result);
+  return fresult;
+}
+
+
 SWIGEXPORT int _wrap_FCVodeRootInit(void *farg1, int const *farg2, CVRootFn farg3) {
   int fresult ;
   void *arg1 = (void *) 0 ;
@@ -1014,6 +1057,20 @@ SWIGEXPORT void _wrap_FCVodeFree(void *farg1) {
 }
 
 
+SWIGEXPORT int _wrap_FCVodeSetJacTimesRhsFn(void *farg1, CVRhsFn farg2) {
+  int fresult ;
+  void *arg1 = (void *) 0 ;
+  CVRhsFn arg2 = (CVRhsFn) 0 ;
+  int result;
+  
+  arg1 = (void *)(farg1);
+  arg2 = (CVRhsFn)(farg2);
+  result = (int)CVodeSetJacTimesRhsFn(arg1,arg2);
+  fresult = (int)(result);
+  return fresult;
+}
+
+
 SWIGEXPORT int _wrap_FCVBandPrecInit(void *farg1, int64_t const *farg2, int64_t const *farg3, int64_t const *farg4) {
   int fresult ;
   void *arg1 = (void *) 0 ;
@@ -1453,6 +1510,34 @@ SWIGEXPORT int _wrap_FCVodeGetNumLinRhsEvals(void *farg1, long *farg2) {
 }
 
 
+SWIGEXPORT int _wrap_FCVodeGetLinSolveStats(void *farg1, long *farg2, long *farg3, long *farg4, long *farg5, long *farg6, long *farg7, long *farg8, long *farg9) {
+  int fresult ;
+  void *arg1 = (void *) 0 ;
+  long *arg2 = (long *) 0 ;
+  long *arg3 = (long *) 0 ;
+  long *arg4 = (long *) 0 ;
+  long *arg5 = (long *) 0 ;
+  long *arg6 = (long *) 0 ;
+  long *arg7 = (long *) 0 ;
+  long *arg8 = (long *) 0 ;
+  long *arg9 = (long *) 0 ;
+  int result;
+  
+  arg1 = (void *)(farg1);
+  arg2 = (long *)(farg2);
+  arg3 = (long *)(farg3);
+  arg4 = (long *)(farg4);
+  arg5 = (long *)(farg5);
+  arg6 = (long *)(farg6);
+  arg7 = (long *)(farg7);
+  arg8 = (long *)(farg8);
+  arg9 = (long *)(farg9);
+  result = (int)CVodeGetLinSolveStats(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9);
+  fresult = (int)(result);
+  return fresult;
+}
+
+
 SWIGEXPORT int _wrap_FCVodeGetLastLinFlag(void *farg1, long *farg2) {
   int fresult ;
   void *arg1 = (void *) 0 ;
@@ -1480,4 +1565,116 @@ SWIGEXPORT SwigArrayWrapper _wrap_FCVodeGetLinReturnFlagName(long const *farg1)
 }
 
 
+SWIGEXPORT int _wrap_FCVodeSetProjFn(void *farg1, CVProjFn farg2) {
+  int fresult ;
+  void *arg1 = (void *) 0 ;
+  CVProjFn arg2 = (CVProjFn) 0 ;
+  int result;
+  
+  arg1 = (void *)(farg1);
+  arg2 = (CVProjFn)(farg2);
+  result = (int)CVodeSetProjFn(arg1,arg2);
+  fresult = (int)(result);
+  return fresult;
+}
+
+
+SWIGEXPORT int _wrap_FCVodeSetProjErrEst(void *farg1, int const *farg2) {
+  int fresult ;
+  void *arg1 = (void *) 0 ;
+  int arg2 ;
+  int result;
+  
+  arg1 = (void *)(farg1);
+  arg2 = (int)(*farg2);
+  result = (int)CVodeSetProjErrEst(arg1,arg2);
+  fresult = (int)(result);
+  return fresult;
+}
+
+
+SWIGEXPORT int _wrap_FCVodeSetProjFrequency(void *farg1, long const *farg2) {
+  int fresult ;
+  void *arg1 = (void *) 0 ;
+  long arg2 ;
+  int result;
+  
+  arg1 = (void *)(farg1);
+  arg2 = (long)(*farg2);
+  result = (int)CVodeSetProjFrequency(arg1,arg2);
+  fresult = (int)(result);
+  return fresult;
+}
+
+
+SWIGEXPORT int _wrap_FCVodeSetMaxNumProjFails(void *farg1, int const *farg2) {
+  int fresult ;
+  void *arg1 = (void *) 0 ;
+  int arg2 ;
+  int result;
+  
+  arg1 = (void *)(farg1);
+  arg2 = (int)(*farg2);
+  result = (int)CVodeSetMaxNumProjFails(arg1,arg2);
+  fresult = (int)(result);
+  return fresult;
+}
+
+
+SWIGEXPORT int _wrap_FCVodeSetEpsProj(void *farg1, double const *farg2) {
+  int fresult ;
+  void *arg1 = (void *) 0 ;
+  realtype arg2 ;
+  int result;
+  
+  arg1 = (void *)(farg1);
+  arg2 = (realtype)(*farg2);
+  result = (int)CVodeSetEpsProj(arg1,arg2);
+  fresult = (int)(result);
+  return fresult;
+}
+
+
+SWIGEXPORT int _wrap_FCVodeSetProjFailEta(void *farg1, double const *farg2) {
+  int fresult ;
+  void *arg1 = (void *) 0 ;
+  realtype arg2 ;
+  int result;
+  
+  arg1 = (void *)(farg1);
+  arg2 = (realtype)(*farg2);
+  result = (int)CVodeSetProjFailEta(arg1,arg2);
+  fresult = (int)(result);
+  return fresult;
+}
+
+
+SWIGEXPORT int _wrap_FCVodeGetNumProjEvals(void *farg1, long *farg2) {
+  int fresult ;
+  void *arg1 = (void *) 0 ;
+  long *arg2 = (long *) 0 ;
+  int result;
+  
+  arg1 = (void *)(farg1);
+  arg2 = (long *)(farg2);
+  result = (int)CVodeGetNumProjEvals(arg1,arg2);
+  fresult = (int)(result);
+  return fresult;
+}
+
+
+SWIGEXPORT int _wrap_FCVodeGetNumProjFails(void *farg1, long *farg2) {
+  int fresult ;
+  void *arg1 = (void *) 0 ;
+  long *arg2 = (long *) 0 ;
+  int result;
+  
+  arg1 = (void *)(farg1);
+  arg2 = (long *)(farg2);
+  result = (int)CVodeGetNumProjFails(arg1,arg2);
+  fresult = (int)(result);
+  return fresult;
+}
+
+
 
diff --git a/src/cvode/fmod/fcvode_mod.f90 b/src/cvode/fmod/fcvode_mod.f90
index faf6d809e9..8413036a11 100644
--- a/src/cvode/fmod/fcvode_mod.f90
+++ b/src/cvode/fmod/fcvode_mod.f90
@@ -68,6 +68,9 @@ module fcvode_mod
  integer(C_INT), parameter, public :: CV_BAD_DKY = -26_C_INT
  integer(C_INT), parameter, public :: CV_TOO_CLOSE = -27_C_INT
  integer(C_INT), parameter, public :: CV_VECTOROP_ERR = -28_C_INT
+ integer(C_INT), parameter, public :: CV_PROJ_MEM_NULL = -29_C_INT
+ integer(C_INT), parameter, public :: CV_PROJFUNC_FAIL = -30_C_INT
+ integer(C_INT), parameter, public :: CV_REPTD_PROJFUNC_ERR = -31_C_INT
  integer(C_INT), parameter, public :: CV_UNRECOGNIZED_ERR = -99_C_INT
  public :: FCVodeCreate
  public :: FCVodeInit
@@ -78,6 +81,8 @@ module fcvode_mod
  public :: FCVodeSetErrHandlerFn
  public :: FCVodeSetErrFile
  public :: FCVodeSetUserData
+ public :: FCVodeSetMonitorFn
+ public :: FCVodeSetMonitorFrequency
  public :: FCVodeSetMaxOrd
  public :: FCVodeSetMaxNumSteps
  public :: FCVodeSetMaxHnilWarns
@@ -92,6 +97,7 @@ module fcvode_mod
  public :: FCVodeSetNonlinConvCoef
  public :: FCVodeSetConstraints
  public :: FCVodeSetNonlinearSolver
+ public :: FCVodeSetUseIntegratorFusedKernels
  public :: FCVodeRootInit
  public :: FCVodeSetRootDirection
  public :: FCVodeSetNoInactiveRootWarn
@@ -126,6 +132,7 @@ module fcvode_mod
  end type
  public :: FCVodeGetReturnFlagName
  public :: FCVodeFree
+ public :: FCVodeSetJacTimesRhsFn
  public :: FCVBandPrecInit
  public :: FCVBandPrecGetWorkSpace
  public :: FCVBandPrecGetNumRhsEvals
@@ -173,8 +180,17 @@ module fcvode_mod
  public :: FCVodeGetNumJTSetupEvals
  public :: FCVodeGetNumJtimesEvals
  public :: FCVodeGetNumLinRhsEvals
+ public :: FCVodeGetLinSolveStats
  public :: FCVodeGetLastLinFlag
  public :: FCVodeGetLinReturnFlagName
+ public :: FCVodeSetProjFn
+ public :: FCVodeSetProjErrEst
+ public :: FCVodeSetProjFrequency
+ public :: FCVodeSetMaxNumProjFails
+ public :: FCVodeSetEpsProj
+ public :: FCVodeSetProjFailEta
+ public :: FCVodeGetNumProjEvals
+ public :: FCVodeGetNumProjFails
 
 ! WRAPPER DECLARATIONS
 interface
@@ -264,6 +280,24 @@ function swigc_FCVodeSetUserData(farg1, farg2) &
 integer(C_INT) :: fresult
 end function
 
+function swigc_FCVodeSetMonitorFn(farg1, farg2) &
+bind(C, name="_wrap_FCVodeSetMonitorFn") &
+result(fresult)
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR), value :: farg1
+type(C_FUNPTR), value :: farg2
+integer(C_INT) :: fresult
+end function
+
+function swigc_FCVodeSetMonitorFrequency(farg1, farg2) &
+bind(C, name="_wrap_FCVodeSetMonitorFrequency") &
+result(fresult)
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR), value :: farg1
+integer(C_LONG), intent(in) :: farg2
+integer(C_INT) :: fresult
+end function
+
 function swigc_FCVodeSetMaxOrd(farg1, farg2) &
 bind(C, name="_wrap_FCVodeSetMaxOrd") &
 result(fresult)
@@ -390,6 +424,15 @@ function swigc_FCVodeSetNonlinearSolver(farg1, farg2) &
 integer(C_INT) :: fresult
 end function
 
+function swigc_FCVodeSetUseIntegratorFusedKernels(farg1, farg2) &
+bind(C, name="_wrap_FCVodeSetUseIntegratorFusedKernels") &
+result(fresult)
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR), value :: farg1
+integer(C_INT), intent(in) :: farg2
+integer(C_INT) :: fresult
+end function
+
 function swigc_FCVodeRootInit(farg1, farg2, farg3) &
 bind(C, name="_wrap_FCVodeRootInit") &
 result(fresult)
@@ -678,6 +721,15 @@ subroutine swigc_FCVodeFree(farg1) &
 type(C_PTR), value :: farg1
 end subroutine
 
+function swigc_FCVodeSetJacTimesRhsFn(farg1, farg2) &
+bind(C, name="_wrap_FCVodeSetJacTimesRhsFn") &
+result(fresult)
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR), value :: farg1
+type(C_FUNPTR), value :: farg2
+integer(C_INT) :: fresult
+end function
+
 function swigc_FCVBandPrecInit(farg1, farg2, farg3, farg4) &
 bind(C, name="_wrap_FCVBandPrecInit") &
 result(fresult)
@@ -956,6 +1008,22 @@ function swigc_FCVodeGetNumLinRhsEvals(farg1, farg2) &
 integer(C_INT) :: fresult
 end function
 
+function swigc_FCVodeGetLinSolveStats(farg1, farg2, farg3, farg4, farg5, farg6, farg7, farg8, farg9) &
+bind(C, name="_wrap_FCVodeGetLinSolveStats") &
+result(fresult)
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR), value :: farg1
+type(C_PTR), value :: farg2
+type(C_PTR), value :: farg3
+type(C_PTR), value :: farg4
+type(C_PTR), value :: farg5
+type(C_PTR), value :: farg6
+type(C_PTR), value :: farg7
+type(C_PTR), value :: farg8
+type(C_PTR), value :: farg9
+integer(C_INT) :: fresult
+end function
+
 function swigc_FCVodeGetLastLinFlag(farg1, farg2) &
 bind(C, name="_wrap_FCVodeGetLastLinFlag") &
 result(fresult)
@@ -974,6 +1042,78 @@ function swigc_FCVodeGetLinReturnFlagName(farg1) &
 type(SwigArrayWrapper) :: fresult
 end function
 
+function swigc_FCVodeSetProjFn(farg1, farg2) &
+bind(C, name="_wrap_FCVodeSetProjFn") &
+result(fresult)
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR), value :: farg1
+type(C_FUNPTR), value :: farg2
+integer(C_INT) :: fresult
+end function
+
+function swigc_FCVodeSetProjErrEst(farg1, farg2) &
+bind(C, name="_wrap_FCVodeSetProjErrEst") &
+result(fresult)
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR), value :: farg1
+integer(C_INT), intent(in) :: farg2
+integer(C_INT) :: fresult
+end function
+
+function swigc_FCVodeSetProjFrequency(farg1, farg2) &
+bind(C, name="_wrap_FCVodeSetProjFrequency") &
+result(fresult)
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR), value :: farg1
+integer(C_LONG), intent(in) :: farg2
+integer(C_INT) :: fresult
+end function
+
+function swigc_FCVodeSetMaxNumProjFails(farg1, farg2) &
+bind(C, name="_wrap_FCVodeSetMaxNumProjFails") &
+result(fresult)
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR), value :: farg1
+integer(C_INT), intent(in) :: farg2
+integer(C_INT) :: fresult
+end function
+
+function swigc_FCVodeSetEpsProj(farg1, farg2) &
+bind(C, name="_wrap_FCVodeSetEpsProj") &
+result(fresult)
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR), value :: farg1
+real(C_DOUBLE), intent(in) :: farg2
+integer(C_INT) :: fresult
+end function
+
+function swigc_FCVodeSetProjFailEta(farg1, farg2) &
+bind(C, name="_wrap_FCVodeSetProjFailEta") &
+result(fresult)
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR), value :: farg1
+real(C_DOUBLE), intent(in) :: farg2
+integer(C_INT) :: fresult
+end function
+
+function swigc_FCVodeGetNumProjEvals(farg1, farg2) &
+bind(C, name="_wrap_FCVodeGetNumProjEvals") &
+result(fresult)
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR), value :: farg1
+type(C_PTR), value :: farg2
+integer(C_INT) :: fresult
+end function
+
+function swigc_FCVodeGetNumProjFails(farg1, farg2) &
+bind(C, name="_wrap_FCVodeGetNumProjFails") &
+result(fresult)
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR), value :: farg1
+type(C_PTR), value :: farg2
+integer(C_INT) :: fresult
+end function
+
 end interface
 
 
@@ -1138,6 +1278,38 @@ function FCVodeSetUserData(cvode_mem, user_data) &
 swig_result = fresult
 end function
 
+function FCVodeSetMonitorFn(cvode_mem, fn) result(swig_result)
+! Wrapper that passes cvode_mem and the function pointer fn to
+! swigc_FCVodeSetMonitorFn and returns that function's integer(C_INT) result.
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR) :: cvode_mem
+type(C_FUNPTR), intent(in), value :: fn
+integer(C_INT) :: swig_result
+type(C_PTR) :: mem_arg
+type(C_FUNPTR) :: fn_arg
+
+! Stage the dummy arguments in locals before making the call.
+mem_arg = cvode_mem
+fn_arg = fn
+swig_result = swigc_FCVodeSetMonitorFn(mem_arg, fn_arg)
+end function FCVodeSetMonitorFn
+
+function FCVodeSetMonitorFrequency(cvode_mem, nst) result(swig_result)
+! Wrapper that passes cvode_mem and the C long nst to
+! swigc_FCVodeSetMonitorFrequency and returns its integer(C_INT) result.
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR) :: cvode_mem
+integer(C_LONG), intent(in) :: nst
+integer(C_INT) :: swig_result
+type(C_PTR) :: mem_arg
+integer(C_LONG) :: nst_arg
+
+! Stage the dummy arguments in locals before making the call.
+mem_arg = cvode_mem
+nst_arg = nst
+swig_result = swigc_FCVodeSetMonitorFrequency(mem_arg, nst_arg)
+end function FCVodeSetMonitorFrequency
+
 function FCVodeSetMaxOrd(cvode_mem, maxord) &
 result(swig_result)
 use, intrinsic :: ISO_C_BINDING
@@ -1362,6 +1534,22 @@ function FCVodeSetNonlinearSolver(cvode_mem, nls) &
 swig_result = fresult
 end function
 
+function FCVodeSetUseIntegratorFusedKernels(cvode_mem, onoff) result(swig_result)
+! Wrapper that passes cvode_mem and the C int onoff to
+! swigc_FCVodeSetUseIntegratorFusedKernels and returns its integer(C_INT) result.
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR) :: cvode_mem
+integer(C_INT), intent(in) :: onoff
+integer(C_INT) :: swig_result
+type(C_PTR) :: mem_arg
+integer(C_INT) :: onoff_arg
+
+! Stage the dummy arguments in locals before making the call.
+mem_arg = cvode_mem
+onoff_arg = onoff
+swig_result = swigc_FCVodeSetUseIntegratorFusedKernels(mem_arg, onoff_arg)
+end function FCVodeSetUseIntegratorFusedKernels
+
 function FCVodeRootInit(cvode_mem, nrtfn, g) &
 result(swig_result)
 use, intrinsic :: ISO_C_BINDING
@@ -1895,6 +2083,22 @@ subroutine FCVodeFree(cvode_mem)
 call swigc_FCVodeFree(farg1)
 end subroutine
 
+function FCVodeSetJacTimesRhsFn(cvode_mem, jtimesrhsfn) result(swig_result)
+! Wrapper that passes cvode_mem and the function pointer jtimesrhsfn to
+! swigc_FCVodeSetJacTimesRhsFn and returns its integer(C_INT) result.
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR) :: cvode_mem
+type(C_FUNPTR), intent(in), value :: jtimesrhsfn
+integer(C_INT) :: swig_result
+type(C_PTR) :: mem_arg
+type(C_FUNPTR) :: fn_arg
+
+! Stage the dummy arguments in locals before making the call.
+mem_arg = cvode_mem
+fn_arg = jtimesrhsfn
+swig_result = swigc_FCVodeSetJacTimesRhsFn(mem_arg, fn_arg)
+end function FCVodeSetJacTimesRhsFn
+
 function FCVBandPrecInit(cvode_mem, n, mu, ml) &
 result(swig_result)
 use, intrinsic :: ISO_C_BINDING
@@ -2408,6 +2612,43 @@ function FCVodeGetNumLinRhsEvals(cvode_mem, nfevalsls) &
 swig_result = fresult
 end function
 
+function FCVodeGetLinSolveStats(cvode_mem, njevals, nfevalsls, nliters, nlcfails, npevals, npsolves, njtsetups, njtimes) &
+result(swig_result)
+! Hands cvode_mem and the C address of the first element of each of the
+! eight counter arrays to swigc_FCVodeGetLinSolveStats; returns its result.
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR) :: cvode_mem
+integer(C_LONG), dimension(*), target, intent(inout) :: njevals, nfevalsls
+integer(C_LONG), dimension(*), target, intent(inout) :: nliters, nlcfails
+integer(C_LONG), dimension(*), target, intent(inout) :: npevals, npsolves
+integer(C_LONG), dimension(*), target, intent(inout) :: njtsetups, njtimes
+integer(C_INT) :: swig_result
+type(C_PTR) :: mem_arg
+type(C_PTR) :: p_njevals, p_nfevalsls
+type(C_PTR) :: p_nliters, p_nlcfails
+type(C_PTR) :: p_npevals, p_npsolves
+type(C_PTR) :: p_njtsetups, p_njtimes
+
+! Stage the handle, then take the address of element 1 of each array.
+mem_arg = cvode_mem
+p_njevals = c_loc(njevals(1))
+p_nfevalsls = c_loc(nfevalsls(1))
+p_nliters = c_loc(nliters(1))
+p_nlcfails = c_loc(nlcfails(1))
+p_npevals = c_loc(npevals(1))
+p_npsolves = c_loc(npsolves(1))
+p_njtsetups = c_loc(njtsetups(1))
+p_njtimes = c_loc(njtimes(1))
+
+! Single call into the C binding; its return value is the function result.
+swig_result = swigc_FCVodeGetLinSolveStats( &
+  mem_arg, &
+  p_njevals, p_nfevalsls, &
+  p_nliters, p_nlcfails, &
+  p_npevals, p_npsolves, &
+  p_njtsetups, p_njtimes)
+end function FCVodeGetLinSolveStats
+
 function FCVodeGetLastLinFlag(cvode_mem, flag) &
 result(swig_result)
 use, intrinsic :: ISO_C_BINDING
@@ -2438,5 +2679,133 @@ function FCVodeGetLinReturnFlagName(flag) &
 if (.false.) call SWIG_free(fresult%data)
 end function
 
+function FCVodeSetProjFn(cvode_mem, pfun) result(swig_result)
+! Wrapper that passes cvode_mem and the function pointer pfun to
+! swigc_FCVodeSetProjFn and returns that function's integer(C_INT) result.
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR) :: cvode_mem
+type(C_FUNPTR), intent(in), value :: pfun
+integer(C_INT) :: swig_result
+type(C_PTR) :: mem_arg
+type(C_FUNPTR) :: fn_arg
+
+! Stage the dummy arguments in locals before making the call.
+mem_arg = cvode_mem
+fn_arg = pfun
+swig_result = swigc_FCVodeSetProjFn(mem_arg, fn_arg)
+end function FCVodeSetProjFn
+
+function FCVodeSetProjErrEst(cvode_mem, onoff) result(swig_result)
+! Wrapper that passes cvode_mem and the C int onoff to
+! swigc_FCVodeSetProjErrEst and returns that function's integer(C_INT) result.
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR) :: cvode_mem
+integer(C_INT), intent(in) :: onoff
+integer(C_INT) :: swig_result
+type(C_PTR) :: mem_arg
+integer(C_INT) :: onoff_arg
+
+! Stage the dummy arguments in locals before making the call.
+mem_arg = cvode_mem
+onoff_arg = onoff
+swig_result = swigc_FCVodeSetProjErrEst(mem_arg, onoff_arg)
+end function FCVodeSetProjErrEst
+
+function FCVodeSetProjFrequency(cvode_mem, proj_freq) result(swig_result)
+! Wrapper that passes cvode_mem and the C long proj_freq to
+! swigc_FCVodeSetProjFrequency and returns its integer(C_INT) result.
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR) :: cvode_mem
+integer(C_LONG), intent(in) :: proj_freq
+integer(C_INT) :: swig_result
+type(C_PTR) :: mem_arg
+integer(C_LONG) :: freq_arg
+
+! Stage the dummy arguments in locals before making the call.
+mem_arg = cvode_mem
+freq_arg = proj_freq
+swig_result = swigc_FCVodeSetProjFrequency(mem_arg, freq_arg)
+end function FCVodeSetProjFrequency
+
+function FCVodeSetMaxNumProjFails(cvode_mem, max_fails) result(swig_result)
+! Wrapper that passes cvode_mem and the C int max_fails to
+! swigc_FCVodeSetMaxNumProjFails and returns its integer(C_INT) result.
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR) :: cvode_mem
+integer(C_INT), intent(in) :: max_fails
+integer(C_INT) :: swig_result
+type(C_PTR) :: mem_arg
+integer(C_INT) :: fails_arg
+
+! Stage the dummy arguments in locals before making the call.
+mem_arg = cvode_mem
+fails_arg = max_fails
+swig_result = swigc_FCVodeSetMaxNumProjFails(mem_arg, fails_arg)
+end function FCVodeSetMaxNumProjFails
+
+function FCVodeSetEpsProj(cvode_mem, eps) result(swig_result)
+! Wrapper that passes cvode_mem and the C double eps to
+! swigc_FCVodeSetEpsProj and returns that function's integer(C_INT) result.
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR) :: cvode_mem
+real(C_DOUBLE), intent(in) :: eps
+integer(C_INT) :: swig_result
+type(C_PTR) :: mem_arg
+real(C_DOUBLE) :: eps_arg
+
+! Stage the dummy arguments in locals before making the call.
+mem_arg = cvode_mem
+eps_arg = eps
+swig_result = swigc_FCVodeSetEpsProj(mem_arg, eps_arg)
+end function FCVodeSetEpsProj
+
+function FCVodeSetProjFailEta(cvode_mem, eta) result(swig_result)
+! Wrapper that passes cvode_mem and the C double eta to
+! swigc_FCVodeSetProjFailEta and returns that function's integer(C_INT) result.
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR) :: cvode_mem
+real(C_DOUBLE), intent(in) :: eta
+integer(C_INT) :: swig_result
+type(C_PTR) :: mem_arg
+real(C_DOUBLE) :: eta_arg
+
+! Stage the dummy arguments in locals before making the call.
+mem_arg = cvode_mem
+eta_arg = eta
+swig_result = swigc_FCVodeSetProjFailEta(mem_arg, eta_arg)
+end function FCVodeSetProjFailEta
+
+function FCVodeGetNumProjEvals(cvode_mem, nproj) result(swig_result)
+! Wrapper that passes cvode_mem and the C address of nproj(1) to
+! swigc_FCVodeGetNumProjEvals and returns its integer(C_INT) result.
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR) :: cvode_mem
+integer(C_LONG), dimension(*), target, intent(inout) :: nproj
+integer(C_INT) :: swig_result
+type(C_PTR) :: mem_arg
+type(C_PTR) :: ptr_arg
+
+! Stage the handle and the address of the first array element.
+mem_arg = cvode_mem
+ptr_arg = c_loc(nproj(1))
+swig_result = swigc_FCVodeGetNumProjEvals(mem_arg, ptr_arg)
+end function FCVodeGetNumProjEvals
+
+function FCVodeGetNumProjFails(cvode_mem, nprf) result(swig_result)
+! Wrapper that passes cvode_mem and the C address of nprf(1) to
+! swigc_FCVodeGetNumProjFails and returns its integer(C_INT) result.
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR) :: cvode_mem
+integer(C_LONG), dimension(*), target, intent(inout) :: nprf
+integer(C_INT) :: swig_result
+type(C_PTR) :: mem_arg
+type(C_PTR) :: ptr_arg
+
+! Stage the handle and the address of the first array element.
+mem_arg = cvode_mem
+ptr_arg = c_loc(nprf(1))
+swig_result = swigc_FCVodeGetNumProjFails(mem_arg, ptr_arg)
+end function FCVodeGetNumProjFails
+
 
 end module
diff --git a/src/cvodes/README.md b/src/cvodes/README.md
index df95639d42..5033447406 100644
--- a/src/cvodes/README.md
+++ b/src/cvodes/README.md
@@ -1,5 +1,5 @@
 # CVODES
-### Version 5.2.0 (Mar 2020)
+### Version 5.3.0 (May 2020)
 
 **Alan C. Hindmarsh and Radu Serban  
   Center for Applied Scientific Computing, LLNL**
@@ -41,11 +41,11 @@ the "SUNDIALS Release History" appendix of the CVODES User Guide.
 
 ## References
 
-* A. C. Hindmarsh and R. Serban, "User Documentation for CVODES v5.2.0,"
-  LLNL technical report UCRL-SM-208111, Mar 2020.
+* A. C. Hindmarsh and R. Serban, "User Documentation for CVODES v5.3.0,"
+  LLNL technical report UCRL-SM-208111, May 2020.
 
-* A. C. Hindmarsh and R. Serban, "Example Programs for CVODES v5.2.0,"
-  LLNL technical report UCRL-SM-208115, Mar 2020.
+* A. C. Hindmarsh and R. Serban, "Example Programs for CVODES v5.3.0,"
+  LLNL technical report UCRL-SM-208115, May 2020.
 
 * R. Serban and A. C. Hindmarsh, "CVODES: the Sensitivity-Enabled ODE
   solver in SUNDIALS," Proceedings of IDETC/CIE 2005, Sept. 2005,
diff --git a/src/cvodes/cvodes_ls.c b/src/cvodes/cvodes_ls.c
index ebf38fbe40..a605574660 100644
--- a/src/cvodes/cvodes_ls.c
+++ b/src/cvodes/cvodes_ls.c
@@ -246,6 +246,7 @@ int CVodeSetLinearSolver(void *cvode_mem, SUNLinearSolver LS,
   cvls_mem->jtimesDQ = SUNTRUE;
   cvls_mem->jtsetup  = NULL;
   cvls_mem->jtimes   = cvLsDQJtimes;
+  cvls_mem->jt_f     = cv_mem->cv_f;
   cvls_mem->jt_data  = cv_mem;
 
   cvls_mem->user_linsys = SUNFALSE;
@@ -521,6 +522,7 @@ int CVodeSetJacTimes(void *cvode_mem, CVLsJacTimesSetupFn jtsetup,
     cvls_mem->jtimesDQ = SUNTRUE;
     cvls_mem->jtsetup  = NULL;
     cvls_mem->jtimes   = cvLsDQJtimes;
+    cvls_mem->jt_f     = cv_mem->cv_f;
     cvls_mem->jt_data  = cv_mem;
   }
 
@@ -528,6 +530,37 @@ int CVodeSetJacTimes(void *cvode_mem, CVLsJacTimesSetupFn jtsetup,
 }
 
 
+/* CVodeSetJacTimesRhsFn selects the ODE right-hand side function that the
+   internal finite difference Jacobian-vector product evaluates; a NULL
+   argument selects the integrator's own right-hand side (cv_mem->cv_f). */
+int CVodeSetJacTimesRhsFn(void *cvode_mem, CVRhsFn jtimesRhsFn)
+{
+  CVodeMem cv_mem;
+  CVLsMem  cvls_mem;
+  int      flag;
+
+  /* Look up the integrator and linear solver interface memory; if that
+     fails, pass the returned code back unchanged. */
+  flag = cvLs_AccessLMem(cvode_mem, "CVodeSetJacTimesRhsFn",
+                         &cv_mem, &cvls_mem);
+  if (flag != CVLS_SUCCESS) return(flag);
+
+  /* The function is only accepted while the internal finite difference
+     approximation (jtimesDQ) is active. */
+  if (!cvls_mem->jtimesDQ) {
+    cvProcessError(cv_mem, CVLS_ILL_INPUT, "CVSLS", "CVodeSetJacTimesRhsFn",
+                   "Internal finite-difference Jacobian-vector product is disabled.");
+    return(CVLS_ILL_INPUT);
+  }
+
+  /* Record the function to call, defaulting to the ODE right-hand side. */
+  cvls_mem->jt_f =
+    (jtimesRhsFn != NULL) ? jtimesRhsFn : cv_mem->cv_f;
+
+  return(CVLS_SUCCESS);
+}
+
+
 /* CVodeSetLinSysFn specifies the linear system setup function. */
 int CVodeSetLinSysFn(void *cvode_mem, CVLsLinSysFn linsys)
 {
@@ -1200,7 +1233,7 @@ int cvLsDQJtimes(N_Vector v, N_Vector Jv, realtype t,
     N_VLinearSum(sig, v, ONE, y, work);
 
     /* Set Jv = f(tn, y+sig*v) */
-    retval = cv_mem->cv_f(t, work, Jv, cv_mem->cv_user_data);
+    retval = cvls_mem->jt_f(t, work, Jv, cv_mem->cv_user_data);
     cvls_mem->nfeDQ++;
     if (retval == 0) break;
     if (retval < 0)  return(-1);
@@ -2111,6 +2144,26 @@ int CVodeSetJacTimesBS(void *cvode_mem, int which,
 }
 
 
+/* Backward-problem variant: obtains cvB_mem through cvLs_AccessLMemB and
+   forwards jtimesRhsFn to CVodeSetJacTimesRhsFn on cvB_mem->cv_mem. */
+int CVodeSetJacTimesRhsFnB(void *cvode_mem, int which, CVRhsFn jtimesRhsFn)
+{
+  CVodeMem   cv_mem;
+  CVadjMem   ca_mem;
+  CVodeBMem  cvB_mem;
+  CVLsMemB   cvlsB_mem;
+  int        flag;
+
+  /* Fetch the memory records; any failure code is returned as-is. */
+  flag = cvLs_AccessLMemB(cvode_mem, which, "CVodeSetJacTimesRhsFnB",
+                          &cv_mem, &ca_mem, &cvB_mem, &cvlsB_mem);
+  if (flag != CVLS_SUCCESS) return(flag);
+
+  /* Delegate to the forward "set" routine using the backward memory block. */
+  return(CVodeSetJacTimesRhsFn((void *) (cvB_mem->cv_mem), jtimesRhsFn));
+}
+
+
 int CVodeSetLinSysFnB(void *cvode_mem, int which, CVLsLinSysFnB linsysB)
 {
   CVodeMem  cv_mem;
diff --git a/src/cvodes/cvodes_ls_impl.h b/src/cvodes/cvodes_ls_impl.h
index 18c5366d40..0f1566b3a8 100644
--- a/src/cvodes/cvodes_ls_impl.h
+++ b/src/cvodes/cvodes_ls_impl.h
@@ -114,6 +114,7 @@ typedef struct CVLsMemRec {
   booleantype jtimesDQ;
   CVLsJacTimesSetupFn jtsetup;
   CVLsJacTimesVecFn jtimes;
+  CVRhsFn jt_f;
   void *jt_data;
 
   /* Linear system setup function
diff --git a/src/cvodes/fmod/fcvodes_mod.c b/src/cvodes/fmod/fcvodes_mod.c
index bd1db84dae..66a22f14d5 100644
--- a/src/cvodes/fmod/fcvodes_mod.c
+++ b/src/cvodes/fmod/fcvodes_mod.c
@@ -1117,6 +1117,20 @@ SWIGEXPORT void _wrap_FCVodeFree(void *farg1) {
 }
 
 
+/* Shim exported for the Fortran interface: casts both arguments to their C
+   types, calls CVodeSetJacTimesRhsFn, and returns its int result. */
+SWIGEXPORT int _wrap_FCVodeSetJacTimesRhsFn(void *farg1, CVRhsFn farg2) {
+  void *mem = (void *)(farg1);
+  CVRhsFn rhs_fn = (CVRhsFn)(farg2);
+  int status;
+
+  /* Call through and hand the status back unchanged. */
+  status = (int)CVodeSetJacTimesRhsFn(mem, rhs_fn);
+
+  return status;
+}
+
+
 SWIGEXPORT int _wrap_FCVodeQuadInit(void *farg1, CVQuadRhsFn farg2, N_Vector farg3) {
   int fresult ;
   void *arg1 = (void *) 0 ;
@@ -2701,6 +2715,22 @@ SWIGEXPORT int _wrap_FCVodeGetAdjCheckPointsInfo(void *farg1, SwigClassWrapper c
 }
 
 
+/* Shim exported for the Fortran interface: reads the index through the
+   pointer farg2, calls CVodeSetJacTimesRhsFnB, and returns its int
+   result. */
+SWIGEXPORT int _wrap_FCVodeSetJacTimesRhsFnB(void *farg1, int const *farg2, CVRhsFn farg3) {
+  void *mem = (void *)(farg1);
+  int which = (int)(*farg2);
+  CVRhsFn rhs_fn = (CVRhsFn)(farg3);
+  int status;
+
+  /* Call through and hand the status back unchanged. */
+  status = (int)CVodeSetJacTimesRhsFnB(mem, which, rhs_fn);
+
+  return status;
+}
+
+
 SWIGEXPORT int _wrap_FCVodeGetAdjDataPointHermite(void *farg1, int const *farg2, double *farg3, N_Vector farg4, N_Vector farg5) {
   int fresult ;
   void *arg1 = (void *) 0 ;
diff --git a/src/cvodes/fmod/fcvodes_mod.f90 b/src/cvodes/fmod/fcvodes_mod.f90
index b7dc043ef6..43a5e072c1 100644
--- a/src/cvodes/fmod/fcvodes_mod.f90
+++ b/src/cvodes/fmod/fcvodes_mod.f90
@@ -158,6 +158,7 @@ module fcvodes_mod
  end type
  public :: FCVodeGetReturnFlagName
  public :: FCVodeFree
+ public :: FCVodeSetJacTimesRhsFn
  public :: FCVodeQuadInit
  public :: FCVodeQuadReInit
  public :: FCVodeQuadSStolerances
@@ -280,6 +281,7 @@ module fcvodes_mod
   module procedure swigf_create_CVadjCheckPointRec
  end interface
  public :: FCVodeGetAdjCheckPointsInfo
+ public :: FCVodeSetJacTimesRhsFnB
  public :: FCVodeGetAdjDataPointHermite
  public :: FCVodeGetAdjDataPointPolynomial
  public :: FCVodeGetAdjCurrentCheckPoint
@@ -871,6 +873,15 @@ subroutine swigc_FCVodeFree(farg1) &
 type(C_PTR), value :: farg1
 end subroutine
 
+! Binding to the C symbol _wrap_FCVodeSetJacTimesRhsFn; the result is a C int.
+function swigc_FCVodeSetJacTimesRhsFn(farg1, farg2) &
+bind(C, name="_wrap_FCVodeSetJacTimesRhsFn") result(fresult)
+use, intrinsic :: ISO_C_BINDING
+integer(C_INT) :: fresult
+type(C_PTR), value :: farg1
+type(C_FUNPTR), value :: farg2
+end function swigc_FCVodeSetJacTimesRhsFn
+
 function swigc_FCVodeQuadInit(farg1, farg2, farg3) &
 bind(C, name="_wrap_FCVodeQuadInit") &
 result(fresult)
@@ -1884,6 +1895,16 @@ function swigc_FCVodeGetAdjCheckPointsInfo(farg1, farg2) &
 integer(C_INT) :: fresult
 end function
 
+! Binding to the C symbol _wrap_FCVodeSetJacTimesRhsFnB; the result is a C int.
+function swigc_FCVodeSetJacTimesRhsFnB(farg1, farg2, farg3) &
+bind(C, name="_wrap_FCVodeSetJacTimesRhsFnB") result(fresult)
+use, intrinsic :: ISO_C_BINDING
+integer(C_INT) :: fresult
+type(C_PTR), value :: farg1
+integer(C_INT), intent(in) :: farg2
+type(C_FUNPTR), value :: farg3
+end function swigc_FCVodeSetJacTimesRhsFnB
+
 function swigc_FCVodeGetAdjDataPointHermite(farg1, farg2, farg3, farg4, farg5) &
 bind(C, name="_wrap_FCVodeGetAdjDataPointHermite") &
 result(fresult)
@@ -3331,6 +3352,22 @@ subroutine FCVodeFree(cvode_mem)
 call swigc_FCVodeFree(farg1)
 end subroutine
 
+function FCVodeSetJacTimesRhsFn(cvode_mem, jtimesrhsfn) result(swig_result)
+! Wrapper that passes cvode_mem and the function pointer jtimesrhsfn to
+! swigc_FCVodeSetJacTimesRhsFn and returns its integer(C_INT) result.
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR) :: cvode_mem
+type(C_FUNPTR), intent(in), value :: jtimesrhsfn
+integer(C_INT) :: swig_result
+type(C_PTR) :: mem_arg
+type(C_FUNPTR) :: fn_arg
+
+! Stage the dummy arguments in locals before making the call.
+mem_arg = cvode_mem
+fn_arg = jtimesrhsfn
+swig_result = swigc_FCVodeSetJacTimesRhsFn(mem_arg, fn_arg)
+end function FCVodeSetJacTimesRhsFn
+
 function FCVodeQuadInit(cvode_mem, fq, yq0) &
 result(swig_result)
 use, intrinsic :: ISO_C_BINDING
@@ -5182,6 +5219,25 @@ function FCVodeGetAdjCheckPointsInfo(cvode_mem, ckpnt) &
 swig_result = fresult
 end function
 
+function FCVodeSetJacTimesRhsFnB(cvode_mem, which, jtimesrhsfn) result(swig_result)
+! Wrapper that passes cvode_mem, the C int which and the function pointer
+! jtimesrhsfn to swigc_FCVodeSetJacTimesRhsFnB and returns its result.
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR) :: cvode_mem
+integer(C_INT), intent(in) :: which
+type(C_FUNPTR), intent(in), value :: jtimesrhsfn
+integer(C_INT) :: swig_result
+type(C_PTR) :: mem_arg
+integer(C_INT) :: which_arg
+type(C_FUNPTR) :: fn_arg
+
+! Stage the dummy arguments in locals before making the call.
+mem_arg = cvode_mem
+which_arg = which
+fn_arg = jtimesrhsfn
+swig_result = swigc_FCVodeSetJacTimesRhsFnB(mem_arg, which_arg, fn_arg)
+end function FCVodeSetJacTimesRhsFnB
+
 function FCVodeGetAdjDataPointHermite(cvode_mem, which, t, y, yd) &
 result(swig_result)
 use, intrinsic :: ISO_C_BINDING
diff --git a/src/ida/CMakeLists.txt b/src/ida/CMakeLists.txt
index 329d4de7cb..58613edd7b 100644
--- a/src/ida/CMakeLists.txt
+++ b/src/ida/CMakeLists.txt
@@ -93,7 +93,7 @@ add_prefix(${sundials_SOURCE_DIR}/include/ida/ ida_HEADERS)
 
 # Add source directories to include directories for access to
 # implementation only header files.
-include_directories(.)
+include_directories(. ${sundials_SOURCE_DIR}/src/sundials)
 
 # Define C preprocessor flag -DBUILD_SUNDIALS_LIBRARY
 add_definitions(-DBUILD_SUNDIALS_LIBRARY)
diff --git a/src/ida/README.md b/src/ida/README.md
index 6f7719a68d..c1db3fe836 100644
--- a/src/ida/README.md
+++ b/src/ida/README.md
@@ -1,5 +1,5 @@
 # IDA
-### Version 5.2.0 (Mar 2020)
+### Version 5.3.0 (May 2020)
 
 **Alan C. Hindmarsh and Radu Serban  
   Center for Applied Scientific Computing, LLNL**
@@ -44,11 +44,11 @@ the "SUNDIALS Release History" appendix of the IDA User Guide.
 
 ## References
 
-* A. C. Hindmarsh, R. Serban, and A. Collier, "User Documentation for IDA v5.2.0,"
-  LLNL technical report UCRL-SM-208112, Mar 2020.
+* A. C. Hindmarsh, R. Serban, and A. Collier, "User Documentation for IDA v5.3.0,"
+  LLNL technical report UCRL-SM-208112, May 2020.
 
-* A. C. Hindmarsh, R. Serban, and A. Collier, "Example Programs for IDA v5.2.0,"
-  LLNL technical report UCRL-SM-208113, Mar 2020.
+* A. C. Hindmarsh, R. Serban, and A. Collier, "Example Programs for IDA v5.3.0,"
+  LLNL technical report UCRL-SM-208113, May 2020.
 
 * A. C. Hindmarsh, P. N. Brown, K. E. Grant, S. L. Lee, R. Serban,
   D. E. Shumaker, and C. S. Woodward, "SUNDIALS, Suite of Nonlinear and
diff --git a/src/ida/fmod/fida_mod.c b/src/ida/fmod/fida_mod.c
index 61391f5122..13714cad4d 100644
--- a/src/ida/fmod/fida_mod.c
+++ b/src/ida/fmod/fida_mod.c
@@ -1178,6 +1178,20 @@ SWIGEXPORT void _wrap_FIDAFree(void *farg1) {
 }
 
 
+/* Shim exported for the Fortran interface: casts both arguments to their C
+   types, calls IDASetJacTimesResFn, and returns its int result. */
+SWIGEXPORT int _wrap_FIDASetJacTimesResFn(void *farg1, IDAResFn farg2) {
+  void *mem = (void *)(farg1);
+  IDAResFn res_fn = (IDAResFn)(farg2);
+  int status;
+
+  /* Call through and hand the status back unchanged. */
+  status = (int)IDASetJacTimesResFn(mem, res_fn);
+
+  return status;
+}
+
+
 SWIGEXPORT int _wrap_FIDABBDPrecInit(void *farg1, int64_t const *farg2, int64_t const *farg3, int64_t const *farg4, int64_t const *farg5, int64_t const *farg6, double const *farg7, IDABBDLocalFn farg8, IDABBDCommFn farg9) {
   int fresult ;
   void *arg1 = (void *) 0 ;
diff --git a/src/ida/fmod/fida_mod.f90 b/src/ida/fmod/fida_mod.f90
index 585ac92f79..c966a03532 100644
--- a/src/ida/fmod/fida_mod.f90
+++ b/src/ida/fmod/fida_mod.f90
@@ -138,6 +138,7 @@ module fida_mod
  end type
  public :: FIDAGetReturnFlagName
  public :: FIDAFree
+ public :: FIDASetJacTimesResFn
  public :: FIDABBDPrecInit
  public :: FIDABBDPrecReInit
  public :: FIDABBDPrecGetWorkSpace
@@ -778,6 +779,15 @@ subroutine swigc_FIDAFree(farg1) &
 type(C_PTR), value :: farg1
 end subroutine
 
+! Binding to the C symbol _wrap_FIDASetJacTimesResFn; the result is a C int.
+function swigc_FIDASetJacTimesResFn(farg1, farg2) &
+bind(C, name="_wrap_FIDASetJacTimesResFn") result(fresult)
+use, intrinsic :: ISO_C_BINDING
+integer(C_INT) :: fresult
+type(C_PTR), value :: farg1
+type(C_FUNPTR), value :: farg2
+end function swigc_FIDASetJacTimesResFn
+
 function swigc_FIDABBDPrecInit(farg1, farg2, farg3, farg4, farg5, farg6, farg7, farg8, farg9) &
 bind(C, name="_wrap_FIDABBDPrecInit") &
 result(fresult)
@@ -2105,6 +2115,22 @@ subroutine FIDAFree(ida_mem)
 call swigc_FIDAFree(farg1)
 end subroutine
 
+function FIDASetJacTimesResFn(ida_mem, jtimesresfn) result(swig_result)
+! Wrapper that passes ida_mem and the function pointer jtimesresfn to
+! swigc_FIDASetJacTimesResFn and returns its integer(C_INT) result.
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR) :: ida_mem
+type(C_FUNPTR), intent(in), value :: jtimesresfn
+integer(C_INT) :: swig_result
+type(C_PTR) :: mem_arg
+type(C_FUNPTR) :: fn_arg
+
+! Stage the dummy arguments in locals before making the call.
+mem_arg = ida_mem
+fn_arg = jtimesresfn
+swig_result = swigc_FIDASetJacTimesResFn(mem_arg, fn_arg)
+end function FIDASetJacTimesResFn
+
 function FIDABBDPrecInit(ida_mem, nlocal, mudq, mldq, mukeep, mlkeep, dq_rel_yy, gres, gcomm) &
 result(swig_result)
 use, intrinsic :: ISO_C_BINDING
diff --git a/src/ida/ida_ls.c b/src/ida/ida_ls.c
index 6cede7d2bf..c15f2748b9 100644
--- a/src/ida/ida_ls.c
+++ b/src/ida/ida_ls.c
@@ -167,6 +167,7 @@ int IDASetLinearSolver(void *ida_mem, SUNLinearSolver LS, SUNMatrix A)
   idals_mem->jtimesDQ = SUNTRUE;
   idals_mem->jtsetup  = NULL;
   idals_mem->jtimes   = idaLsDQJtimes;
+  idals_mem->jt_res   = IDA_mem->ida_res;
   idals_mem->jt_data  = IDA_mem;
 
   /* Set defaults for preconditioner-related fields */
@@ -441,6 +442,7 @@ int IDASetJacTimes(void *ida_mem, IDALsJacTimesSetupFn jtsetup,
     idals_mem->jtimesDQ = SUNTRUE;
     idals_mem->jtsetup  = NULL;
     idals_mem->jtimes   = idaLsDQJtimes;
+    idals_mem->jt_res   = IDA_mem->ida_res;
     idals_mem->jt_data  = IDA_mem;
   }
 
@@ -448,6 +450,37 @@ int IDASetJacTimes(void *ida_mem, IDALsJacTimesSetupFn jtsetup,
 }
 
 
+/* IDASetJacTimesResFn selects the DAE residual function that the internal
+   finite difference Jacobian-vector product evaluates; a NULL argument
+   selects the integrator's own residual function (IDA_mem->ida_res). */
+int IDASetJacTimesResFn(void *ida_mem, IDAResFn jtimesResFn)
+{
+  IDAMem   IDA_mem;
+  IDALsMem idals_mem;
+  int      flag;
+
+  /* Look up the integrator and linear solver interface memory; if that
+     fails, pass the returned code back unchanged. */
+  flag = idaLs_AccessLMem(ida_mem, "IDASetJacTimesResFn",
+                          &IDA_mem, &idals_mem);
+  if (flag != IDALS_SUCCESS) return(flag);
+
+  /* The function is only accepted while the internal finite difference
+     approximation (jtimesDQ) is active. */
+  if (!idals_mem->jtimesDQ) {
+    IDAProcessError(IDA_mem, IDALS_ILL_INPUT, "IDALS", "IDASetJacTimesResFn",
+                    "Internal finite-difference Jacobian-vector product is disabled.");
+    return(IDALS_ILL_INPUT);
+  }
+
+  /* Record the function to call, defaulting to the DAE residual. */
+  idals_mem->jt_res =
+    (jtimesResFn != NULL) ? jtimesResFn : IDA_mem->ida_res;
+
+  return(IDALS_SUCCESS);
+}
+
+
 /* IDAGetLinWorkSpace returns the length of workspace allocated
    for the IDALS linear solver interface */
 int IDAGetLinWorkSpace(void *ida_mem, long int *lenrwLS,
@@ -1107,7 +1140,7 @@ int idaLsDQJtimes(realtype tt, N_Vector yy, N_Vector yp, N_Vector rr,
     N_VLinearSum(c_j*sig, v, ONE, yp, yp_tmp);
 
     /* Call res for Jv = F(t, y_tmp, yp_tmp), and return if it failed. */
-    retval = IDA_mem->ida_res(tt, y_tmp, yp_tmp, Jv, IDA_mem->ida_user_data);
+    retval = idals_mem->jt_res(tt, y_tmp, yp_tmp, Jv, IDA_mem->ida_user_data);
     idals_mem->nreDQ++;
     if (retval == 0) break;
     if (retval < 0)  return(-1);
diff --git a/src/ida/ida_ls_impl.h b/src/ida/ida_ls_impl.h
index d2a042018f..1eada89c59 100644
--- a/src/ida/ida_ls_impl.h
+++ b/src/ida/ida_ls_impl.h
@@ -100,6 +100,7 @@ typedef struct IDALsMemRec {
   booleantype jtimesDQ;
   IDALsJacTimesSetupFn jtsetup;
   IDALsJacTimesVecFn jtimes;
+  IDAResFn jt_res;
   void *jt_data;
 
 } *IDALsMem;
diff --git a/src/idas/CMakeLists.txt b/src/idas/CMakeLists.txt
index f76380737a..bdec77208c 100644
--- a/src/idas/CMakeLists.txt
+++ b/src/idas/CMakeLists.txt
@@ -97,7 +97,7 @@ add_prefix(${sundials_SOURCE_DIR}/include/idas/ idas_HEADERS)
 
 # Add source directories to include directories for access to
 # implementation only header files.
-include_directories(.)
+include_directories(. ${sundials_SOURCE_DIR}/src/sundials)
 
 # Define C preprocessor flag -DBUILD_SUNDIALS_LIBRARY
 add_definitions(-DBUILD_SUNDIALS_LIBRARY)
diff --git a/src/idas/README.md b/src/idas/README.md
index d15aebde86..f1db3614f2 100644
--- a/src/idas/README.md
+++ b/src/idas/README.md
@@ -1,5 +1,5 @@
 # IDAS
-### Version 4.2.0 (Mar 2020)
+### Version 4.3.0 (May 2020)
 
 **Radu Serban  
   Center for Applied Scientific Computing, LLNL**
@@ -40,11 +40,11 @@ the "SUNDIALS Release History" appendix of the IDAS User Guide.
 
 ## References
 
-* R. Serban, C. Petra,and A. C. Hindmarsh,  "User Documentation for IDAS v4.2.0,"
-  LLNL technical report UCRL-SM-234051, Mar 2020.
+* R. Serban, C. Petra, and A. C. Hindmarsh, "User Documentation for IDAS v4.3.0,"
+  LLNL technical report UCRL-SM-234051, May 2020.
 
-* R. Serban and A.C. Hindmarsh, "Example Programs for IDAS v4.2.0,"
-  LLNL technical report LLNL-TR-437091, Mar 2020.
+* R. Serban and A.C. Hindmarsh, "Example Programs for IDAS v4.3.0,"
+  LLNL technical report LLNL-TR-437091, May 2020.
 
 * A. C. Hindmarsh, P. N. Brown, K. E. Grant, S. L. Lee, R. Serban,
   D. E. Shumaker, and C. S. Woodward, "SUNDIALS, Suite of Nonlinear and
diff --git a/src/idas/fmod/fidas_mod.c b/src/idas/fmod/fidas_mod.c
index a41d14a7c0..51831bd888 100644
--- a/src/idas/fmod/fidas_mod.c
+++ b/src/idas/fmod/fidas_mod.c
@@ -1313,6 +1313,20 @@ SWIGEXPORT void _wrap_FIDAFree(void *farg1) {
 }
 
 
+/* Shim exported for the Fortran interface: casts both arguments to their C
+   types, calls IDASetJacTimesResFn, and returns its int result. */
+SWIGEXPORT int _wrap_FIDASetJacTimesResFn(void *farg1, IDAResFn farg2) {
+  void *mem = (void *)(farg1);
+  IDAResFn res_fn = (IDAResFn)(farg2);
+  int status;
+
+  /* Call through and hand the status back unchanged. */
+  status = (int)IDASetJacTimesResFn(mem, res_fn);
+
+  return status;
+}
+
+
 SWIGEXPORT int _wrap_FIDAQuadInit(void *farg1, IDAQuadRhsFn farg2, N_Vector farg3) {
   int fresult ;
   void *arg1 = (void *) 0 ;
@@ -2927,6 +2941,22 @@ SWIGEXPORT int _wrap_FIDAGetAdjCheckPointsInfo(void *farg1, SwigClassWrapper con
 }
 
 
+/* Shim exported for the Fortran interface: reads the index through the
+   pointer farg2, calls IDASetJacTimesResFnB, and returns its int
+   result. */
+SWIGEXPORT int _wrap_FIDASetJacTimesResFnB(void *farg1, int const *farg2, IDAResFn farg3) {
+  void *mem = (void *)(farg1);
+  int which = (int)(*farg2);
+  IDAResFn res_fn = (IDAResFn)(farg3);
+  int status;
+
+  /* Call through and hand the status back unchanged. */
+  status = (int)IDASetJacTimesResFnB(mem, which, res_fn);
+
+  return status;
+}
+
+
 SWIGEXPORT int _wrap_FIDAGetAdjDataPointHermite(void *farg1, int const *farg2, double *farg3, N_Vector farg4, N_Vector farg5) {
   int fresult ;
   void *arg1 = (void *) 0 ;
diff --git a/src/idas/fmod/fidas_mod.f90 b/src/idas/fmod/fidas_mod.f90
index 4a8da21372..d74a1d1de7 100644
--- a/src/idas/fmod/fidas_mod.f90
+++ b/src/idas/fmod/fidas_mod.f90
@@ -167,6 +167,7 @@ module fidas_mod
  end type
  public :: FIDAGetReturnFlagName
  public :: FIDAFree
+ public :: FIDASetJacTimesResFn
  public :: FIDAQuadInit
  public :: FIDAQuadReInit
  public :: FIDAQuadSStolerances
@@ -289,6 +290,7 @@ module fidas_mod
   module procedure swigf_create_IDAadjCheckPointRec
  end interface
  public :: FIDAGetAdjCheckPointsInfo
+ public :: FIDASetJacTimesResFnB
  public :: FIDAGetAdjDataPointHermite
  public :: FIDAGetAdjDataPointPolynomial
  public :: FIDAGetAdjCurrentCheckPoint
@@ -984,6 +986,15 @@ subroutine swigc_FIDAFree(farg1) &
 type(C_PTR), value :: farg1
 end subroutine
 
+! Binding to the C symbol _wrap_FIDASetJacTimesResFn; the result is a C int.
+function swigc_FIDASetJacTimesResFn(farg1, farg2) &
+bind(C, name="_wrap_FIDASetJacTimesResFn") result(fresult)
+use, intrinsic :: ISO_C_BINDING
+integer(C_INT) :: fresult
+type(C_PTR), value :: farg1
+type(C_FUNPTR), value :: farg2
+end function swigc_FIDASetJacTimesResFn
+
 function swigc_FIDAQuadInit(farg1, farg2, farg3) &
 bind(C, name="_wrap_FIDAQuadInit") &
 result(fresult)
@@ -2012,6 +2023,16 @@ function swigc_FIDAGetAdjCheckPointsInfo(farg1, farg2) &
 integer(C_INT) :: fresult
 end function
 
+! Binding to the C symbol _wrap_FIDASetJacTimesResFnB; the result is a C int.
+function swigc_FIDASetJacTimesResFnB(farg1, farg2, farg3) &
+bind(C, name="_wrap_FIDASetJacTimesResFnB") result(fresult)
+use, intrinsic :: ISO_C_BINDING
+integer(C_INT) :: fresult
+type(C_PTR), value :: farg1
+integer(C_INT), intent(in) :: farg2
+type(C_FUNPTR), value :: farg3
+end function swigc_FIDASetJacTimesResFnB
+
 function swigc_FIDAGetAdjDataPointHermite(farg1, farg2, farg3, farg4, farg5) &
 bind(C, name="_wrap_FIDAGetAdjDataPointHermite") &
 result(fresult)
@@ -3576,6 +3597,22 @@ subroutine FIDAFree(ida_mem)
 call swigc_FIDAFree(farg1)
 end subroutine
 
+function FIDASetJacTimesResFn(ida_mem, jtimesresfn) result(swig_result)
+! Wrapper that passes ida_mem and the function pointer jtimesresfn to
+! swigc_FIDASetJacTimesResFn and returns its integer(C_INT) result.
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR) :: ida_mem
+type(C_FUNPTR), intent(in), value :: jtimesresfn
+integer(C_INT) :: swig_result
+type(C_PTR) :: mem_arg
+type(C_FUNPTR) :: fn_arg
+
+! Stage the dummy arguments in locals before making the call.
+mem_arg = ida_mem
+fn_arg = jtimesresfn
+swig_result = swigc_FIDASetJacTimesResFn(mem_arg, fn_arg)
+end function FIDASetJacTimesResFn
+
 function FIDAQuadInit(ida_mem, rhsq, yq0) &
 result(swig_result)
 use, intrinsic :: ISO_C_BINDING
@@ -5472,6 +5509,25 @@ function FIDAGetAdjCheckPointsInfo(ida_mem, ckpnt) &
 swig_result = fresult
 end function
 
+function FIDASetJacTimesResFnB(ida_mem, which, jtimesresfn) result(swig_result)
+! Wrapper that passes ida_mem, the C int which and the function pointer
+! jtimesresfn to swigc_FIDASetJacTimesResFnB and returns its result.
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR) :: ida_mem
+integer(C_INT), intent(in) :: which
+type(C_FUNPTR), intent(in), value :: jtimesresfn
+integer(C_INT) :: swig_result
+type(C_PTR) :: mem_arg
+integer(C_INT) :: which_arg
+type(C_FUNPTR) :: fn_arg
+
+! Stage the dummy arguments in locals before making the call.
+mem_arg = ida_mem
+which_arg = which
+fn_arg = jtimesresfn
+swig_result = swigc_FIDASetJacTimesResFnB(mem_arg, which_arg, fn_arg)
+end function FIDASetJacTimesResFnB
+
 function FIDAGetAdjDataPointHermite(ida_mem, which, t, yy, yd) &
 result(swig_result)
 use, intrinsic :: ISO_C_BINDING
diff --git a/src/idas/idas_ls.c b/src/idas/idas_ls.c
index 16e5d17726..5f58927d6c 100644
--- a/src/idas/idas_ls.c
+++ b/src/idas/idas_ls.c
@@ -218,6 +218,7 @@ int IDASetLinearSolver(void *ida_mem, SUNLinearSolver LS, SUNMatrix A)
   idals_mem->jtimesDQ = SUNTRUE;
   idals_mem->jtsetup  = NULL;
   idals_mem->jtimes   = idaLsDQJtimes;
+  idals_mem->jt_res   = IDA_mem->ida_res;
   idals_mem->jt_data  = IDA_mem;
 
   /* Set defaults for preconditioner-related fields */
@@ -492,6 +493,7 @@ int IDASetJacTimes(void *ida_mem, IDALsJacTimesSetupFn jtsetup,
     idals_mem->jtimesDQ = SUNTRUE;
     idals_mem->jtsetup  = NULL;
     idals_mem->jtimes   = idaLsDQJtimes;
+    idals_mem->jt_res   = IDA_mem->ida_res;
     idals_mem->jt_data  = IDA_mem;
   }
 
@@ -499,6 +501,37 @@ int IDASetJacTimes(void *ida_mem, IDALsJacTimesSetupFn jtsetup,
 }
 
 
+/* IDASetJacTimesResFn stores in idals_mem->jt_res the DAE residual function
+   intended for the internal finite difference Jacobian-vector product; a
+   NULL argument stores the integrator's own residual (IDA_mem->ida_res). */
+int IDASetJacTimesResFn(void *ida_mem, IDAResFn jtimesResFn)
+{
+  IDAMem   IDA_mem;
+  IDALsMem idals_mem;
+  int      flag;
+
+  /* Look up the integrator and linear solver interface memory; if that
+     fails, pass the returned code back unchanged. */
+  flag = idaLs_AccessLMem(ida_mem, "IDASetJacTimesResFn",
+                          &IDA_mem, &idals_mem);
+  if (flag != IDALS_SUCCESS) return(flag);
+
+  /* The function is only accepted while the internal finite difference
+     approximation (jtimesDQ) is active. */
+  if (!idals_mem->jtimesDQ) {
+    IDAProcessError(IDA_mem, IDALS_ILL_INPUT, "IDASLS", "IDASetJacTimesResFn",
+                    "Internal finite-difference Jacobian-vector product is disabled.");
+    return(IDALS_ILL_INPUT);
+  }
+
+  /* Record the function to call, defaulting to the DAE residual. */
+  idals_mem->jt_res =
+    (jtimesResFn != NULL) ? jtimesResFn : IDA_mem->ida_res;
+
+  return(IDALS_SUCCESS);
+}
+
+
 /* IDAGetLinWorkSpace returns the length of workspace allocated
    for the IDALS linear solver interface */
 int IDAGetLinWorkSpace(void *ida_mem, long int *lenrwLS,
@@ -1158,7 +1191,7 @@ int idaLsDQJtimes(realtype tt, N_Vector yy, N_Vector yp, N_Vector rr,
     N_VLinearSum(c_j*sig, v, ONE, yp, yp_tmp);
 
     /* Call res for Jv = F(t, y_tmp, yp_tmp), and return if it failed. */
-    retval = IDA_mem->ida_res(tt, y_tmp, yp_tmp, Jv, IDA_mem->ida_user_data);
+    retval = idals_mem->jt_res(tt, y_tmp, yp_tmp, Jv, IDA_mem->ida_user_data);
     idals_mem->nreDQ++;
     if (retval == 0) break;
     if (retval < 0)  return(-1);
@@ -1968,6 +2001,26 @@ int IDASetJacTimesBS(void *ida_mem, int which,
 }
 
 
+int IDASetJacTimesResFnB(void *ida_mem, int which, IDAResFn jtimesResFn)
+{
+  IDAadjMem  IDAADJ_mem;
+  IDAMem     IDA_mem;
+  IDABMem    IDAB_mem;
+  IDALsMemB  idalsB_mem;
+  int        flag;
+
+  /* look up the memory structures for backward problem `which`; any
+     error code from the accessor is returned unchanged */
+  flag = idaLs_AccessLMemB(ida_mem, which, "IDASetJacTimesResFnB", &IDA_mem,
+                           &IDAADJ_mem, &IDAB_mem, &idalsB_mem);
+  if (flag != IDALS_SUCCESS) return(flag);
+
+  /* delegate to the non-B setter, applied to the IDA memory block held
+     by the backward problem's IDAB_mem structure */
+  return(IDASetJacTimesResFn((void *) IDAB_mem->IDA_mem, jtimesResFn));
+}
+
+
 /*-----------------------------------------------------------------
   IDASLS Private functions for backwards problems
   -----------------------------------------------------------------*/
diff --git a/src/idas/idas_ls_impl.h b/src/idas/idas_ls_impl.h
index 9c742493a1..f0af48ae4c 100644
--- a/src/idas/idas_ls_impl.h
+++ b/src/idas/idas_ls_impl.h
@@ -100,6 +100,7 @@ typedef struct IDALsMemRec {
   booleantype jtimesDQ;
   IDALsJacTimesSetupFn jtsetup;
   IDALsJacTimesVecFn jtimes;
+  IDAResFn jt_res;
   void *jt_data;
 
 } *IDALsMem;
diff --git a/src/kinsol/README.md b/src/kinsol/README.md
index 8836e369d6..0e33b5a019 100644
--- a/src/kinsol/README.md
+++ b/src/kinsol/README.md
@@ -1,5 +1,5 @@
 # KINSOL
-### Version 5.2.0 (Mar 2020)
+### Version 5.3.0 (May 2020)
 
 **Aaron Collier, Alan C. Hindmarsh, Radu Serban, and Carol S. Woodward  
   Center for Applied Scientific Computing, LLNL**
@@ -45,11 +45,11 @@ the "SUNDIALS Release History" appendix of the KINSOL User Guide.
 ## References
 
 * A. M. Collier, A. C. Hindmarsh, R. Serban, and C. S. Woodward,
-  "User Documentation for KINSOL v5.2.0," LLNL technical report
-  UCRL-SM-208116, Mar 2020.
+  "User Documentation for KINSOL v5.3.0," LLNL technical report
+  UCRL-SM-208116, May 2020.
 
-* A. M. Collier and R. Serban, "Example Programs for KINSOL v5.2.0,"
-  LLNL technical report UCRL-SM-208114, Mar 2020.
+* A. M. Collier and R. Serban, "Example Programs for KINSOL v5.3.0,"
+  LLNL technical report UCRL-SM-208114, May 2020.
 
 * A. C. Hindmarsh, P. N. Brown, K. E. Grant, S. L. Lee, R. Serban,
   D. E. Shumaker, and C. S. Woodward, "SUNDIALS, Suite of Nonlinear and
diff --git a/src/kinsol/fmod/fkinsol_mod.c b/src/kinsol/fmod/fkinsol_mod.c
index 79ee37d8b2..4ca4167133 100644
--- a/src/kinsol/fmod/fkinsol_mod.c
+++ b/src/kinsol/fmod/fkinsol_mod.c
@@ -774,6 +774,20 @@ SWIGEXPORT void _wrap_FKINFree(void *farg1) {
 }
 
 
+SWIGEXPORT int _wrap_FKINSetJacTimesVecSysFn(void *farg1, KINSysFn farg2) { /* C wrapper around KINSetJacTimesVecSysFn; NOTE(review): looks SWIG-generated - confirm before hand-editing */
+  int fresult ; /* status handed back to the caller */
+  void *arg1 = (void *) 0 ; /* first argument forwarded to KINSetJacTimesVecSysFn */
+  KINSysFn arg2 = (KINSysFn) 0 ; /* second argument forwarded to KINSetJacTimesVecSysFn */
+  int result;
+  
+  arg1 = (void *)(farg1);
+  arg2 = (KINSysFn)(farg2);
+  result = (int)KINSetJacTimesVecSysFn(arg1,arg2); /* both arguments are passed through unchanged */
+  fresult = (int)(result);
+  return fresult;
+}
+
+
 SWIGEXPORT int _wrap_FKINBBDPrecInit(void *farg1, int64_t const *farg2, int64_t const *farg3, int64_t const *farg4, int64_t const *farg5, int64_t const *farg6, double const *farg7, KINBBDLocalFn farg8, KINBBDCommFn farg9) {
   int fresult ;
   void *arg1 = (void *) 0 ;
diff --git a/src/kinsol/fmod/fkinsol_mod.f90 b/src/kinsol/fmod/fkinsol_mod.f90
index ff44215952..de18016eb1 100644
--- a/src/kinsol/fmod/fkinsol_mod.f90
+++ b/src/kinsol/fmod/fkinsol_mod.f90
@@ -104,6 +104,7 @@ module fkinsol_mod
  end type
  public :: FKINGetReturnFlagName
  public :: FKINFree
+ public :: FKINSetJacTimesVecSysFn
  integer(C_INT), parameter, public :: KINBBDPRE_SUCCESS = 0_C_INT
  integer(C_INT), parameter, public :: KINBBDPRE_PDATA_NULL = -11_C_INT
  integer(C_INT), parameter, public :: KINBBDPRE_FUNC_UNRECVR = -12_C_INT
@@ -487,6 +488,15 @@ subroutine swigc_FKINFree(farg1) &
 type(C_PTR), value :: farg1
 end subroutine
 
+function swigc_FKINSetJacTimesVecSysFn(farg1, farg2) &
+bind(C, name="_wrap_FKINSetJacTimesVecSysFn") &
+result(fresult)
+use, intrinsic :: ISO_C_BINDING ! interface body binding the C symbol _wrap_FKINSetJacTimesVecSysFn
+type(C_PTR), value :: farg1 ! C pointer, passed by value
+type(C_FUNPTR), value :: farg2 ! C function pointer, passed by value
+integer(C_INT) :: fresult ! C int returned by the bound function
+end function
+
 function swigc_FKINBBDPrecInit(farg1, farg2, farg3, farg4, farg5, farg6, farg7, farg8, farg9) &
 bind(C, name="_wrap_FKINBBDPrecInit") &
 result(fresult)
@@ -1290,6 +1300,22 @@ subroutine FKINFree(kinmem)
 call swigc_FKINFree(farg1)
 end subroutine
 
+function FKINSetJacTimesVecSysFn(kinmem, jtimessysfn) &
+result(swig_result)
+use, intrinsic :: ISO_C_BINDING ! public Fortran entry point; forwards to swigc_FKINSetJacTimesVecSysFn
+integer(C_INT) :: swig_result ! C int status returned by the C-bound function
+type(C_PTR) :: kinmem ! C pointer forwarded as the first argument
+type(C_FUNPTR), intent(in), value :: jtimessysfn ! C function pointer forwarded as the second argument
+integer(C_INT) :: fresult ! raw result of the C-bound call
+type(C_PTR) :: farg1 ! local copy of kinmem handed to the C binding
+type(C_FUNPTR) :: farg2 ! local copy of jtimessysfn handed to the C binding
+
+farg1 = kinmem
+farg2 = jtimessysfn
+fresult = swigc_FKINSetJacTimesVecSysFn(farg1, farg2)
+swig_result = fresult
+end function
+
 function FKINBBDPrecInit(kinmem, nlocal, mudq, mldq, mukeep, mlkeep, dq_rel_uu, gloc, gcomm) &
 result(swig_result)
 use, intrinsic :: ISO_C_BINDING
diff --git a/src/kinsol/kinsol_ls.c b/src/kinsol/kinsol_ls.c
index cd8bd2b5f8..b74890a396 100644
--- a/src/kinsol/kinsol_ls.c
+++ b/src/kinsol/kinsol_ls.c
@@ -153,6 +153,7 @@ int KINSetLinearSolver(void *kinmem, SUNLinearSolver LS, SUNMatrix A)
   }
   kinls_mem->jtimesDQ = SUNTRUE;
   kinls_mem->jtimes   = kinLsDQJtimes;
+  kinls_mem->jt_func  = kin_mem->kin_func;
   kinls_mem->jt_data  = kin_mem;
 
   /* Set defaults for preconditioner-related fields */
@@ -317,12 +318,44 @@ int KINSetJacTimesVecFn(void *kinmem, KINLsJacTimesVecFn jtv)
   } else {
     kinls_mem->jtimesDQ = SUNTRUE;
     kinls_mem->jtimes   = kinLsDQJtimes;
+    kinls_mem->jt_func  = kin_mem->kin_func;
     kinls_mem->jt_data  = kin_mem;
   }
 
   return(KINLS_SUCCESS);
 }
 
+
+/* KINSetJacTimesVecSysFn sets the system function used by the internal finite-difference
+   Jacobian-vector product (NULL restores kin_func); returns KINLS_SUCCESS or an error flag */
+int KINSetJacTimesVecSysFn(void *kinmem, KINSysFn jtimesSysFn)
+{
+  int      retval;
+  KINMem   kin_mem = NULL;
+  KINLsMem kinls_mem = NULL;
+
+  /* access KINLsMem structure through the caller's handle (kinmem); kin_mem is an output */
+  retval = kinLs_AccessLMem(kinmem, "KINSetJacTimesVecSysFn",
+                            &kin_mem, &kinls_mem);
+  if (retval != KIN_SUCCESS) return(retval);
+
+  /* a replacement is only accepted while the internal DQ approximation is in use */
+  if (!(kinls_mem->jtimesDQ)) {
+    KINProcessError(kin_mem, KINLS_ILL_INPUT, "KINLS", "KINSetJacTimesVecSysFn",
+                    "Internal finite-difference Jacobian-vector product is disabled.");
+    return(KINLS_ILL_INPUT);
+  }
+
+  /* store function pointers for system function (NULL implies use kin_func) */
+  if (jtimesSysFn != NULL)
+    kinls_mem->jt_func = jtimesSysFn;
+  else
+    kinls_mem->jt_func = kin_mem->kin_func;
+
+  return(KINLS_SUCCESS);
+}
+
+
 /*------------------------------------------------------------------
   KINGetLinWorkSpace returns the integer and real workspace size
   ------------------------------------------------------------------*/
@@ -923,8 +956,8 @@ int kinLsDQJtimes(N_Vector v, N_Vector Jv, N_Vector u,
   N_VLinearSum(ONE, u, sigma, v, kin_mem->kin_vtemp1);
 
   /* call the system function to calculate func(u+sigma*v) */
-  retval = kin_mem->kin_func(kin_mem->kin_vtemp1, kin_mem->kin_vtemp2,
-                             kin_mem->kin_user_data);
+  retval = kinls_mem->jt_func(kin_mem->kin_vtemp1, kin_mem->kin_vtemp2,
+                              kin_mem->kin_user_data);
   kinls_mem->nfeDQ++;
   if (retval != 0) return(retval);
 
diff --git a/src/kinsol/kinsol_ls_impl.h b/src/kinsol/kinsol_ls_impl.h
index ffcf35ff10..77b1a5bb82 100644
--- a/src/kinsol/kinsol_ls_impl.h
+++ b/src/kinsol/kinsol_ls_impl.h
@@ -96,6 +96,7 @@ typedef struct KINLsMemRec {
          - jtimesDQ == SUNTRUE */
   booleantype jtimesDQ;
   KINLsJacTimesVecFn jtimes;
+  KINSysFn jt_func;
   void *jt_data;
 
 } *KINLsMem;
diff --git a/src/nvector/cuda/CMakeLists.txt b/src/nvector/cuda/CMakeLists.txt
index 063906f5a6..824355e390 100644
--- a/src/nvector/cuda/CMakeLists.txt
+++ b/src/nvector/cuda/CMakeLists.txt
@@ -17,7 +17,7 @@
 install(CODE "MESSAGE(\"\nInstall NVECTOR_CUDA\n\")")
 
 # Add variable nveccuda_SOURCES with the sources for the NVECUDA lib
-set(nveccuda_SOURCES nvector_cuda.cu)
+set(nveccuda_SOURCES nvector_cuda.cu VectorKernels.cuh VectorArrayKernels.cuh)
 
 # Add variable shared_SOURCES with the common SUNDIALS sources which will
 # also be included in the NVECCUDA library
@@ -28,8 +28,7 @@ set(shared_SOURCES
 add_prefix(${sundials_SOURCE_DIR}/src/sundials/ shared_SOURCES)
 
 # Add variable nveccuda_HEADERS with the exported NVECUDA header files
-set(nveccuda_HEADERS nvector_cuda.h)
-add_prefix(${sundials_SOURCE_DIR}/include/nvector/ nveccuda_HEADERS)
+set(nveccuda_HEADERS ${sundials_SOURCE_DIR}/include/nvector/nvector_cuda.h)
 
 # Define C preprocessor flag -DBUILD_SUNDIALS_LIBRARY
 add_definitions(-DBUILD_SUNDIALS_LIBRARY)
@@ -41,6 +40,7 @@ add_definitions(-DBUILD_SUNDIALS_LIBRARY)
 if(BUILD_STATIC_LIBS)
   add_library(sundials_nveccuda_static STATIC ${nveccuda_SOURCES} ${shared_SOURCES})
   target_compile_features(sundials_nveccuda_static PUBLIC cxx_std_11)
+  target_include_directories(sundials_nveccuda_static PRIVATE . ${sundials_SOURCE_DIR}/src/sundials)
   set_target_properties(sundials_nveccuda_static PROPERTIES
                         OUTPUT_NAME sundials_nveccuda
                         CLEAN_DIRECT_OUTPUT 1)
@@ -55,6 +55,7 @@ endif(BUILD_STATIC_LIBS)
 if(BUILD_SHARED_LIBS)
   add_library(sundials_nveccuda_shared SHARED ${nveccuda_SOURCES} ${shared_SOURCES})
   target_compile_features(sundials_nveccuda_shared PUBLIC cxx_std_11)
+  target_include_directories(sundials_nveccuda_shared PRIVATE . ${sundials_SOURCE_DIR}/src/sundials)
   set_target_properties(sundials_nveccuda_shared PROPERTIES
                         OUTPUT_NAME sundials_nveccuda
                         CLEAN_DIRECT_OUTPUT 1
@@ -65,6 +66,5 @@ endif(BUILD_SHARED_LIBS)
 
 # Install the CUDA NVector header files
 install(FILES ${nveccuda_HEADERS} DESTINATION include/nvector)
-install(DIRECTORY ${sundials_SOURCE_DIR}/include/nvector/cuda DESTINATION include/nvector)
 
 message(STATUS "Added NVECTOR_CUDA module")
diff --git a/src/nvector/cuda/VectorArrayKernels.cuh b/src/nvector/cuda/VectorArrayKernels.cuh
new file mode 100644
index 0000000000..f6365b7495
--- /dev/null
+++ b/src/nvector/cuda/VectorArrayKernels.cuh
@@ -0,0 +1,239 @@
+/*
+ * -----------------------------------------------------------------
+ * Programmer(s): David Gardner, Cody J. Balos @ LLNL
+ * -----------------------------------------------------------------
+ * SUNDIALS Copyright Start
+ * Copyright (c) 2002-2020, Lawrence Livermore National Security
+ * and Southern Methodist University.
+ * All rights reserved.
+ *
+ * See the top-level LICENSE and NOTICE files for details.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ * SUNDIALS Copyright End
+ * -----------------------------------------------------------------
+ */
+
+
+#ifndef _NVECTOR_CUDA_ARRAY_KERNELS_CUH_
+#define _NVECTOR_CUDA_ARRAY_KERNELS_CUH_
+
+#include <limits>
+#include <cuda_runtime.h>
+
+#include "sundials_cuda_kernels.cuh"
+
+using namespace sundials::cuda;
+
+namespace sundials
+{
+namespace nvector_cuda
+{
+
+/* -----------------------------------------------------------------
+ * The namespace for CUDA kernels
+ *
+ * Reduction CUDA kernels in nvector are based in part on the "reduction"
+ * example in the NVIDIA Corporation CUDA Samples, and on the parallel
+ * reduction examples in the textbook by J. Cheng et al., "CUDA C Programming".
+ * -----------------------------------------------------------------
+ */
+
+/*
+ * -----------------------------------------------------------------------------
+ * fused vector operation kernels
+ * -----------------------------------------------------------------------------
+ */
+
+/*
+ * Linear combination of nv vectors: zd[i] = c[0]*xd[0][i] + ... + c[nv-1]*xd[nv-1][i]
+ */
+template <typename T, typename I>
+__global__ void
+linearCombinationKernel(int nv, T* c, T** xd, T* zd, I n)
+{
+  GRID_STRIDE_XLOOP(I, i, n)
+  {
+    zd[i] = c[0]*xd[0][i]; // the j = 0 term is read unconditionally, so nv >= 1 is required
+    for (int j=1; j<nv; j++)
+      zd[i] += c[j]*xd[j][i]; // remaining terms are accumulated in place in zd[i]
+  }
+}
+
+/*
+ * Scaled sum of one vector with nv others: zd[j][i] = c[j]*xd[i] + yd[j][i], j < nv
+ */
+template <typename T, typename I>
+__global__ void
+scaleAddMultiKernel(int nv, T* c, T* xd, T** yd, T** zd, I n)
+{
+  GRID_STRIDE_XLOOP(I, i, n)
+  {
+    for (int j=0; j<nv; j++)
+      zd[j][i] = c[j] * xd[i] + yd[j][i];
+  }
+}
+
+
+/*
+ * Dot product of xd with each of nv other vectors; block k handles yd[k].
+ * The block's sum is added into out[k], so out must be initialised by the caller.
+ */
+template <typename T, typename I>
+__global__ void
+dotProdMultiKernel(int nv, const T* xd, T** yd, T* out, I n)
+{
+  // REQUIRES nv blocks (i.e. gridDim.x == nv); nv itself is not read below
+  const I k = blockIdx.x;
+
+  // Per-thread partial sum, initialized to zero.
+  T sum = 0.0;
+  for (I i = threadIdx.x; i < n; i += blockDim.x)
+  { // thread t visits i = t, t + blockDim.x, ... (about n/blockDim.x elements)
+    sum += xd[i] * yd[k][i];
+  }
+  sum = blockReduce<T, RSUM>(sum, 0.0); 
+
+  // Thread 0 adds the value returned by blockReduce into out[k]
+  if (threadIdx.x == 0) atomicAdd(&out[k], sum);
+}
+
+
+/*
+ * -----------------------------------------------------------------------------
+ * vector array operation kernels
+ * -----------------------------------------------------------------------------
+ */
+
+ 
+/*
+ * Linear sum of two vector arrays: zd[j][i] = a*xd[j][i] + b*yd[j][i], j < nv
+ */
+template <typename T, typename I>
+__global__ void
+linearSumVectorArrayKernel(int nv, T a, T** xd, T b, T** yd, T** zd, I n)
+{
+  GRID_STRIDE_XLOOP(I, i, n)
+  {
+    for (int j=0; j<nv; j++)
+      zd[j][i] = a * xd[j][i] + b * yd[j][i];
+  }
+}
+
+
+/*
+ * Scales each of nv vectors by its own factor: zd[j][i] = c[j]*xd[j][i], j < nv
+ */
+template <typename T, typename I>
+__global__ void
+scaleVectorArrayKernel(int nv, T* c, T** xd, T** zd, I n)
+{
+  GRID_STRIDE_XLOOP(I, i, n)
+  {
+    for (int j=0; j<nv; j++)
+      zd[j][i] = c[j] * xd[j][i];
+  }
+}
+
+
+/*
+ * Sets every visited element of nv vectors to the constant c: zd[j][i] = c, j < nv
+ */
+template <typename T, typename I>
+__global__ void
+constVectorArrayKernel(int nv, T c, T** zd, I n)
+{
+  GRID_STRIDE_XLOOP(I, i, n)
+  {
+    for (int j=0; j<nv; j++)
+      zd[j][i] = c;
+  }
+}
+
+
+/*
+ * Weighted L2 norm squared of nv vectors (no square root or division by n is
+ * taken here); block k adds sum_i (xd[k][i]*wd[k][i])^2 into out[k].
+ */
+template <typename T, typename I>
+__global__ void
+wL2NormSquareVectorArrayKernel(int nv, T** xd, T** wd, T* out, I n)
+{
+  // REQUIRES nv blocks (i.e. gridDim.x == nv); nv itself is not read below
+  const I k = blockIdx.x;
+
+  // Per-thread partial sum, initialized to zero.
+  T sum = 0.0;
+  for (I i = threadIdx.x; i < n; i += blockDim.x)
+  { // thread t visits i = t, t + blockDim.x, ... (about n/blockDim.x elements)
+    sum += xd[k][i] * wd[k][i] * xd[k][i] * wd[k][i];
+  }
+  sum = blockReduce<T, RSUM>(sum, 0.0);
+
+  // Thread 0 adds the value returned by blockReduce into out[k] (out is not reset here)
+  if (threadIdx.x == 0) atomicAdd(&out[k], sum);
+}
+
+
+/*
+ * Masked weighted L2 norm squared of nv vectors; block k adds the sum of
+ * (xd[k][i]*wd[k][i])^2 over the indices with id[i] > 0 into out[k].
+ */
+template <typename T, typename I>
+__global__ void
+wL2NormSquareMaskVectorArrayKernel(int nv, T** xd, T** wd, T* id, T* out, I n)
+{
+  // REQUIRES nv blocks (i.e. gridDim.x == nv); nv itself is not read below
+  const I k = blockIdx.x;
+
+  // Per-thread partial sum, initialized to zero.
+  T sum = 0.0;
+  for (I i = threadIdx.x; i < n; i += blockDim.x)
+  { // the single mask vector id is shared by all nv vectors
+    if (id[i] > 0.0) sum += xd[k][i] * wd[k][i] * xd[k][i] * wd[k][i];
+  }
+  sum = blockReduce<T, RSUM>(sum, 0.0);
+
+  // Thread 0 adds the value returned by blockReduce into out[k] (out is not reset here)
+  if (threadIdx.x == 0) atomicAdd(&out[k], sum);
+}
+
+
+/*
+ * Scaled sums over vector arrays: zd[k*ns+j][i] = c[j]*xd[k][i] + yd[k*ns+j][i]
+ */
+template <typename T, typename I>
+__global__ void
+scaleAddMultiVectorArrayKernel(int nv, int ns, T* c, T** xd, T** yd, T** zd, I n)
+{
+  GRID_STRIDE_XLOOP(I, i, n)
+  {
+    for (int k=0; k<nv; k++)
+      for (int j=0; j<ns; j++)
+        zd[k*ns+j][i] = c[j] * xd[k][i] + yd[k*ns+j][i]; // yd and zd are indexed as flat nv*ns pointer arrays
+  }
+}
+
+
+/*
+ * Linear combinations over vector arrays: zd[k][i] = sum over j < ns of c[j]*xd[k*ns+j][i]
+ */
+template <typename T, typename I>
+__global__ void
+linearCombinationVectorArrayKernel(int nv, int ns, T* c, T** xd, T** zd, I n)
+{
+  GRID_STRIDE_XLOOP(I, i, n)
+  {
+    for (int k=0; k<nv; k++)
+    {
+      zd[k][i] = c[0]*xd[k*ns][i]; // the j = 0 term is read unconditionally, so ns >= 1 is required
+      for (int j=1; j<ns; j++)
+        zd[k][i] += c[j]*xd[k*ns+j][i]; // remaining terms are accumulated in place
+    }
+  }
+}
+
+} // namespace nvector_cuda
+} // namespace sundials
+
+#endif // _NVECTOR_CUDA_ARRAY_KERNELS_CUH_
diff --git a/src/nvector/cuda/VectorKernels.cuh b/src/nvector/cuda/VectorKernels.cuh
new file mode 100644
index 0000000000..f8d85cb792
--- /dev/null
+++ b/src/nvector/cuda/VectorKernels.cuh
@@ -0,0 +1,378 @@
+/*
+ * -----------------------------------------------------------------
+ * Programmer(s): Slaven Peles, Cody J. Balos @ LLNL
+ * -----------------------------------------------------------------
+ * SUNDIALS Copyright Start
+ * Copyright (c) 2002-2020, Lawrence Livermore National Security
+ * and Southern Methodist University.
+ * All rights reserved.
+ *
+ * See the top-level LICENSE and NOTICE files for details.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ * SUNDIALS Copyright End
+ * -----------------------------------------------------------------
+ */
+
+#ifndef _NVECTOR_CUDA_KERNELS_CUH_
+#define _NVECTOR_CUDA_KERNELS_CUH_
+
+#include <limits>
+#include <cuda_runtime.h>
+
+#include "sundials_cuda_kernels.cuh"
+
+using namespace sundials::cuda;
+
+namespace sundials
+{
+namespace nvector_cuda
+{
+
+/* -----------------------------------------------------------------
+ * The namespace for CUDA kernels
+ *
+ * Reduction CUDA kernels in nvector are based in part on the "reduction"
+ * example in the NVIDIA Corporation CUDA Samples, and on the parallel
+ * reduction examples in the textbook by J. Cheng et al., "CUDA C Programming".
+ * -----------------------------------------------------------------
+ */
+
+/*
+ * Sets X[i] = a for every index i visited by GRID_STRIDE_XLOOP(I, i, n).
+ * X is only written, never read.
+ */
+
+template <typename T, typename I>
+__global__ void
+setConstKernel(T a, T *X, I n)
+{
+  GRID_STRIDE_XLOOP(I, i, n)
+  {
+    X[i] = a;
+  }
+}
+
+
+/*
+ * Linear sum of two vectors: Z[i] = a*X[i] + b*Y[i] for every index i
+ * visited by GRID_STRIDE_XLOOP(I, i, n).
+ */
+
+template <typename T, typename I>
+__global__ void
+linearSumKernel(T a, const T *X, T b, const T *Y, T *Z, I n)
+{
+  GRID_STRIDE_XLOOP(I, i, n)
+  {
+    Z[i] = a*X[i] + b*Y[i];
+  }
+}
+
+
+/*
+ * Elementwise product of two vectors: Z[i] = X[i]*Y[i] for every index i
+ * visited by GRID_STRIDE_XLOOP(I, i, n).
+ */
+
+template <typename T, typename I>
+__global__ void
+prodKernel(const T *X, const T *Y, T *Z, I n)
+{
+  GRID_STRIDE_XLOOP(I, i, n)
+  {
+    Z[i] = X[i]*Y[i];
+  }
+}
+
+
+/*
+ * Elementwise division of two vectors: Z[i] = X[i]/Y[i]. Y[i] is not tested
+ * against zero here, so a nonzero denominator is the caller's responsibility.
+ */
+
+template <typename T, typename I>
+__global__ void
+divKernel(const T *X, const T *Y, T *Z, I n)
+{
+  GRID_STRIDE_XLOOP(I, i, n)
+  {
+    Z[i] = X[i]/Y[i];
+  }
+}
+
+
+/*
+ * Scales a vector by the scalar 'a': Z[i] = a*X[i] for every index i
+ * visited by GRID_STRIDE_XLOOP(I, i, n).
+ */
+
+template <typename T, typename I>
+__global__ void
+scaleKernel(T a, const T *X, T *Z, I n)
+{
+  GRID_STRIDE_XLOOP(I, i, n)
+  {
+    Z[i] = a*X[i];
+  }
+}
+
+
+/*
+ * Stores the absolute values of the elements of X in Z: Z[i] = abs(X[i])
+ * for every index i visited by GRID_STRIDE_XLOOP(I, i, n).
+ */
+
+template <typename T, typename I>
+__global__ void
+absKernel(const T *X, T *Z, I n)
+{
+  GRID_STRIDE_XLOOP(I, i, n)
+  {
+    Z[i] = abs(X[i]);
+  }
+}
+
+
+/*
+ * Elementwise inversion: Z[i] = 1.0/X[i]. X[i] is not tested against zero
+ * here; invTestKernel is the variant that checks for zeros.
+ */
+
+template <typename T, typename I>
+__global__ void
+invKernel(const T *X, T *Z, I n)
+{
+  GRID_STRIDE_XLOOP(I, i, n)
+  {
+    Z[i] = 1.0/(X[i]);
+  }
+}
+
+
+/*
+ * Adds the constant 'a' to each vector element: Z[i] = a + X[i] for every
+ * index i visited by GRID_STRIDE_XLOOP(I, i, n).
+ */
+
+template <typename T, typename I>
+__global__ void
+addConstKernel(T a, const T *X, T *Z, I n)
+{
+  GRID_STRIDE_XLOOP(I, i, n)
+  {
+    Z[i] = a + X[i];
+  }
+}
+
+
+/*
+ * Compares the absolute values of vector 'X' with the constant 'c':
+ * Z[i] = 1.0 if abs(X[i]) >= c, otherwise Z[i] = 0.0.
+ */
+
+template <typename T, typename I>
+__global__ void
+compareKernel(T c, const T *X, T *Z, I n)
+{
+  GRID_STRIDE_XLOOP(I, i, n)
+  {
+    Z[i] = (abs(X[i]) >= c) ? 1.0 : 0.0;
+  }
+}
+
+
+/*
+ * Dot product of two vectors. Each block adds its partial sum into *out
+ * with atomicAdd, so *out must be initialised by the caller.
+ */
+template <typename T, typename I>
+__global__ void
+dotProdKernel(const T *x, const T *y, T *out, I n)
+{
+  T sum = 0.0;
+  GRID_STRIDE_XLOOP(I, i, n)
+  {
+    sum += x[i] * y[i];
+  }
+  sum = blockReduce<T, RSUM>(sum, 0.0);
+
+  // Thread 0 of each block adds the value returned by blockReduce into *out
+  if (threadIdx.x == 0) atomicAdd(out, sum);
+}
+
+
+/*
+ * Finds the max norm of the vector, i.e. the maximum of abs(x[i]). Block
+ * results are folded into *out with AtomicMax; *out is not reset here.
+ */
+template <typename T, typename I>
+__global__ void
+maxNormKernel(const T *x, T *out, I n)
+{
+  T maximum = 0.0;
+  GRID_STRIDE_XLOOP(I, i, n)
+  {
+    maximum = max(abs(x[i]), maximum);
+  }
+  maximum = blockReduce<T, RMAX>(maximum, 0.0); 
+
+  // Thread 0 of each block folds the value returned by blockReduce into *out
+  if (threadIdx.x == 0) AtomicMax(out, maximum);
+}
+
+
+/*
+ * Weighted L2 norm squared: the sum of (x[i]*w[i])^2. Each block adds its
+ * partial sum into *out with atomicAdd, so *out must be initialised by the caller.
+ */
+template <typename T, typename I>
+__global__ void
+wL2NormSquareKernel(const T *x, const T *w, T *out, I n)
+{
+  T sum = 0.0;
+  GRID_STRIDE_XLOOP(I, i, n)
+  {
+    sum += x[i] * w[i] * x[i] * w[i];
+  }
+  sum = blockReduce<T, RSUM>(sum, 0.0); 
+
+  // Thread 0 of each block adds the value returned by blockReduce into *out
+  if (threadIdx.x == 0) atomicAdd(out, sum);
+}
+
+/*
+ * Weighted L2 norm squared with mask: the sum of (x[i]*w[i])^2 over the
+ * indices with id[i] > 0. Partial sums are added into *out with atomicAdd.
+ */
+template <typename T, typename I>
+__global__ void
+wL2NormSquareMaskKernel(const T *x, const T *w, const T *id, T *out, I n)
+{
+  T sum = 0.0;
+  GRID_STRIDE_XLOOP(I, i, n)
+  {
+    if(id[i] > 0.0) sum += x[i] * w[i] * x[i] * w[i];
+  }
+  sum = blockReduce<T, RSUM>(sum, 0.0); 
+
+  // Thread 0 of each block adds the value returned by blockReduce into *out
+  if (threadIdx.x == 0) atomicAdd(out, sum);
+}
+
+
+/*
+ * Finds the minimum value in the vector. MAX_VAL seeds each thread's running
+ * minimum and is passed to blockReduce; block results go to *out via AtomicMin.
+ */
+template <typename T, typename I>
+__global__ void
+findMinKernel(T MAX_VAL, const T *x, T *out, I n)
+{
+  T minimum = MAX_VAL;
+  GRID_STRIDE_XLOOP(I, i, n)
+  {
+    minimum = min(x[i], minimum);
+  }
+  minimum = blockReduce<T, RMIN>(minimum, MAX_VAL); 
+
+  // Thread 0 of each block folds the value returned by blockReduce into *out
+  if (threadIdx.x == 0) AtomicMin(out, minimum);
+}
+
+
+/*
+ * Computes the L1 norm of the vector: the sum of abs(x[i]). Each block adds
+ * its partial sum into *out with atomicAdd, so *out must be initialised by the caller.
+ */
+template <typename T, typename I>
+__global__ void
+L1NormKernel(const T *x, T *out, I n)
+{
+  T sum = 0.0;
+  GRID_STRIDE_XLOOP(I, i, n)
+  {
+    sum += abs(x[i]);
+  }
+  sum = blockReduce<T, RSUM>(sum, 0.0); 
+
+  // Thread 0 of each block adds the value returned by blockReduce into *out
+  if (threadIdx.x == 0) atomicAdd(out, sum);
+}
+
+/*
+ * Vector inverse z[i] = 1/x[i] with a check for zeros. Where x[i] == 0, z[i] is
+ * left unwritten and a per-thread counter is incremented instead; the counters
+ * are reduced and added into *out with atomicAdd.
+ */
+template <typename T, typename I>
+__global__ void
+invTestKernel(const T *x, T *z, T *out, I n)
+{
+  T flag = 0.0;
+  GRID_STRIDE_XLOOP(I, i, n)
+  {
+    if (x[i] == static_cast<T>(0.0))
+      flag += 1.0;
+    else
+      z[i] = 1.0/x[i];
+  }
+  flag = blockReduce<T, RSUM>(flag, 0.0); 
+
+  // Thread 0 of each block adds the value returned by blockReduce into *out
+  if (threadIdx.x == 0) atomicAdd(out, flag);
+}
+
+
+/*
+ * Checks if inequality constraints are satisfied. Constraint check
+ * results are stored in vector 'm' (1.0 = violated, 0.0 = satisfied).
+ * The number of violations is summed and added into *out with atomicAdd,
+ * so with *out initialised to zero, all constraints hold iff *out == 0.
+ *
+ */
+template <typename T, typename I>
+__global__ void
+constrMaskKernel(const T *c, const T *x, T *m, T *out, I n)
+{
+  T sum = 0.0;
+  GRID_STRIDE_XLOOP(I, i, n)
+  {
+    // test = true if constraints violated
+    bool test = (std::abs(c[i]) > 1.5 && c[i]*x[i] <= 0.0) ||
+                (std::abs(c[i]) > 0.5 && c[i]*x[i] <  0.0);
+    m[i] = test ? 1.0 : 0.0;
+    sum += m[i]; // accumulate, not overwrite: this thread may visit more than one index
+  }
+  sum = blockReduce<T, RSUM>(sum, 0.0); 
+
+  // Thread 0 of each block adds the value returned by blockReduce into *out
+  if (threadIdx.x == 0) atomicAdd(out, sum);
+}
+
+
+/*
+ * Finds the minimum of num[i]/den[i]; entries with den[i] == 0 contribute
+ * MAX_VAL. Block results are folded into *min_quotient via AtomicMin.
+ */
+template <typename T, typename I>
+__global__ void
+minQuotientKernel(const T MAX_VAL, const T *num, const T *den, T *min_quotient, I n)
+{
+  T minimum = MAX_VAL;
+  T quotient = 0.0;
+  GRID_STRIDE_XLOOP(I, i, n)
+  {
+    quotient = (den[i] == static_cast<T>(0.0)) ? MAX_VAL : num[i]/den[i];
+    minimum = min(quotient, minimum);
+  }
+  minimum = blockReduce<T, RMIN>(minimum, MAX_VAL); 
+
+  // Thread 0 of each block folds the value returned by blockReduce into *min_quotient
+  if (threadIdx.x == 0) AtomicMin(min_quotient, minimum);
+}
+
+} // namespace nvector_cuda
+} // namespace sundials
+
+#endif // _NVECTOR_CUDA_KERNELS_CUH_
diff --git a/src/nvector/cuda/nvector_cuda.cu b/src/nvector/cuda/nvector_cuda.cu
index 12ea644d42..55fa4f5a66 100644
--- a/src/nvector/cuda/nvector_cuda.cu
+++ b/src/nvector/cuda/nvector_cuda.cu
@@ -15,29 +15,63 @@
  * of the NVECTOR package.
  * -----------------------------------------------------------------*/
 
-#include <stdio.h>
-#include <stdlib.h>
+#include <cstdio>
+#include <cstdlib>
 #include <cmath>
+#include <limits>
 
-#include <nvector/cuda/Vector.hpp>
-#include <nvector/cuda/VectorKernels.cuh>
-#include <nvector/cuda/VectorArrayKernels.cuh>
+#include <nvector/nvector_cuda.h>
+#include "VectorArrayKernels.cuh"
+#include "VectorKernels.cuh"
 
-#define ZERO   RCONST(0.0)
-#define HALF   RCONST(0.5)
-#define ONE    RCONST(1.0)
-#define ONEPT5 RCONST(1.5)
+#include "sundials_cuda.h"
+#include "sundials_debug.h"
+
+#define ZERO RCONST(0.0) 
+#define HALF RCONST(0.5)
 
 extern "C" {
 
-using namespace suncudavec;
+using namespace sundials;
+using namespace sundials::nvector_cuda;
 
 /*
- * Type definitions
+ * Macro definitions
  */
 
-typedef suncudavec::Vector<realtype, sunindextype> vector_type;
-typedef suncudavec::ThreadPartitioning<realtype, sunindextype> part_type;
+#define NVEC_CUDA_CONTENT(x) ((N_VectorContent_Cuda)((x)->content)) /* typed view of x->content; argument parenthesised so any pointer expression is safe */
+#define NVEC_CUDA_PRIVATE(x) ((N_PrivateVectorContent_Cuda)(NVEC_CUDA_CONTENT(x)->priv)) /* typed view of the content's priv pointer */
+#define NVEC_CUDA_MEMSIZE(x) (NVEC_CUDA_CONTENT(x)->length * sizeof(realtype)) /* length times sizeof(realtype), i.e. bytes */
+#define NVEC_CUDA_STREAM(x)  (NVEC_CUDA_CONTENT(x)->stream_exec_policy->stream()) /* stream() of the vector's stream_exec_policy */
+
+/*
+ * Private content, reached through the priv pointer of the vector content (NVEC_CUDA_PRIVATE)
+ */
+
+struct _N_PrivateVectorContent_Cuda
+{
+  booleantype use_managed_mem; /* SUNTRUE if the data pointers and buffer pointers are managed memory */
+  size_t      reduce_buffer_allocated_bytes; /* current size of the reduction buffer, in bytes */
+  realtype*   reduce_buffer_dev;      /* device buffer used for reductions */
+  realtype*   reduce_buffer_host;     /* host buffer used for reductions */
+  void*       (*userallocfn)(size_t); /* user-provided allocator, or NULL (assumes managed mem) */
+  void        (*userfreefn)(void*);   /* user-provided free function, or NULL */
+  booleantype own_exec; /* SUNTRUE if the exec policy is owned by the vector */
+};
+
+typedef struct _N_PrivateVectorContent_Cuda *N_PrivateVectorContent_Cuda;
+
+/*
+ * Private function prototypes
+ */
+
+static int AllocateData(N_Vector v);
+static int InitializeReductionBuffer(N_Vector v, const realtype value);
+static void FreeReductionBuffer(N_Vector v);
+static int CopyReductionBufferFromDevice(N_Vector v, size_t n = 1);
+static int GetKernelParameters(N_Vector v, booleantype reduction, size_t& grid, size_t& block,
+                               size_t& shMemSize, cudaStream_t& stream, size_t n = 0);
+static void PostKernelLaunch();                               
 
 /* ----------------------------------------------------------------
  * Returns vector type ID. Used to identify vector implementation
@@ -101,6 +135,36 @@ N_Vector N_VNewEmpty_Cuda()
   v->ops->nvwsqrsumlocal     = N_VWSqrSumLocal_Cuda;
   v->ops->nvwsqrsummasklocal = N_VWSqrSumMaskLocal_Cuda;
 
+  /* Create content */
+
+  v->content = (N_VectorContent_Cuda) malloc(sizeof(_N_VectorContent_Cuda));
+  if (v->content == NULL)
+  {
+    N_VDestroy(v);
+    return NULL;
+  }
+
+  NVEC_CUDA_CONTENT(v)->priv = malloc(sizeof(_N_PrivateVectorContent_Cuda));
+  if (NVEC_CUDA_CONTENT(v)->priv == NULL)
+  {
+    N_VDestroy(v);
+    return NULL;
+  }
+
+  NVEC_CUDA_CONTENT(v)->length                        = 0;
+  NVEC_CUDA_CONTENT(v)->own_data                      = SUNFALSE;
+  NVEC_CUDA_CONTENT(v)->host_data                     = NULL;
+  NVEC_CUDA_CONTENT(v)->device_data                   = NULL;
+  NVEC_CUDA_CONTENT(v)->stream_exec_policy            = NULL;
+  NVEC_CUDA_CONTENT(v)->reduce_exec_policy            = NULL;
+  NVEC_CUDA_PRIVATE(v)->own_exec                      = SUNTRUE;
+  NVEC_CUDA_PRIVATE(v)->use_managed_mem               = SUNFALSE;
+  NVEC_CUDA_PRIVATE(v)->userallocfn                   = NULL;
+  NVEC_CUDA_PRIVATE(v)->userfreefn                    = NULL;
+  NVEC_CUDA_PRIVATE(v)->reduce_buffer_dev             = NULL;
+  NVEC_CUDA_PRIVATE(v)->reduce_buffer_host            = NULL;
+  NVEC_CUDA_PRIVATE(v)->reduce_buffer_allocated_bytes = 0;
+  
   return(v);
 }
 
@@ -112,7 +176,26 @@ N_Vector N_VNew_Cuda(sunindextype length)
   v = N_VNewEmpty_Cuda();
   if (v == NULL) return(NULL);
 
-  v->content = new vector_type(length);
+  NVEC_CUDA_CONTENT(v)->length                        = length;
+  NVEC_CUDA_CONTENT(v)->own_data                      = SUNTRUE;
+  NVEC_CUDA_CONTENT(v)->host_data                     = NULL;
+  NVEC_CUDA_CONTENT(v)->device_data                   = NULL;
+  NVEC_CUDA_CONTENT(v)->stream_exec_policy            = new CudaThreadDirectExecPolicy(256);
+  NVEC_CUDA_CONTENT(v)->reduce_exec_policy            = new CudaBlockReduceExecPolicy(256);
+  NVEC_CUDA_PRIVATE(v)->own_exec                      = SUNTRUE;
+  NVEC_CUDA_PRIVATE(v)->use_managed_mem               = SUNFALSE;
+  NVEC_CUDA_PRIVATE(v)->userallocfn                   = NULL;
+  NVEC_CUDA_PRIVATE(v)->userfreefn                    = NULL;
+  NVEC_CUDA_PRIVATE(v)->reduce_buffer_dev             = NULL;
+  NVEC_CUDA_PRIVATE(v)->reduce_buffer_host            = NULL;
+  NVEC_CUDA_PRIVATE(v)->reduce_buffer_allocated_bytes = 0;
+
+  if (AllocateData(v))
+  {
+    SUNDIALS_DEBUG_PRINT("ERROR in N_VNew_Cuda: AllocateData returned nonzero\n");
+    N_VDestroy(v);
+    return NULL;
+  }
 
   return(v);
 }
@@ -129,8 +212,26 @@ N_Vector N_VNewManaged_Cuda(sunindextype length)
      nvgetarraypointer since the host and device pointers are the same */
   v->ops->nvgetarraypointer = N_VGetHostArrayPointer_Cuda;
 
-  /* create suncudavec::Vector with managed memory */
-  v->content = new vector_type(length, true);
+  NVEC_CUDA_CONTENT(v)->length                        = length;
+  NVEC_CUDA_CONTENT(v)->own_data                      = SUNTRUE;
+  NVEC_CUDA_CONTENT(v)->host_data                     = NULL;
+  NVEC_CUDA_CONTENT(v)->device_data                   = NULL;
+  NVEC_CUDA_CONTENT(v)->stream_exec_policy            = new CudaThreadDirectExecPolicy(256);
+  NVEC_CUDA_CONTENT(v)->reduce_exec_policy            = new CudaBlockReduceExecPolicy(256);
+  NVEC_CUDA_PRIVATE(v)->own_exec                      = SUNTRUE;
+  NVEC_CUDA_PRIVATE(v)->use_managed_mem               = SUNTRUE;
+  NVEC_CUDA_PRIVATE(v)->userallocfn                   = NULL;
+  NVEC_CUDA_PRIVATE(v)->userfreefn                    = NULL;
+  NVEC_CUDA_PRIVATE(v)->reduce_buffer_dev             = NULL;
+  NVEC_CUDA_PRIVATE(v)->reduce_buffer_host            = NULL;
+  NVEC_CUDA_PRIVATE(v)->reduce_buffer_allocated_bytes = 0;
+
+  if (AllocateData(v))
+  {
+    SUNDIALS_DEBUG_PRINT("ERROR in N_VNewManaged_Cuda: AllocateData returned nonzero\n");
+    N_VDestroy(v);
+    return NULL;
+  }
 
   return(v);
 }
@@ -145,8 +246,19 @@ N_Vector N_VMake_Cuda(sunindextype length, realtype *h_vdata, realtype *d_vdata)
   v = N_VNewEmpty_Cuda();
   if (v == NULL) return(NULL);
 
-  /* create suncudavec::Vector using the user-provided data arrays */
-  v->content = new vector_type(length, false, false, h_vdata, d_vdata);
+  NVEC_CUDA_CONTENT(v)->length                        = length;
+  NVEC_CUDA_CONTENT(v)->own_data                      = SUNFALSE;
+  NVEC_CUDA_CONTENT(v)->host_data                     = h_vdata;
+  NVEC_CUDA_CONTENT(v)->device_data                   = d_vdata;
+  NVEC_CUDA_CONTENT(v)->stream_exec_policy            = new CudaThreadDirectExecPolicy(256);
+  NVEC_CUDA_CONTENT(v)->reduce_exec_policy            = new CudaBlockReduceExecPolicy(256);
+  NVEC_CUDA_PRIVATE(v)->own_exec                      = SUNTRUE;
+  NVEC_CUDA_PRIVATE(v)->use_managed_mem               = SUNFALSE;
+  NVEC_CUDA_PRIVATE(v)->userallocfn                   = NULL;
+  NVEC_CUDA_PRIVATE(v)->userfreefn                    = NULL;
+  NVEC_CUDA_PRIVATE(v)->reduce_buffer_dev             = NULL;
+  NVEC_CUDA_PRIVATE(v)->reduce_buffer_host            = NULL;
+  NVEC_CUDA_PRIVATE(v)->reduce_buffer_allocated_bytes = 0;
 
   return(v);
 }
@@ -165,8 +277,24 @@ N_Vector N_VMakeManaged_Cuda(sunindextype length, realtype *vdata)
      nvgetarraypointer since the host and device pointers are the same */
   v->ops->nvgetarraypointer = N_VGetHostArrayPointer_Cuda;
 
-  /* create suncudavec::Vector with managed memory using the user-provided data arrays */
-  v->content = new vector_type(length, true, false, vdata, vdata);
+  /* Wrap the caller's managed array: host_data and device_data alias the same
+     pointer, and own_data stays SUNFALSE so N_VDestroy_Cuda does not free it. */
+  N_VectorContent_Cuda        content = NVEC_CUDA_CONTENT(v);
+  N_PrivateVectorContent_Cuda priv    = NVEC_CUDA_PRIVATE(v);
+
+  content->length                     = length;
+  content->own_data                   = SUNFALSE;
+  content->host_data                  = vdata;
+  content->device_data                = vdata;
+  content->stream_exec_policy         = new CudaThreadDirectExecPolicy(256);
+  content->reduce_exec_policy         = new CudaBlockReduceExecPolicy(256);
+  priv->own_exec                      = SUNTRUE;
+  priv->use_managed_mem               = SUNTRUE;
+  priv->userallocfn                   = NULL;
+  priv->userfreefn                    = NULL;
+  priv->reduce_buffer_dev             = NULL;
+  priv->reduce_buffer_host            = NULL;
+  priv->reduce_buffer_allocated_bytes = 0;
 
   return(v);
 }
@@ -185,8 +308,32 @@ N_Vector N_VMakeWithManagedAllocator_Cuda(sunindextype length,
      nvgetarraypointer since the host and device pointers are the same */
   v->ops->nvgetarraypointer = N_VGetHostArrayPointer_Cuda;
 
-  /* create suncudavec::Vector with a custom allocator/deallocator */
-  v->content = new vector_type(length, allocfn, freefn, true);
+  /* Managed-memory vector that records the caller's allocfn/freefn. The data
+     pointers stay NULL until AllocateData() fills them in (presumably via
+     userallocfn -- TODO confirm in AllocateData). */
+  N_VectorContent_Cuda        content = NVEC_CUDA_CONTENT(v);
+  N_PrivateVectorContent_Cuda priv    = NVEC_CUDA_PRIVATE(v);
+
+  content->length                     = length;
+  content->own_data                   = SUNTRUE;
+  content->host_data                  = NULL;
+  content->device_data                = NULL;
+  content->stream_exec_policy         = new CudaThreadDirectExecPolicy(256);
+  content->reduce_exec_policy         = new CudaBlockReduceExecPolicy(256);
+  priv->own_exec                      = SUNTRUE;
+  priv->use_managed_mem               = SUNTRUE;
+  priv->userallocfn                   = allocfn;
+  priv->userfreefn                    = freefn;
+  priv->reduce_buffer_dev             = NULL;
+  priv->reduce_buffer_host            = NULL;
+  priv->reduce_buffer_allocated_bytes = 0;
+
+  if (AllocateData(v))
+  {
+    SUNDIALS_DEBUG_PRINT("ERROR in N_VMakeWithManagedAllocator_Cuda: AllocateData returned nonzero\n");
+    N_VDestroy(v);
+    return NULL;
+  }
 
   return(v);
 }
@@ -194,10 +335,9 @@ N_Vector N_VMakeWithManagedAllocator_Cuda(sunindextype length,
 /* -----------------------------------------------------------------
  * Function to return the global length of the vector.
  */
-sunindextype N_VGetLength_Cuda(N_Vector v)
+sunindextype N_VGetLength_Cuda(N_Vector x)
 {
-  vector_type* xd = static_cast<vector_type*>(v->content);
-  return (xd->size());
+  return NVEC_CUDA_CONTENT(x)->length;  /* length field of the content struct; x is not NULL-checked */
 }
 
 /* ----------------------------------------------------------------------------
@@ -206,8 +346,7 @@ sunindextype N_VGetLength_Cuda(N_Vector v)
 
 realtype *N_VGetHostArrayPointer_Cuda(N_Vector x)
 {
-  vector_type* xv = static_cast<vector_type*>(x->content);
-  return (xv->host());
+  return NVEC_CUDA_CONTENT(x)->host_data;  /* host_data field as stored; no copy from the device is made here */
 }
 
 /* ----------------------------------------------------------------------------
@@ -216,8 +355,7 @@ realtype *N_VGetHostArrayPointer_Cuda(N_Vector x)
 
 realtype *N_VGetDeviceArrayPointer_Cuda(N_Vector x)
 {
-  vector_type* xv = static_cast<vector_type*>(x->content);
-  return (xv->device());
+  return NVEC_CUDA_CONTENT(x)->device_data;  /* device_data field as stored; x is not NULL-checked */
 }
 
 /* ----------------------------------------------------------------------------
@@ -225,19 +363,58 @@ realtype *N_VGetDeviceArrayPointer_Cuda(N_Vector x)
  */
 booleantype N_VIsManagedMemory_Cuda(N_Vector x)
 {
-  vector_type* xv = static_cast<vector_type*>(x->content);
-  return (xv->isManaged());
+  return NVEC_CUDA_PRIVATE(x)->use_managed_mem;
+}
+
+/* ----------------------------------------------------------------------------
+ * Replace the execution policies used for streaming and reduction kernels.
+ * Policies the vector currently owns are deleted first. The new policies are
+ * recorded as not owned (own_exec = SUNFALSE), so N_VDestroy_Cuda leaves them
+ * for the caller to delete. Returns 0 on success, -1 if any argument is NULL.
+ */
+int N_VSetKernelExecPolicy_Cuda(N_Vector x,
+                                SUNCudaExecPolicy* stream_exec_policy,
+                                SUNCudaExecPolicy* reduce_exec_policy)
+{
+  if (x == NULL || stream_exec_policy == NULL || reduce_exec_policy == NULL)
+    return -1;
+
+  if (NVEC_CUDA_PRIVATE(x)->own_exec)
+  {
+    delete NVEC_CUDA_CONTENT(x)->stream_exec_policy;
+    delete NVEC_CUDA_CONTENT(x)->reduce_exec_policy;
+  }
+
+  NVEC_CUDA_CONTENT(x)->stream_exec_policy = stream_exec_policy;
+  NVEC_CUDA_CONTENT(x)->reduce_exec_policy = reduce_exec_policy;
+  NVEC_CUDA_PRIVATE(x)->own_exec = SUNFALSE;
+
+  return 0;
 }
 
 /*
  * ----------------------------------------------------------------------------
+ * DEPRECATED: will be removed in SUNDIALS v6.
  * Sets the cudaStream_t to use for execution of the CUDA kernels.
+ * Does nothing when x or stream is NULL.
  */
 void N_VSetCudaStream_Cuda(N_Vector x, cudaStream_t *stream)
 {
-  vector_type* xv = static_cast<vector_type*>(x->content);
-  xv->partStream().setStream(*stream);
-  xv->partReduce().setStream(*stream);
+  /* guard before x and *stream are dereferenced below */
+  if (x == NULL || stream == NULL) return;
+
+  /* rebuild both policies with their current sizes, bound to the new stream */
+  const CudaExecPolicy* xs = NVEC_CUDA_CONTENT(x)->stream_exec_policy;
+  const CudaExecPolicy* xr = NVEC_CUDA_CONTENT(x)->reduce_exec_policy;
+  CudaThreadDirectExecPolicy* s =
+    new CudaThreadDirectExecPolicy(xs->blockSize(), *stream);
+  CudaBlockReduceExecPolicy* r =
+    new CudaBlockReduceExecPolicy(xr->blockSize(), xr->gridSize(), *stream);
+
+  /* the setter clears own_exec; these policies were allocated here, so the
+     vector takes ownership of them again */
+  N_VSetKernelExecPolicy_Cuda(x, s, r);
+  NVEC_CUDA_PRIVATE(x)->own_exec = SUNTRUE;
 }
 
 /* ----------------------------------------------------------------------------
@@ -246,8 +409,27 @@ void N_VSetCudaStream_Cuda(N_Vector x, cudaStream_t *stream)
 
 void N_VCopyToDevice_Cuda(N_Vector x)
 {
-  vector_type* xv = static_cast<vector_type*>(x->content);
-  xv->copyToDev();
+  cudaError_t err;
+
+  /* With managed memory the host and device pointers are the same, so no
+     copy is needed; only the stream synchronization below applies, which
+     keeps to the unified memory access rules. */
+  if (!NVEC_CUDA_PRIVATE(x)->use_managed_mem)
+  {
+    err = cudaMemcpyAsync(NVEC_CUDA_CONTENT(x)->device_data,
+                          NVEC_CUDA_CONTENT(x)->host_data,
+                          NVEC_CUDA_MEMSIZE(x),
+                          cudaMemcpyHostToDevice,
+                          NVEC_CUDA_STREAM(x));
+    SUNDIALS_CUDA_VERIFY(err);
+  }
+
+  /* Block the host until the vector's stream has drained. The async copy
+     may still be reading the host array when it returns (e.g. a pinned
+     array passed to N_VMake_Cuda), so the caller could not safely modify
+     it otherwise. */
+  err = cudaStreamSynchronize(NVEC_CUDA_STREAM(x));
+  SUNDIALS_CUDA_VERIFY(err);
 }
 
 /* ----------------------------------------------------------------------------
@@ -256,8 +437,27 @@ void N_VCopyToDevice_Cuda(N_Vector x)
 
 void N_VCopyFromDevice_Cuda(N_Vector x)
 {
-  vector_type* xv = static_cast<vector_type*>(x->content);
-  xv->copyFromDev();
+  cudaError_t err;
+
+  /* With managed memory the host and device pointers are the same, so no
+     copy is needed; only the stream synchronization below applies, which
+     keeps to the unified memory access rules. */
+  if (!NVEC_CUDA_PRIVATE(x)->use_managed_mem)
+  {
+    err = cudaMemcpyAsync(NVEC_CUDA_CONTENT(x)->host_data,
+                          NVEC_CUDA_CONTENT(x)->device_data,
+                          NVEC_CUDA_MEMSIZE(x),
+                          cudaMemcpyDeviceToHost,
+                          NVEC_CUDA_STREAM(x));
+    SUNDIALS_CUDA_VERIFY(err);
+  }
+
+  /* Block the host until the vector's stream has drained. Without this the
+     async copy may not have reached host_data when the call returns (e.g.
+     a pinned array passed to N_VMake_Cuda), and the caller would read
+     stale values. */
+  err = cudaStreamSynchronize(NVEC_CUDA_STREAM(x));
+  SUNDIALS_CUDA_VERIFY(err);
 }
 
 /* ----------------------------------------------------------------------------
@@ -276,15 +475,17 @@ void N_VPrint_Cuda(N_Vector x)
 void N_VPrintFile_Cuda(N_Vector x, FILE *outfile)
 {
   sunindextype i;
-  vector_type* xd = static_cast<vector_type*>(x->content);
+  /* prints the host array only; nothing is fetched from the device here */
+  const sunindextype n  = NVEC_CUDA_CONTENT(x)->length;
+  const realtype*    hx = NVEC_CUDA_CONTENT(x)->host_data;
 
-  for (i = 0; i < xd->size(); i++) {
+  for (i = 0; i < n; i++) {
 #if defined(SUNDIALS_EXTENDED_PRECISION)
-    fprintf(outfile, "%35.32Lg\n", xd->host()[i]);
+    fprintf(outfile, "%35.32Lg\n", hx[i]);
 #elif defined(SUNDIALS_DOUBLE_PRECISION)
-    fprintf(outfile, "%19.16g\n", xd->host()[i]);
+    fprintf(outfile, "%19.16g\n", hx[i]);
 #else
-    fprintf(outfile, "%11.8g\n", xd->host()[i]);
+    fprintf(outfile, "%11.8g\n", hx[i]);
 #endif
   }
   fprintf(outfile, "\n");
@@ -307,12 +505,31 @@ N_Vector N_VCloneEmpty_Cuda(N_Vector w)
 
   /* Create vector */
   v = NULL;
-  v = N_VNewEmpty();
+  v = N_VNewEmpty_Cuda();
   if (v == NULL) return(NULL);
 
   /* Attach operations */
   if (N_VCopyOps(w, v)) { N_VDestroy(v); return(NULL); }
 
+  /* Set content: take the length, memory kind and user allocator from w,
+     but attach no data arrays or reduction buffers. The execution policies
+     are not assigned here. */
+  N_VectorContent_Cuda        vc = NVEC_CUDA_CONTENT(v);
+  N_PrivateVectorContent_Cuda vp = NVEC_CUDA_PRIVATE(v);
+  N_PrivateVectorContent_Cuda wp = NVEC_CUDA_PRIVATE(w);
+
+  vc->length                        = NVEC_CUDA_CONTENT(w)->length;
+  vc->own_data                      = SUNFALSE;
+  vc->host_data                     = NULL;
+  vc->device_data                   = NULL;
+  vp->own_exec                      = SUNTRUE;
+  vp->use_managed_mem               = wp->use_managed_mem;
+  vp->userallocfn                   = wp->userallocfn;
+  vp->userfreefn                    = wp->userfreefn;
+  vp->reduce_buffer_dev             = NULL;
+  vp->reduce_buffer_host            = NULL;
+  vp->reduce_buffer_allocated_bytes = 0;
+
   return(v);
 }
 
@@ -323,10 +534,20 @@ N_Vector N_VClone_Cuda(N_Vector w)
   v = N_VCloneEmpty_Cuda(w);
   if (v == NULL) return(NULL);
 
-  vector_type* wdat = static_cast<vector_type*>(w->content);
-  vector_type* vdat = new vector_type(*wdat);
+  /* give the clone its own copies of w's execution policies */
+  N_VectorContent_Cuda vc = NVEC_CUDA_CONTENT(v);
+  N_VectorContent_Cuda wc = NVEC_CUDA_CONTENT(w);
+  vc->stream_exec_policy = wc->stream_exec_policy->clone();
+  vc->reduce_exec_policy = wc->reduce_exec_policy->clone();
 
-  v->content = vdat;
+  /* the clone owns its data arrays; discard it if they cannot be allocated */
+  vc->own_data = SUNTRUE;
+  if (AllocateData(v))
+  {
+    SUNDIALS_DEBUG_PRINT("ERROR in N_VClone_Cuda: AllocateData returned nonzero\n");
+    N_VDestroy(v);
+    return NULL;
+  }
 
   return(v);
 }
@@ -336,130 +553,351 @@ void N_VDestroy_Cuda(N_Vector v)
 {
   if (v == NULL) return;
 
-  vector_type* x = static_cast<vector_type*>(v->content);
-  if (x != NULL) {
-    delete x;
-    v->content = NULL;
+  N_VectorContent_Cuda vc = NVEC_CUDA_CONTENT(v);
+  if (vc == NULL)
+  {
+    /* no content attached: release the ops table as well, otherwise it leaks */
+    if (v->ops != NULL) free(v->ops);
+    free(v);
+    return;
+  }
+
+  N_PrivateVectorContent_Cuda vcp = NVEC_CUDA_PRIVATE(v);
+
+  /* free items in content */
+  if (vc->own_data)
+  {
+    if (vcp != NULL && vcp->userfreefn)
+    {
+      /* a user free function is set: hand the device pointer back to it */
+      vcp->userfreefn(vc->device_data);
+      vc->device_data = NULL;
+      vc->host_data = NULL;
+    }
+    else
+    {
+      /* host_data is freed separately only when managed memory is not in use */
+      if (vcp != NULL && !vcp->use_managed_mem) free(vc->host_data);
+      SUNDIALS_CUDA_VERIFY(cudaFree(vc->device_data));
+      vc->device_data = NULL;
+      vc->host_data = NULL;
+    }
+  }
+
+  /* free execution policies */
+  if (vcp != NULL && vcp->own_exec)
+  {
+    delete vc->stream_exec_policy;
+    vc->stream_exec_policy = NULL;
+    delete vc->reduce_exec_policy;
+    vc->reduce_exec_policy = NULL;
   }
 
-  /* free ops and vector */
-  if (v->ops != NULL) { free(v->ops); v->ops = NULL; }
-  free(v); v = NULL;
+  /* free reduction buffer */
+  FreeReductionBuffer(v);
+
+  /* free private content struct */
+  if (vcp) free(vcp);
+  vc->priv = NULL;
+
+  /* free content struct */
+  free(vc);
+  v->content = NULL;
+
+  /* free ops */
+  if (v->ops != NULL)
+  {
+    free(v->ops);
+    v->ops = NULL;
+  }
+
+  /* free vector */
+  free(v);
+  v = NULL;
 
   return;
 }
 
 void N_VSpace_Cuda(N_Vector X, sunindextype *lrw, sunindextype *liw)
 {
-  vector_type* x = static_cast<vector_type*>(X->content);
-  *lrw = x->size();
+  *lrw = NVEC_CUDA_CONTENT(X)->length;
   *liw = 2;
 }
 
 void N_VConst_Cuda(realtype a, N_Vector X)
 {
-  vector_type *xvec = static_cast<vector_type*>(X->content);
-  setConst(a, *xvec);
+  size_t grid, block, shMemSize;
+  cudaStream_t stream;
+
+  GetKernelParameters(X, false, grid, block, shMemSize, stream);
+  setConstKernel<<<grid, block, shMemSize, stream>>>
+  (
+    a,
+    NVEC_CUDA_CONTENT(X)->device_data,
+    NVEC_CUDA_CONTENT(X)->length
+  );
+  PostKernelLaunch();
 }
 
 void N_VLinearSum_Cuda(realtype a, N_Vector X, realtype b, N_Vector Y, N_Vector Z)
 {
-  const vector_type *xvec = static_cast<vector_type*>(X->content);
-  const vector_type *yvec = static_cast<vector_type*>(Y->content);
-  vector_type *zvec = static_cast<vector_type*>(Z->content);
-  linearSum(a, *xvec, b, *yvec, *zvec);
+  size_t grid, block, shMemSize;
+  cudaStream_t stream;
+
+  GetKernelParameters(X, false, grid, block, shMemSize, stream);
+  linearSumKernel<<<grid, block, shMemSize, stream>>>
+  (
+    a,
+    NVEC_CUDA_CONTENT(X)->device_data,
+    b,
+    NVEC_CUDA_CONTENT(Y)->device_data,
+    NVEC_CUDA_CONTENT(Z)->device_data,
+    NVEC_CUDA_CONTENT(X)->length
+  );
+  PostKernelLaunch();
 }
 
 void N_VProd_Cuda(N_Vector X, N_Vector Y, N_Vector Z)
 {
-  const vector_type *xvec = static_cast<vector_type*>(X->content);
-  const vector_type *yvec = static_cast<vector_type*>(Y->content);
-  vector_type *zvec = static_cast<vector_type*>(Z->content);
-  prod(*xvec, *yvec, *zvec);
+  size_t grid, block, shMemSize;
+  cudaStream_t stream;
+
+  GetKernelParameters(X, false, grid, block, shMemSize, stream);
+  prodKernel<<<grid, block, shMemSize, stream>>>
+  (
+    NVEC_CUDA_CONTENT(X)->device_data,
+    NVEC_CUDA_CONTENT(Y)->device_data,
+    NVEC_CUDA_CONTENT(Z)->device_data,
+    NVEC_CUDA_CONTENT(X)->length
+  );
+  PostKernelLaunch();
 }
 
 void N_VDiv_Cuda(N_Vector X, N_Vector Y, N_Vector Z)
 {
-  const vector_type *xvec = static_cast<vector_type*>(X->content);
-  const vector_type *yvec = static_cast<vector_type*>(Y->content);
-  vector_type *zvec = static_cast<vector_type*>(Z->content);
-  div(*xvec, *yvec, *zvec);
+  size_t grid, block, shMemSize;
+  cudaStream_t stream;
+
+  GetKernelParameters(X, false, grid, block, shMemSize, stream);
+  divKernel<<<grid, block, shMemSize, stream>>>
+  (
+    NVEC_CUDA_CONTENT(X)->device_data,
+    NVEC_CUDA_CONTENT(Y)->device_data,
+    NVEC_CUDA_CONTENT(Z)->device_data,
+    NVEC_CUDA_CONTENT(X)->length
+  );
+  PostKernelLaunch();
 }
 
 void N_VScale_Cuda(realtype a, N_Vector X, N_Vector Z)
 {
-  const vector_type *xvec = static_cast<vector_type*>(X->content);
-  vector_type *zvec = static_cast<vector_type*>(Z->content);
-  scale(a, *xvec, *zvec);
+  size_t grid, block, shMemSize;
+  cudaStream_t stream;
+
+  GetKernelParameters(X, false, grid, block, shMemSize, stream);
+  scaleKernel<<<grid, block, shMemSize, stream>>>
+  (
+    a,
+    NVEC_CUDA_CONTENT(X)->device_data,
+    NVEC_CUDA_CONTENT(Z)->device_data,
+    NVEC_CUDA_CONTENT(X)->length
+  );
+  PostKernelLaunch();
 }
 
 void N_VAbs_Cuda(N_Vector X, N_Vector Z)
 {
-  const vector_type *xvec = static_cast<vector_type*>(X->content);
-  vector_type *zvec = static_cast<vector_type*>(Z->content);
-  absVal(*xvec, *zvec);
+  size_t grid, block, shMemSize;
+  cudaStream_t stream;
+
+  GetKernelParameters(X, false, grid, block, shMemSize, stream);
+  absKernel<<<grid, block, shMemSize, stream>>>
+  (
+    NVEC_CUDA_CONTENT(X)->device_data,
+    NVEC_CUDA_CONTENT(Z)->device_data,
+    NVEC_CUDA_CONTENT(X)->length
+  );
+  PostKernelLaunch();
 }
 
 void N_VInv_Cuda(N_Vector X, N_Vector Z)
 {
-  const vector_type *xvec = static_cast<vector_type*>(X->content);
-  vector_type *zvec = static_cast<vector_type*>(Z->content);
-  inv(*xvec, *zvec);
+  size_t grid, block, shMemSize;
+  cudaStream_t stream;
+
+  GetKernelParameters(X, false, grid, block, shMemSize, stream);
+  invKernel<<<grid, block, shMemSize, stream>>>
+  (
+    NVEC_CUDA_CONTENT(X)->device_data,
+    NVEC_CUDA_CONTENT(Z)->device_data,
+    NVEC_CUDA_CONTENT(X)->length
+  );
+  PostKernelLaunch();
 }
 
 void N_VAddConst_Cuda(N_Vector X, realtype b, N_Vector Z)
 {
-  const vector_type *xvec = static_cast<vector_type*>(X->content);
-  vector_type *zvec = static_cast<vector_type*>(Z->content);
-  addConst(b, *xvec, *zvec);
+  size_t grid, block, shMemSize;
+  cudaStream_t stream;
+
+  GetKernelParameters(X, false, grid, block, shMemSize, stream);
+  addConstKernel<<<grid, block, shMemSize, stream>>>
+  (
+    b,
+    NVEC_CUDA_CONTENT(X)->device_data,
+    NVEC_CUDA_CONTENT(Z)->device_data,
+    NVEC_CUDA_CONTENT(X)->length
+  );
+  PostKernelLaunch();
 }
 
 realtype N_VDotProd_Cuda(N_Vector X, N_Vector Y)
 {
-  const vector_type *xvec = static_cast<vector_type*>(X->content);
-  const vector_type *yvec = static_cast<vector_type*>(Y->content);
-  return(dotProd(*xvec, *yvec));
+  size_t grid, block, shMemSize;
+  cudaStream_t stream;
+
+  if (InitializeReductionBuffer(X, ZERO)) 
+  {
+    SUNDIALS_DEBUG_PRINT("ERROR in N_VDotProd_Cuda: InitializeReductionBuffer returned nonzero\n");
+  }
+
+  GetKernelParameters(X, true, grid, block, shMemSize, stream);
+  dotProdKernel<realtype, sunindextype><<<grid, block, shMemSize, stream>>>
+  (
+    NVEC_CUDA_CONTENT(X)->device_data,
+    NVEC_CUDA_CONTENT(Y)->device_data,
+    NVEC_CUDA_PRIVATE(X)->reduce_buffer_dev,
+    NVEC_CUDA_CONTENT(X)->length
+  );
+  PostKernelLaunch();
+
+  // Get result from the GPU
+  CopyReductionBufferFromDevice(X);
+  realtype gpu_result = NVEC_CUDA_PRIVATE(X)->reduce_buffer_host[0];
+
+  return gpu_result;
 }
 
 realtype N_VMaxNorm_Cuda(N_Vector X)
 {
-  const vector_type *xvec = static_cast<vector_type*>(X->content);
-  return(maxNorm(*xvec));
+  size_t grid, block, shMemSize;
+  cudaStream_t stream;
+
+  if (InitializeReductionBuffer(X, ZERO)) 
+  {
+    SUNDIALS_DEBUG_PRINT("ERROR in N_VMaxNorm_Cuda: InitializeReductionBuffer returned nonzero\n");
+  }  // NOTE(review): a failure is only logged; execution continues to the launch below
+
+  GetKernelParameters(X, true, grid, block, shMemSize, stream);  // `true` here vs `false` in the elementwise ops; presumably selects the reduce policy -- TODO confirm
+  maxNormKernel<realtype, sunindextype><<<grid, block, shMemSize, stream>>>
+  (
+    NVEC_CUDA_CONTENT(X)->device_data,
+    NVEC_CUDA_PRIVATE(X)->reduce_buffer_dev,
+    NVEC_CUDA_CONTENT(X)->length
+  );
+  PostKernelLaunch();
+
+  // Get result from the GPU: element 0 of the host-side reduction buffer
+  CopyReductionBufferFromDevice(X);
+  realtype gpu_result = NVEC_CUDA_PRIVATE(X)->reduce_buffer_host[0];
+
+  return gpu_result;
 }
 
 realtype N_VWSqrSumLocal_Cuda(N_Vector X, N_Vector W)
 {
-  const vector_type *xvec = static_cast<vector_type*>(X->content);
-  const vector_type *wvec = static_cast<vector_type*>(W->content);
-  return(wL2NormSquare(*xvec, *wvec));
+  size_t grid, block, shMemSize;
+  cudaStream_t stream;
+
+  if (InitializeReductionBuffer(X, ZERO)) 
+  {
+    SUNDIALS_DEBUG_PRINT("ERROR in N_VWSqrSumLocal_Cuda: InitializeReductionBuffer returned nonzero\n");
+  }
+
+  GetKernelParameters(X, true, grid, block, shMemSize, stream);
+  wL2NormSquareKernel<realtype, sunindextype><<<grid, block, shMemSize, stream>>>
+  (
+    NVEC_CUDA_CONTENT(X)->device_data,
+    NVEC_CUDA_CONTENT(W)->device_data,
+    NVEC_CUDA_PRIVATE(X)->reduce_buffer_dev,
+    NVEC_CUDA_CONTENT(X)->length
+  );
+  PostKernelLaunch();
+
+  // Get result from the GPU
+  CopyReductionBufferFromDevice(X);
+  realtype gpu_result = NVEC_CUDA_PRIVATE(X)->reduce_buffer_host[0];
+
+  return gpu_result;
 }
 
 realtype N_VWrmsNorm_Cuda(N_Vector X, N_Vector W)
 {
   const realtype sum = N_VWSqrSumLocal_Cuda(X, W);
-  const vector_type *xvec = static_cast<vector_type*>(X->content);
-  return std::sqrt(sum/xvec->size());
+  return std::sqrt(sum/NVEC_CUDA_CONTENT(X)->length);
 }
 
 realtype N_VWSqrSumMaskLocal_Cuda(N_Vector X, N_Vector W, N_Vector Id)
 {
-  const vector_type *xvec = static_cast<vector_type*>(X->content);
-  const vector_type *wvec = static_cast<vector_type*>(W->content);
-  const vector_type *ivec = static_cast<vector_type*>(Id->content);
-  return(wL2NormSquareMask(*xvec, *wvec, *ivec));
+  size_t grid, block, shMemSize;
+  cudaStream_t stream;
+
+  if (InitializeReductionBuffer(X, ZERO)) 
+  {
+    SUNDIALS_DEBUG_PRINT("ERROR in N_VWSqrSumMaskLocal_Cuda: InitializeReductionBuffer returned nonzero\n");
+  }
+
+  GetKernelParameters(X, true, grid, block, shMemSize, stream);
+  wL2NormSquareMaskKernel<realtype, sunindextype><<<grid, block, shMemSize, stream>>>
+  (
+    NVEC_CUDA_CONTENT(X)->device_data,
+    NVEC_CUDA_CONTENT(W)->device_data,
+    NVEC_CUDA_CONTENT(Id)->device_data,
+    NVEC_CUDA_PRIVATE(X)->reduce_buffer_dev,
+    NVEC_CUDA_CONTENT(X)->length
+  );
+  PostKernelLaunch();
+
+  // Get result from the GPU
+  CopyReductionBufferFromDevice(X);
+  realtype gpu_result = NVEC_CUDA_PRIVATE(X)->reduce_buffer_host[0];
+
+  return gpu_result;
 }
 
 realtype N_VWrmsNormMask_Cuda(N_Vector X, N_Vector W, N_Vector Id)
 {
   const realtype sum = N_VWSqrSumMaskLocal_Cuda(X, W, Id);
-  const vector_type *xvec = static_cast<vector_type*>(X->content);
-  return std::sqrt(sum/xvec->size());
+  return std::sqrt(sum/NVEC_CUDA_CONTENT(X)->length);
 }
 
 realtype N_VMin_Cuda(N_Vector X)
 {
-  const vector_type *xvec = static_cast<vector_type*>(X->content);
-  return(findMin(*xvec));
+  const realtype maxVal = std::numeric_limits<realtype>::max();
+
+  size_t grid, block, shMemSize;
+  cudaStream_t stream;
+
+  if (InitializeReductionBuffer(X, maxVal)) 
+  {
+    SUNDIALS_DEBUG_PRINT("ERROR in N_VMin_Cuda: InitializeReductionBuffer returned nonzero\n");
+  }
+
+  GetKernelParameters(X, true, grid, block, shMemSize, stream);
+  findMinKernel<realtype, sunindextype><<<grid, block, shMemSize, stream>>>
+  (
+    maxVal,
+    NVEC_CUDA_CONTENT(X)->device_data,
+    NVEC_CUDA_PRIVATE(X)->reduce_buffer_dev,
+    NVEC_CUDA_CONTENT(X)->length
+  );
+  PostKernelLaunch();
+
+  // Get result from the GPU
+  CopyReductionBufferFromDevice(X);
+  realtype gpu_result = NVEC_CUDA_PRIVATE(X)->reduce_buffer_host[0];
+
+  return gpu_result;
 }
 
 realtype N_VWL2Norm_Cuda(N_Vector X, N_Vector W)
@@ -470,39 +905,129 @@ realtype N_VWL2Norm_Cuda(N_Vector X, N_Vector W)
 
 realtype N_VL1Norm_Cuda(N_Vector X)
 {
-  const vector_type *xvec = static_cast<vector_type*>(X->content);
-  return(L1Norm(*xvec));
+  size_t grid, block, shMemSize;
+  cudaStream_t stream;
+
+  if (InitializeReductionBuffer(X, ZERO)) 
+  {
+    SUNDIALS_DEBUG_PRINT("ERROR in N_VL1Norm_Cuda: InitializeReductionBuffer returned nonzero\n");
+  }
+
+  GetKernelParameters(X, true, grid, block, shMemSize, stream);
+  L1NormKernel<realtype, sunindextype><<<grid, block, shMemSize, stream>>>
+  (
+    NVEC_CUDA_CONTENT(X)->device_data,
+    NVEC_CUDA_PRIVATE(X)->reduce_buffer_dev,
+    NVEC_CUDA_CONTENT(X)->length
+  );
+  PostKernelLaunch();
+
+  // Get result from the GPU
+  CopyReductionBufferFromDevice(X);
+  realtype gpu_result = NVEC_CUDA_PRIVATE(X)->reduce_buffer_host[0];
+
+  return gpu_result;
 }
 
 void N_VCompare_Cuda(realtype c, N_Vector X, N_Vector Z)
 {
-  const vector_type *xvec = static_cast<vector_type*>(X->content);
-  vector_type *zvec = static_cast<vector_type*>(Z->content);
-  compare(c, *xvec, *zvec);
+  size_t grid, block, shMemSize;
+  cudaStream_t stream;
+
+  GetKernelParameters(X, false, grid, block, shMemSize, stream);
+  compareKernel<<<grid, block, shMemSize, stream>>>
+  (
+    c,
+    NVEC_CUDA_CONTENT(X)->device_data,
+    NVEC_CUDA_CONTENT(Z)->device_data,
+    NVEC_CUDA_CONTENT(X)->length
+  );
+  PostKernelLaunch();
 }
 
 booleantype N_VInvTest_Cuda(N_Vector X, N_Vector Z)
 {
-  const vector_type *xvec = static_cast<vector_type*>(X->content);
-  vector_type *zvec = static_cast<vector_type*>(Z->content);
-  const realtype locmin = invTest(*xvec, *zvec);
-  return (locmin < HALF);
+  size_t grid, block, shMemSize;
+  cudaStream_t stream;
+
+  if (InitializeReductionBuffer(X, ZERO)) 
+  {
+    SUNDIALS_DEBUG_PRINT("ERROR in N_VInvTest_Cuda: InitializeReductionBuffer returned nonzero\n");
+  }
+
+  GetKernelParameters(X, true, grid, block, shMemSize, stream);
+  invTestKernel<realtype, sunindextype><<<grid, block, shMemSize, stream>>>
+  (
+    NVEC_CUDA_CONTENT(X)->device_data,
+    NVEC_CUDA_CONTENT(Z)->device_data,
+    NVEC_CUDA_PRIVATE(X)->reduce_buffer_dev,
+    NVEC_CUDA_CONTENT(X)->length
+  );
+  PostKernelLaunch();
+
+  // Get result from the GPU
+  CopyReductionBufferFromDevice(X);
+  realtype gpu_result = NVEC_CUDA_PRIVATE(X)->reduce_buffer_host[0];
+
+  return (gpu_result < HALF);
 }
 
 booleantype N_VConstrMask_Cuda(N_Vector C, N_Vector X, N_Vector M)
 {
-  const vector_type *cvec = static_cast<vector_type*>(C->content);
-  const vector_type *xvec = static_cast<vector_type*>(X->content);
-  vector_type *mvec = static_cast<vector_type*>(M->content);
-  const realtype locsum = constrMask(*cvec, *xvec, *mvec);
-  return (locsum < HALF);
+  size_t grid, block, shMemSize;
+  cudaStream_t stream;
+
+  if (InitializeReductionBuffer(X, ZERO)) 
+  {
+    SUNDIALS_DEBUG_PRINT("ERROR in N_VConstrMask_Cuda: InitializeReductionBuffer returned nonzero\n");
+  }
+
+  GetKernelParameters(X, true, grid, block, shMemSize, stream);
+  constrMaskKernel<realtype, sunindextype><<<grid, block, shMemSize, stream>>>
+  (
+    NVEC_CUDA_CONTENT(C)->device_data,
+    NVEC_CUDA_CONTENT(X)->device_data,
+    NVEC_CUDA_CONTENT(M)->device_data,
+    NVEC_CUDA_PRIVATE(X)->reduce_buffer_dev,
+    NVEC_CUDA_CONTENT(X)->length
+  );
+  PostKernelLaunch();
+
+  // Get result from the GPU
+  CopyReductionBufferFromDevice(X);
+  realtype gpu_result = NVEC_CUDA_PRIVATE(X)->reduce_buffer_host[0];
+
+  return (gpu_result < HALF);
 }
 
 realtype N_VMinQuotient_Cuda(N_Vector num, N_Vector denom)
 {
-  const vector_type *numvec = static_cast<vector_type*>(num->content);
-  const vector_type *denvec = static_cast<vector_type*>(denom->content);
-  return(minQuotient(*numvec, *denvec));
+  // Starting value for min reduction
+  const realtype maxVal = std::numeric_limits<realtype>::max();
+  size_t grid, block, shMemSize;
+  cudaStream_t stream;
+
+  if (InitializeReductionBuffer(num, maxVal)) 
+  {
+    SUNDIALS_DEBUG_PRINT("ERROR in N_VMinQuotient_Cuda: InitializeReductionBuffer returned nonzero\n");
+  }
+
+  GetKernelParameters(num, true, grid, block, shMemSize, stream);
+  minQuotientKernel<realtype, sunindextype><<<grid, block, shMemSize, stream>>>
+  (
+    maxVal,
+    NVEC_CUDA_CONTENT(num)->device_data,
+    NVEC_CUDA_CONTENT(denom)->device_data,
+    NVEC_CUDA_PRIVATE(num)->reduce_buffer_dev,
+    NVEC_CUDA_CONTENT(num)->length
+  );
+  PostKernelLaunch();
+
+  // Get result from the GPU
+  CopyReductionBufferFromDevice(num);
+  realtype gpu_result = NVEC_CUDA_PRIVATE(num)->reduce_buffer_host[0];
+
+  return gpu_result;
 }
 
 /*
@@ -514,66 +1039,199 @@ realtype N_VMinQuotient_Cuda(N_Vector num, N_Vector denom)
 int N_VLinearCombination_Cuda(int nvec, realtype* c, N_Vector* X, N_Vector Z)
 {
   cudaError_t err;
-  vector_type** Xv;
-  vector_type*  Zv;
-
-  Zv = static_cast<vector_type*>(Z->content);
-
-  Xv = new vector_type*[nvec];
-  for (int i=0; i<nvec; i++)
-    Xv[i] = static_cast<vector_type*>(X[i]->content);
-
-  err = linearCombination(nvec, c, Xv, Zv);
-
-  delete[] Xv;
-
-  return err == cudaSuccess ? 0 : -1;
+  int        retval = -1;    // becomes 0 only after the kernel has been launched
+  realtype*  d_c    = NULL;  // device copy of c
+  realtype** d_Xd   = NULL;  // device copy of h_Xd
+
+  // Create array of device pointers on host
+  realtype** h_Xd = new realtype*[nvec];
+  for (int i=0; i<nvec; i++)
+    h_Xd[i] = NVEC_CUDA_CONTENT(X[i])->device_data;
+
+  // One-pass block: any failure breaks out to the cleanup shared by all paths
+  do
+  {
+    // Copy c array to device
+    err = cudaMalloc((void**) &d_c, nvec*sizeof(realtype));
+    if (!SUNDIALS_CUDA_VERIFY(err)) { d_c = NULL; break; }
+    err = cudaMemcpy(d_c, c, nvec*sizeof(realtype), cudaMemcpyHostToDevice);
+    if (!SUNDIALS_CUDA_VERIFY(err)) break;
+
+    // Copy array of device pointers to device from host
+    err = cudaMalloc((void**) &d_Xd, nvec*sizeof(realtype*));
+    if (!SUNDIALS_CUDA_VERIFY(err)) { d_Xd = NULL; break; }
+    err = cudaMemcpy(d_Xd, h_Xd, nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
+    if (!SUNDIALS_CUDA_VERIFY(err)) break;
+
+    // Set kernel parameters and launch
+    size_t grid, block, shMemSize;
+    cudaStream_t stream;
+
+    if (GetKernelParameters(X[0], false, grid, block, shMemSize, stream)) break;
+    linearCombinationKernel<<<grid, block, shMemSize, stream>>>
+    (
+      nvec,
+      d_c,
+      d_Xd,
+      NVEC_CUDA_CONTENT(Z)->device_data,
+      NVEC_CUDA_CONTENT(Z)->length
+    );
+    PostKernelLaunch();
+
+    retval = 0;
+  } while (0);
+
+  // Free host array
+  delete[] h_Xd;
+
+  // Free device arrays on every path; cudaFree(NULL) performs no operation
+  err = cudaFree(d_c);
+  if (!SUNDIALS_CUDA_VERIFY(err)) retval = -1;
+  err = cudaFree(d_Xd);
+  if (!SUNDIALS_CUDA_VERIFY(err)) retval = -1;
+
+  return retval;
 }
 
 int N_VScaleAddMulti_Cuda(int nvec, realtype* c, N_Vector X, N_Vector* Y,
-                           N_Vector* Z)
+                          N_Vector* Z)
 {
   cudaError_t err;
-  vector_type*  Xv;
-  vector_type** Yv;
-  vector_type** Zv;
-
-  Xv = static_cast<vector_type*>(X->content);
-
-  Yv = new vector_type*[nvec];
-  for (int i=0; i<nvec; i++)
-    Yv[i] = static_cast<vector_type*>(Y[i]->content);
-
-  Zv = new vector_type*[nvec];
-  for (int i=0; i<nvec; i++)
-    Zv[i] = static_cast<vector_type*>(Z[i]->content);
-
-  err = scaleAddMulti(nvec, c, Xv, Yv, Zv);
-
-  delete[] Yv;
-  delete[] Zv;
-
-  return err == cudaSuccess ? 0 : -1;
+  int        retval = -1;    // becomes 0 only after the kernel has been launched
+  realtype*  d_c    = NULL;  // device copy of c
+  realtype** d_Yd   = NULL;  // device copy of h_Yd
+  realtype** d_Zd   = NULL;  // device copy of h_Zd
+
+  // Create arrays of device pointers on host
+  realtype** h_Yd = new realtype*[nvec];
+  for (int i=0; i<nvec; i++)
+    h_Yd[i] = NVEC_CUDA_CONTENT(Y[i])->device_data;
+
+  realtype** h_Zd = new realtype*[nvec];
+  for (int i=0; i<nvec; i++)
+    h_Zd[i] = NVEC_CUDA_CONTENT(Z[i])->device_data;
+
+  // One-pass block: any failure breaks out to the cleanup shared by all paths
+  do
+  {
+    // Copy c array to device
+    err = cudaMalloc((void**) &d_c, nvec*sizeof(realtype));
+    if (!SUNDIALS_CUDA_VERIFY(err)) { d_c = NULL; break; }
+    err = cudaMemcpy(d_c, c, nvec*sizeof(realtype), cudaMemcpyHostToDevice);
+    if (!SUNDIALS_CUDA_VERIFY(err)) break;
+
+    // Copy arrays of device pointers to device from host
+    err = cudaMalloc((void**) &d_Yd, nvec*sizeof(realtype*));
+    if (!SUNDIALS_CUDA_VERIFY(err)) { d_Yd = NULL; break; }
+    err = cudaMemcpy(d_Yd, h_Yd, nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
+    if (!SUNDIALS_CUDA_VERIFY(err)) break;
+
+    err = cudaMalloc((void**) &d_Zd, nvec*sizeof(realtype*));
+    if (!SUNDIALS_CUDA_VERIFY(err)) { d_Zd = NULL; break; }
+    err = cudaMemcpy(d_Zd, h_Zd, nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
+    if (!SUNDIALS_CUDA_VERIFY(err)) break;
+
+    // Set kernel parameters and launch
+    size_t grid, block, shMemSize;
+    cudaStream_t stream;
+
+    if (GetKernelParameters(X, false, grid, block, shMemSize, stream)) break;
+    scaleAddMultiKernel<<<grid, block, shMemSize, stream>>>
+    (
+      nvec,
+      d_c,
+      NVEC_CUDA_CONTENT(X)->device_data,
+      d_Yd,
+      d_Zd,
+      NVEC_CUDA_CONTENT(X)->length
+    );
+    PostKernelLaunch();
+
+    retval = 0;
+  } while (0);
+
+  // Free host arrays
+  delete[] h_Yd;
+  delete[] h_Zd;
+
+  // Free device arrays on every path; cudaFree(NULL) performs no operation
+  err = cudaFree(d_c);
+  if (!SUNDIALS_CUDA_VERIFY(err)) retval = -1;
+  err = cudaFree(d_Yd);
+  if (!SUNDIALS_CUDA_VERIFY(err)) retval = -1;
+  err = cudaFree(d_Zd);
+  if (!SUNDIALS_CUDA_VERIFY(err)) retval = -1;
+
+  return retval;
 }
 
-
-int N_VDotProdMulti_Cuda(int nvec, N_Vector x, N_Vector* Y, realtype* dotprods)
+int N_VDotProdMulti_Cuda(int nvec, N_Vector X, N_Vector* Y, realtype* dots)
 {
   cudaError_t err;
-  vector_type*  Xv;
-  vector_type** Yv;
-
-  Xv = static_cast<vector_type*>(x->content);
-
-  Yv = new vector_type*[nvec];
-  for (int i=0; i<nvec; i++)
-    Yv[i] = static_cast<vector_type*>(Y[i]->content);
-
-  err = dotProdMulti(nvec, Xv, Yv, dotprods);
-
-  delete[] Yv;
-
-  return err == cudaSuccess ? 0 : -1;
+  int        retval = -1;    // becomes 0 only after the results have been copied back
+  realtype** d_Yd   = NULL;  // device copy of h_Yd
+  realtype*  d_buff = NULL;  // device buffer of nvec results, copied into dots
+
+  // Create array of device pointers on host
+  realtype** h_Yd = new realtype*[nvec];
+  for (int i=0; i<nvec; i++)
+    h_Yd[i] = NVEC_CUDA_CONTENT(Y[i])->device_data;
+
+  // One-pass block: any failure breaks out to the cleanup shared by all paths
+  do
+  {
+    // Copy array of device pointers to device from host
+    err = cudaMalloc((void**) &d_Yd, nvec*sizeof(realtype*));
+    if (!SUNDIALS_CUDA_VERIFY(err)) { d_Yd = NULL; break; }
+    err = cudaMemcpy(d_Yd, h_Yd, nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
+    if (!SUNDIALS_CUDA_VERIFY(err)) break;
+
+    // Set kernel parameters
+    // NOTE(review): `false` is passed here while the other reductions pass
+    // `true`; confirm block/shMemSize suit dotProdMultiKernel.
+    size_t grid, block, shMemSize;
+    cudaStream_t stream;
+
+    if (GetKernelParameters(X, false, grid, block, shMemSize, stream)) break;
+    grid = nvec;
+
+    // Allocate reduction buffer on device and zero it on the kernel's stream,
+    // so the memset is ordered before the launch even for non-blocking streams
+    err = cudaMalloc((void**) &d_buff, grid*sizeof(realtype));
+    if (!SUNDIALS_CUDA_VERIFY(err)) { d_buff = NULL; break; }
+    err = cudaMemsetAsync(d_buff, 0, grid*sizeof(realtype), stream);
+    if (!SUNDIALS_CUDA_VERIFY(err)) break;
+
+    dotProdMultiKernel<realtype, sunindextype><<<grid, block, shMemSize, stream>>>
+    (
+      nvec,
+      NVEC_CUDA_CONTENT(X)->device_data,
+      d_Yd,
+      d_buff,
+      NVEC_CUDA_CONTENT(X)->length
+    );
+    PostKernelLaunch();
+
+    // Copy GPU result to the cpu in stream order and wait for it to arrive
+    err = cudaMemcpyAsync(dots, d_buff, grid*sizeof(realtype),
+                          cudaMemcpyDeviceToHost, stream);
+    if (!SUNDIALS_CUDA_VERIFY(err)) break;
+    err = cudaStreamSynchronize(stream);
+    if (!SUNDIALS_CUDA_VERIFY(err)) break;
+
+    retval = 0;
+  } while (0);
+
+  // Free host array
+  delete[] h_Yd;
+
+  // Free device arrays on every path; cudaFree(NULL) performs no operation
+  err = cudaFree(d_Yd);
+  if (!SUNDIALS_CUDA_VERIFY(err)) retval = -1;
+  err = cudaFree(d_buff);
+  if (!SUNDIALS_CUDA_VERIFY(err)) retval = -1;
+
+  return retval;
 }
 
 
@@ -588,134 +1219,321 @@ int N_VLinearSumVectorArray_Cuda(int nvec, realtype a, N_Vector* X, realtype b,
                                  N_Vector* Y, N_Vector* Z)
 {
+  // Launches linearSumVectorArrayKernel on the device data of the nvec vectors in X, Y and Z.
+  // Returns 0 on success, -1 on failure; fail() frees all temporaries (freeing a null pointer is a no-op).
   cudaError_t err;
-  vector_type** Xv;
-  vector_type** Yv;
-  vector_type** Zv;
+  realtype **h_Xd = nullptr, **h_Yd = nullptr, **h_Zd = nullptr;
+  realtype **d_Xd = nullptr, **d_Yd = nullptr, **d_Zd = nullptr;
+  auto fail = [&]() -> int { delete[] h_Xd; delete[] h_Yd; delete[] h_Zd; cudaFree(d_Xd); cudaFree(d_Yd); cudaFree(d_Zd); return -1; };
 
-  Xv = new vector_type*[nvec];
+  // Create array of device pointers on host
+  h_Xd = new realtype*[nvec];
   for (int i=0; i<nvec; i++)
-    Xv[i] = static_cast<vector_type*>(X[i]->content);
+    h_Xd[i] = NVEC_CUDA_CONTENT(X[i])->device_data;
 
-  Yv = new vector_type*[nvec];
+  h_Yd = new realtype*[nvec];
   for (int i=0; i<nvec; i++)
-    Yv[i] = static_cast<vector_type*>(Y[i]->content);
+    h_Yd[i] = NVEC_CUDA_CONTENT(Y[i])->device_data;
 
-  Zv = new vector_type*[nvec];
+  h_Zd = new realtype*[nvec];
   for (int i=0; i<nvec; i++)
-    Zv[i] = static_cast<vector_type*>(Z[i]->content);
+    h_Zd[i] = NVEC_CUDA_CONTENT(Z[i])->device_data;
+
+  // Copy array of device pointers to device from host
+  err = cudaMalloc((void**) &d_Xd, nvec*sizeof(realtype*));
+  if (!SUNDIALS_CUDA_VERIFY(err)) return fail();
+  err = cudaMemcpy(d_Xd, h_Xd, nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
+  if (!SUNDIALS_CUDA_VERIFY(err)) return fail();
+
+  err = cudaMalloc((void**) &d_Yd, nvec*sizeof(realtype*));
+  if (!SUNDIALS_CUDA_VERIFY(err)) return fail();
+  err = cudaMemcpy(d_Yd, h_Yd, nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
+  if (!SUNDIALS_CUDA_VERIFY(err)) return fail();
+
+  err = cudaMalloc((void**) &d_Zd, nvec*sizeof(realtype*));
+  if (!SUNDIALS_CUDA_VERIFY(err)) return fail();
+  err = cudaMemcpy(d_Zd, h_Zd, nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
+  if (!SUNDIALS_CUDA_VERIFY(err)) return fail();
+
+  // Set kernel parameters
+  size_t grid, block, shMemSize;
+  cudaStream_t stream;
+
+  if (GetKernelParameters(Z[0], false, grid, block, shMemSize, stream)) return fail();
+  linearSumVectorArrayKernel<<<grid, block, shMemSize, stream>>>
+  (
+    nvec,
+    a,
+    d_Xd,
+    b,
+    d_Yd,
+    d_Zd,
+    NVEC_CUDA_CONTENT(Z[0])->length
+  );
+  PostKernelLaunch();
+
+  // Free host and device temporaries
+  delete[] h_Xd;
+  delete[] h_Yd;
+  delete[] h_Zd;
+  err = cudaFree(d_Xd);
+  if (!SUNDIALS_CUDA_VERIFY(err)) return -1;
+  err = cudaFree(d_Yd);
+  if (!SUNDIALS_CUDA_VERIFY(err)) return -1;
+  err = cudaFree(d_Zd);
+  if (!SUNDIALS_CUDA_VERIFY(err)) return -1;
 
-  err = linearSumVectorArray(nvec, a, Xv, b, Yv, Zv);
-
-  delete[] Xv;
-  delete[] Yv;
-  delete[] Zv;
-
-  return err == cudaSuccess ? 0 : -1;
+  return 0;
 }
 
 
 int N_VScaleVectorArray_Cuda(int nvec, realtype* c, N_Vector* X, N_Vector* Z)
 {
+  // Copies the nvec coefficients in c to the device and launches scaleVectorArrayKernel on X and Z.
+  // Returns 0 on success, -1 on failure; fail() frees all temporaries (freeing a null pointer is a no-op).
   cudaError_t err;
-  vector_type** Xv;
-  vector_type** Zv;
+  realtype **h_Xd = nullptr, **h_Zd = nullptr;
+  realtype *d_c = nullptr, **d_Xd = nullptr, **d_Zd = nullptr;
+  auto fail = [&]() -> int { delete[] h_Xd; delete[] h_Zd; cudaFree(d_c); cudaFree(d_Xd); cudaFree(d_Zd); return -1; };
 
-  Xv = new vector_type*[nvec];
-  for (int i=0; i<nvec; i++)
-    Xv[i] = static_cast<vector_type*>(X[i]->content);
+  // Copy c array to device
+  err = cudaMalloc((void**) &d_c, nvec*sizeof(realtype));
+  if (!SUNDIALS_CUDA_VERIFY(err)) return fail();
+  err = cudaMemcpy(d_c, c, nvec*sizeof(realtype), cudaMemcpyHostToDevice);
+  if (!SUNDIALS_CUDA_VERIFY(err)) return fail();
 
-  Zv = new vector_type*[nvec];
+  // Create array of device pointers on host
+  h_Xd = new realtype*[nvec];
   for (int i=0; i<nvec; i++)
-    Zv[i] = static_cast<vector_type*>(Z[i]->content);
-
-  err = scaleVectorArray(nvec, c, Xv, Zv);
+    h_Xd[i] = NVEC_CUDA_CONTENT(X[i])->device_data;
 
-  delete[] Xv;
-  delete[] Zv;
+  h_Zd = new realtype*[nvec];
+  for (int i=0; i<nvec; i++)
+    h_Zd[i] = NVEC_CUDA_CONTENT(Z[i])->device_data;
+
+  // Copy array of device pointers to device from host
+  err = cudaMalloc((void**) &d_Xd, nvec*sizeof(realtype*));
+  if (!SUNDIALS_CUDA_VERIFY(err)) return fail();
+  err = cudaMemcpy(d_Xd, h_Xd, nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
+  if (!SUNDIALS_CUDA_VERIFY(err)) return fail();
+
+  err = cudaMalloc((void**) &d_Zd, nvec*sizeof(realtype*));
+  if (!SUNDIALS_CUDA_VERIFY(err)) return fail();
+  err = cudaMemcpy(d_Zd, h_Zd, nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
+  if (!SUNDIALS_CUDA_VERIFY(err)) return fail();
+
+  // Set kernel parameters
+  size_t grid, block, shMemSize;
+  cudaStream_t stream;
+
+  if (GetKernelParameters(Z[0], false, grid, block, shMemSize, stream)) return fail();
+  scaleVectorArrayKernel<<<grid, block, shMemSize, stream>>>
+  (
+    nvec,
+    d_c,
+    d_Xd,
+    d_Zd,
+    NVEC_CUDA_CONTENT(Z[0])->length
+  );
+  PostKernelLaunch();
+
+  // Free host and device temporaries
+  delete[] h_Xd;
+  delete[] h_Zd;
+  err = cudaFree(d_c);
+  if (!SUNDIALS_CUDA_VERIFY(err)) return -1;
+  err = cudaFree(d_Xd);
+  if (!SUNDIALS_CUDA_VERIFY(err)) return -1;
+  err = cudaFree(d_Zd);
+  if (!SUNDIALS_CUDA_VERIFY(err)) return -1;
 
-  return err == cudaSuccess ? 0 : -1;
+  return 0;
 }
 
 
 int N_VConstVectorArray_Cuda(int nvec, realtype c, N_Vector* Z)
 {
+  // Launches constVectorArrayKernel with c on the nvec vectors in Z; returns 0 on success, -1 on failure.
   cudaError_t err;
-  vector_type** Zv;
+  realtype **h_Zd = nullptr, **d_Zd = nullptr;  // null until allocated so fail() can always free them
+  auto fail = [&]() -> int { delete[] h_Zd; cudaFree(d_Zd); return -1; };
 
-  Zv = new vector_type*[nvec];
+  // Create array of device pointers on host
+  h_Zd = new realtype*[nvec];
   for (int i=0; i<nvec; i++)
-    Zv[i] = static_cast<vector_type*>(Z[i]->content);
-
-  err = constVectorArray(nvec, c, Zv);
-
-  delete[] Zv;
+    h_Zd[i] = NVEC_CUDA_CONTENT(Z[i])->device_data;
+
+  // Copy array of device pointers to device from host
+  err = cudaMalloc((void**) &d_Zd, nvec*sizeof(realtype*));
+  if (!SUNDIALS_CUDA_VERIFY(err)) return fail();
+  err = cudaMemcpy(d_Zd, h_Zd, nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
+  if (!SUNDIALS_CUDA_VERIFY(err)) return fail();
+
+  // Set kernel parameters
+  size_t grid, block, shMemSize;
+  cudaStream_t stream;
+
+  if (GetKernelParameters(Z[0], false, grid, block, shMemSize, stream)) return fail();
+  constVectorArrayKernel<<<grid, block, shMemSize, stream>>>
+  (
+    nvec,
+    c,
+    d_Zd,
+    NVEC_CUDA_CONTENT(Z[0])->length
+  );
+  PostKernelLaunch();
+
+  // Free host and device temporaries
+  delete[] h_Zd;
+  err = cudaFree(d_Zd);
+  if (!SUNDIALS_CUDA_VERIFY(err)) return -1;
 
-  return err == cudaSuccess ? 0 : -1;
+  return 0;
 }
 
-
 int N_VWrmsNormVectorArray_Cuda(int nvec, N_Vector* X, N_Vector* W,
                                 realtype* norms)
 {
+  // Runs wL2NormSquareVectorArrayKernel, then sets norms[k] = sqrt(norms[k]/length) on the host; 0 on success, -1 on failure.
   cudaError_t err;
-  const vector_type* xvec = static_cast<vector_type*>(X[0]->content);
-  vector_type** Xv;
-  vector_type** Wv;
-
-  sunindextype N = xvec->size();
-
-  Xv = new vector_type*[nvec];
-  for (int k=0; k<nvec; k++)
-    Xv[k] = static_cast<vector_type*>(X[k]->content);
+  realtype **h_Xd = nullptr, **h_Wd = nullptr, **d_Xd = nullptr, **d_Wd = nullptr, *d_buff = nullptr;  // null so fail() can always free them
+  auto fail = [&]() -> int { delete[] h_Xd; delete[] h_Wd; cudaFree(d_Xd); cudaFree(d_Wd); cudaFree(d_buff); return -1; };
 
-  Wv = new vector_type*[nvec];
-  for (int k=0; k<nvec; k++)
-    Wv[k] = static_cast<vector_type*>(W[k]->content);
-
-  err = wL2NormSquareVectorArray(nvec, Xv, Wv, norms);
-
-  delete[] Xv;
-  delete[] Wv;
+  // Create array of device pointers on host
+  h_Xd = new realtype*[nvec];
+  for (int i=0; i<nvec; i++)
+    h_Xd[i] = NVEC_CUDA_CONTENT(X[i])->device_data;
+  h_Wd = new realtype*[nvec];
+  for (int i=0; i<nvec; i++)
+    h_Wd[i] = NVEC_CUDA_CONTENT(W[i])->device_data;
+
+  // Copy array of device pointers to device from host
+  err = cudaMalloc((void**) &d_Xd, nvec*sizeof(realtype*));
+  if (!SUNDIALS_CUDA_VERIFY(err)) return fail();
+  err = cudaMemcpy(d_Xd, h_Xd, nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
+  if (!SUNDIALS_CUDA_VERIFY(err)) return fail();
+
+  err = cudaMalloc((void**) &d_Wd, nvec*sizeof(realtype*));
+  if (!SUNDIALS_CUDA_VERIFY(err)) return fail();
+  err = cudaMemcpy(d_Wd, h_Wd, nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
+  if (!SUNDIALS_CUDA_VERIFY(err)) return fail();
+
+  // Set kernel parameters
+  size_t grid, block, shMemSize;
+  cudaStream_t stream;
+
+  if (GetKernelParameters(X[0], true, grid, block, shMemSize, stream)) return fail();
+  grid = nvec;
+
+  // Allocate reduction buffer on device
+  err = cudaMalloc((void**) &d_buff, grid*sizeof(realtype));
+  if (!SUNDIALS_CUDA_VERIFY(err)) return fail();
+  err = cudaMemsetAsync(d_buff, 0, grid*sizeof(realtype)); // NOTE(review): no stream argument, kernel runs on `stream` - confirm ordering
+  if (!SUNDIALS_CUDA_VERIFY(err)) return fail();
+
+  wL2NormSquareVectorArrayKernel<realtype, sunindextype><<<grid, block, shMemSize, stream>>>
+  (
+    nvec,
+    d_Xd,
+    d_Wd,
+    d_buff,
+    NVEC_CUDA_CONTENT(X[0])->length
+  );
+  PostKernelLaunch();
+
+  // Copy GPU result to the cpu.
+  err = cudaMemcpy(norms, d_buff, grid*sizeof(realtype), cudaMemcpyDeviceToHost);
+  if (!SUNDIALS_CUDA_VERIFY(err)) return fail();
+
+  // Finish computation
+  for (int k=0; k<nvec; ++k)
+    norms[k] = std::sqrt(norms[k]/NVEC_CUDA_CONTENT(X[0])->length);
 
-  if (err != cudaSuccess)  return(-1);
+  // Free host array
+  delete[] h_Xd;
+  delete[] h_Wd;
 
-  for (int k=0; k<nvec; ++k)
-    norms[k] = std::sqrt(norms[k]/N);
+  // Free device arrays
+  err = cudaFree(d_Xd);
+  if (!SUNDIALS_CUDA_VERIFY(err)) return -1;
+  err = cudaFree(d_Wd);
+  if (!SUNDIALS_CUDA_VERIFY(err)) return -1;
+  err = cudaFree(d_buff);
+  if (!SUNDIALS_CUDA_VERIFY(err)) return -1;
 
   return 0;
 }
 
-
 int N_VWrmsNormMaskVectorArray_Cuda(int nvec, N_Vector* X, N_Vector* W,
                                     N_Vector id, realtype* norms)
 {
+  // Runs wL2NormSquareMaskVectorArrayKernel with mask id, then sets norms[k] = sqrt(norms[k]/length) on the host; 0 on success, -1 on failure.
   cudaError_t err;
-  const vector_type* xvec = static_cast<vector_type*>(X[0]->content);
-  vector_type** Xv;
-  vector_type** Wv;
-  vector_type*  IDv;
-
-  sunindextype N = xvec->size();
-
-  Xv = new vector_type*[nvec];
-  for (int k=0; k<nvec; k++)
-    Xv[k] = static_cast<vector_type*>(X[k]->content);
+  realtype **h_Xd = nullptr, **h_Wd = nullptr, **d_Xd = nullptr, **d_Wd = nullptr, *d_buff = nullptr;  // null so fail() can always free them
+  auto fail = [&]() -> int { delete[] h_Xd; delete[] h_Wd; cudaFree(d_Xd); cudaFree(d_Wd); cudaFree(d_buff); return -1; };
 
-  Wv = new vector_type*[nvec];
-  for (int k=0; k<nvec; k++)
-    Wv[k] = static_cast<vector_type*>(W[k]->content);
-
-  IDv = static_cast<vector_type*>(id->content);
-
-  err = wL2NormSquareMaskVectorArray(nvec, Xv, Wv, IDv, norms);
+  // Create array of device pointers on host
+  h_Xd = new realtype*[nvec];
+  for (int i=0; i<nvec; i++)
+    h_Xd[i] = NVEC_CUDA_CONTENT(X[i])->device_data;
 
-  delete[] Xv;
-  delete[] Wv;
+  h_Wd = new realtype*[nvec];
+  for (int i=0; i<nvec; i++)
+    h_Wd[i] = NVEC_CUDA_CONTENT(W[i])->device_data;
+
+  // Copy array of device pointers to device from host
+  err = cudaMalloc((void**) &d_Xd, nvec*sizeof(realtype*));
+  if (!SUNDIALS_CUDA_VERIFY(err)) return fail();
+  err = cudaMemcpy(d_Xd, h_Xd, nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
+  if (!SUNDIALS_CUDA_VERIFY(err)) return fail();
+
+  err = cudaMalloc((void**) &d_Wd, nvec*sizeof(realtype*));
+  if (!SUNDIALS_CUDA_VERIFY(err)) return fail();
+  err = cudaMemcpy(d_Wd, h_Wd, nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
+  if (!SUNDIALS_CUDA_VERIFY(err)) return fail();
+
+  // Set kernel parameters
+  size_t grid, block, shMemSize;
+  cudaStream_t stream;
+
+  if (GetKernelParameters(X[0], true, grid, block, shMemSize, stream)) return fail();
+  grid = nvec;
+
+  // Allocate reduction buffer on device
+  err = cudaMalloc((void**) &d_buff, grid*sizeof(realtype));
+  if (!SUNDIALS_CUDA_VERIFY(err)) return fail();
+  err = cudaMemsetAsync(d_buff, 0, grid*sizeof(realtype)); // NOTE(review): no stream argument, kernel runs on `stream` - confirm ordering
+  if (!SUNDIALS_CUDA_VERIFY(err)) return fail();
+
+  wL2NormSquareMaskVectorArrayKernel<realtype, sunindextype><<<grid, block, shMemSize, stream>>>
+  (
+    nvec,
+    d_Xd,
+    d_Wd,
+    NVEC_CUDA_CONTENT(id)->device_data,
+    d_buff,
+    NVEC_CUDA_CONTENT(X[0])->length
+  );
+  PostKernelLaunch();
+
+  // Copy GPU result to the cpu.
+  err = cudaMemcpy(norms, d_buff, grid*sizeof(realtype), cudaMemcpyDeviceToHost);
+  if (!SUNDIALS_CUDA_VERIFY(err)) return fail();
+
+  // Finish computation
+  for (int k=0; k<nvec; ++k)
+    norms[k] = std::sqrt(norms[k]/NVEC_CUDA_CONTENT(X[0])->length);
 
-  if (err != cudaSuccess)  return(-1);
+  // Free host array
+  delete[] h_Xd;
+  delete[] h_Wd;
 
-  for (int k=0; k<nvec; ++k)
-    norms[k] = std::sqrt(norms[k]/N);
+  // Free device arrays
+  err = cudaFree(d_Xd);
+  if (!SUNDIALS_CUDA_VERIFY(err)) return -1;
+  err = cudaFree(d_Wd);
+  if (!SUNDIALS_CUDA_VERIFY(err)) return -1;
+  err = cudaFree(d_buff);
+  if (!SUNDIALS_CUDA_VERIFY(err)) return -1;
 
   return 0;
 }
@@ -725,31 +1543,81 @@ int N_VScaleAddMultiVectorArray_Cuda(int nvec, int nsum, realtype* c,
                                      N_Vector* X, N_Vector** Y, N_Vector** Z)
 {
+  // Copies the nsum coefficients in c to the device and launches scaleAddMultiVectorArrayKernel on X, Y and Z.
+  // Returns 0 on success, -1 on failure; fail() frees all temporaries (freeing a null pointer is a no-op).
   cudaError_t err;
-  vector_type** Xv;
-  vector_type** Yv;
-  vector_type** Zv;
-
-  Xv = new vector_type*[nvec];
-  for (int k=0; k<nvec; k++)
-    Xv[k] = static_cast<vector_type*>(X[k]->content);
+  realtype **h_Xd = nullptr, **h_Yd = nullptr, **h_Zd = nullptr;
+  realtype *d_c = nullptr, **d_Xd = nullptr, **d_Yd = nullptr, **d_Zd = nullptr;
+  auto fail = [&]() -> int { delete[] h_Xd; delete[] h_Yd; delete[] h_Zd;
+                             cudaFree(d_c); cudaFree(d_Xd); cudaFree(d_Yd); cudaFree(d_Zd); return -1; };
 
-  Yv = new vector_type*[nsum*nvec];
-  for (int k=0; k<nvec; k++)
-    for (int j=0; j<nsum; j++)
-      Yv[k*nsum+j] = static_cast<vector_type*>(Y[j][k]->content);
+  // Copy c array to device
+  err = cudaMalloc((void**) &d_c, nsum*sizeof(realtype));
+  if (!SUNDIALS_CUDA_VERIFY(err)) return fail();
+  err = cudaMemcpy(d_c, c, nsum*sizeof(realtype), cudaMemcpyHostToDevice);
+  if (!SUNDIALS_CUDA_VERIFY(err)) return fail();
 
-  Zv = new vector_type*[nsum*nvec];
-  for (int k=0; k<nvec; k++)
-    for (int j=0; j<nsum; j++)
-      Zv[k*nsum+j] = static_cast<vector_type*>(Z[j][k]->content);
-
-  err = scaleAddMultiVectorArray(nvec, nsum, c, Xv, Yv, Zv);
-
-  delete[] Xv;
-  delete[] Yv;
-  delete[] Zv;
+  // Create array of device pointers on host
+  h_Xd = new realtype*[nvec];
+  for (int i=0; i<nvec; i++)
+    h_Xd[i] = NVEC_CUDA_CONTENT(X[i])->device_data;
+
+  h_Yd = new realtype*[nsum*nvec];
+  for (int j=0; j<nvec; j++)
+    for (int i=0; i<nsum; i++)
+      h_Yd[j*nsum+i] = NVEC_CUDA_CONTENT(Y[i][j])->device_data;
+
+  h_Zd = new realtype*[nsum*nvec];
+  for (int j=0; j<nvec; j++)
+    for (int i=0; i<nsum; i++)
+      h_Zd[j*nsum+i] = NVEC_CUDA_CONTENT(Z[i][j])->device_data;
+
+  // Copy array of device pointers to device from host
+  err = cudaMalloc((void**) &d_Xd, nvec*sizeof(realtype*));
+  if (!SUNDIALS_CUDA_VERIFY(err)) return fail();
+  err = cudaMemcpy(d_Xd, h_Xd, nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
+  if (!SUNDIALS_CUDA_VERIFY(err)) return fail();
+
+  err = cudaMalloc((void**) &d_Yd, nsum*nvec*sizeof(realtype*));
+  if (!SUNDIALS_CUDA_VERIFY(err)) return fail();
+  err = cudaMemcpy(d_Yd, h_Yd, nsum*nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
+  if (!SUNDIALS_CUDA_VERIFY(err)) return fail();
+
+  err = cudaMalloc((void**) &d_Zd, nsum*nvec*sizeof(realtype*));
+  if (!SUNDIALS_CUDA_VERIFY(err)) return fail();
+  err = cudaMemcpy(d_Zd, h_Zd, nsum*nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
+  if (!SUNDIALS_CUDA_VERIFY(err)) return fail();
+
+  // Set kernel parameters
+  size_t grid, block, shMemSize;
+  cudaStream_t stream;
+
+  if (GetKernelParameters(Z[0][0], false, grid, block, shMemSize, stream)) return fail();
+  scaleAddMultiVectorArrayKernel<<<grid, block, shMemSize, stream>>>
+  (
+    nvec,
+    nsum,
+    d_c,
+    d_Xd,
+    d_Yd,
+    d_Zd,
+    NVEC_CUDA_CONTENT(Z[0][0])->length
+  );
+  PostKernelLaunch();
+
+  // Free host and device temporaries
+  delete[] h_Xd;
+  delete[] h_Yd;
+  delete[] h_Zd;
+  err = cudaFree(d_c);
+  if (!SUNDIALS_CUDA_VERIFY(err)) return -1;
+  err = cudaFree(d_Xd);
+  if (!SUNDIALS_CUDA_VERIFY(err)) return -1;
+  err = cudaFree(d_Yd);
+  if (!SUNDIALS_CUDA_VERIFY(err)) return -1;
+  err = cudaFree(d_Zd);
+  if (!SUNDIALS_CUDA_VERIFY(err)) return -1;
 
-  return err == cudaSuccess ? 0 : -1;
+  return 0;
 }
 
 
@@ -757,24 +1625,66 @@ int N_VLinearCombinationVectorArray_Cuda(int nvec, int nsum, realtype* c,
                                          N_Vector** X, N_Vector* Z)
 {
+  // Copies the nsum coefficients in c to the device and launches linearCombinationVectorArrayKernel on X and Z.
+  // Returns 0 on success, -1 on failure (a pending CUDA error is also reported as -1); fail() frees all temporaries.
   cudaError_t err;
-  vector_type** Xv;
-  vector_type** Zv;
-
-  Xv = new vector_type*[nsum*nvec];
-  for (int k=0; k<nvec; k++)
-    for (int j=0; j<nsum; j++)
-      Xv[k*nsum+j] = static_cast<vector_type*>(X[j][k]->content);
-
-  Zv = new vector_type*[nvec];
-  for (int k=0; k<nvec; k++)
-    Zv[k] = static_cast<vector_type*>(Z[k]->content);
+  realtype **h_Xd = nullptr, **h_Zd = nullptr;
+  realtype *d_c = nullptr, **d_Xd = nullptr, **d_Zd = nullptr;
+  auto fail = [&]() -> int { delete[] h_Xd; delete[] h_Zd; cudaFree(d_c); cudaFree(d_Xd); cudaFree(d_Zd); return -1; };
 
-  err = linearCombinationVectorArray(nvec, nsum, c, Xv, Zv);
+  // Copy c array to device
+  err = cudaMalloc((void**) &d_c, nsum*sizeof(realtype));
+  if (!SUNDIALS_CUDA_VERIFY(err)) return fail();
+  err = cudaMemcpy(d_c, c, nsum*sizeof(realtype), cudaMemcpyHostToDevice);
+  if (!SUNDIALS_CUDA_VERIFY(err)) return fail();
 
-  delete[] Xv;
-  delete[] Zv;
+  // Create array of device pointers on host
+  h_Xd = new realtype*[nsum*nvec];
+  for (int j=0; j<nvec; j++)
+    for (int i=0; i<nsum; i++)
+      h_Xd[j*nsum+i] = NVEC_CUDA_CONTENT(X[i][j])->device_data;
 
-  return err == cudaSuccess ? 0 : -1;
+  h_Zd = new realtype*[nvec];
+  for (int i=0; i<nvec; i++)
+    h_Zd[i] = NVEC_CUDA_CONTENT(Z[i])->device_data;
+
+  // Copy array of device pointers to device from host
+  err = cudaMalloc((void**) &d_Xd, nsum*nvec*sizeof(realtype*));
+  if (!SUNDIALS_CUDA_VERIFY(err)) return fail();
+  err = cudaMemcpy(d_Xd, h_Xd, nsum*nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
+  if (!SUNDIALS_CUDA_VERIFY(err)) return fail();
+
+  err = cudaMalloc((void**) &d_Zd, nvec*sizeof(realtype*));
+  if (!SUNDIALS_CUDA_VERIFY(err)) return fail();
+  err = cudaMemcpy(d_Zd, h_Zd, nvec*sizeof(realtype*), cudaMemcpyHostToDevice);
+  if (!SUNDIALS_CUDA_VERIFY(err)) return fail();
+
+  // Set kernel parameters
+  size_t grid, block, shMemSize;
+  cudaStream_t stream;
+
+  if (GetKernelParameters(Z[0], false, grid, block, shMemSize, stream)) return fail();
+  linearCombinationVectorArrayKernel<<<grid, block, shMemSize, stream>>>
+  (
+    nvec,
+    nsum,
+    d_c,
+    d_Xd,
+    d_Zd,
+    NVEC_CUDA_CONTENT(Z[0])->length
+  );
+  PostKernelLaunch();
+
+  // Free host and device temporaries
+  delete[] h_Xd;
+  delete[] h_Zd;
+  err = cudaFree(d_c);
+  if (!SUNDIALS_CUDA_VERIFY(err)) return -1;
+  err = cudaFree(d_Xd);
+  if (!SUNDIALS_CUDA_VERIFY(err)) return -1;
+  err = cudaFree(d_Zd);
+  if (!SUNDIALS_CUDA_VERIFY(err)) return -1;
+
+  return (!SUNDIALS_CUDA_VERIFY(cudaGetLastError())) ? -1 : 0;
 }
 
 
@@ -792,7 +1702,8 @@ int N_VEnableFusedOps_Cuda(N_Vector v, booleantype tf)
   /* check that ops structure is non-NULL */
   if (v->ops == NULL) return(-1);
 
-  if (tf) {
+  if (tf)
+  {
     /* enable all fused vector operations */
     v->ops->nvlinearcombination = N_VLinearCombination_Cuda;
     v->ops->nvscaleaddmulti     = N_VScaleAddMulti_Cuda;
@@ -805,7 +1716,9 @@ int N_VEnableFusedOps_Cuda(N_Vector v, booleantype tf)
     v->ops->nvwrmsnormmaskvectorarray      = N_VWrmsNormMaskVectorArray_Cuda;
     v->ops->nvscaleaddmultivectorarray     = N_VScaleAddMultiVectorArray_Cuda;
     v->ops->nvlinearcombinationvectorarray = N_VLinearCombinationVectorArray_Cuda;
-  } else {
+  }
+  else
+  {
     /* disable all fused vector operations */
     v->ops->nvlinearcombination = NULL;
     v->ops->nvscaleaddmulti     = NULL;
@@ -1005,4 +1918,256 @@ int N_VEnableLinearCombinationVectorArray_Cuda(N_Vector v, booleantype tf)
   return(0);
 }
 
+/*
+ * Private helper functions.
+ */
+
+int AllocateData(N_Vector v)
+{
+  cudaError_t err;
+  /* Allocate host_data/device_data of v; returns -1 if the user allocator returns NULL or a CUDA allocation fails, else 0. */
+  N_VectorContent_Cuda vc = NVEC_CUDA_CONTENT(v);
+  N_PrivateVectorContent_Cuda vcp = NVEC_CUDA_PRIVATE(v);
+
+  if (vcp->userallocfn)
+  {
+    /* We assume managed memory when a custom allocator is provided */
+    vc->device_data =  (realtype *) vcp->userallocfn(NVEC_CUDA_MEMSIZE(v));
+    vc->host_data = vc->device_data; /* host and device share the single user allocation */
+    if (vc->device_data == NULL)
+    {
+      SUNDIALS_DEBUG_PRINT("ERROR in AllocateData: user provided allocator function failed\n");
+      return -1;
+    }
+  } 
+  else if (vcp->use_managed_mem)
+  {
+    err = cudaMallocManaged((void**) &vc->device_data, NVEC_CUDA_MEMSIZE(v));
+    vc->host_data = vc->device_data; /* host and device share the single managed allocation */
+    if (!SUNDIALS_CUDA_VERIFY(err))
+    {
+      SUNDIALS_DEBUG_PRINT("ERROR in AllocateData: cudaMallocManaged failed\n");
+      return -1;
+    }
+  }
+  else
+  {
+    vc->host_data = (realtype*) malloc(NVEC_CUDA_MEMSIZE(v)); /* NOTE(review): malloc result is not checked - confirm intended */
+    err = cudaMalloc((void**) &vc->device_data, NVEC_CUDA_MEMSIZE(v));
+    if (!SUNDIALS_CUDA_VERIFY(err))
+    {
+      SUNDIALS_DEBUG_PRINT("ERROR in AllocateData: cudaMalloc failed\n");
+      return -1; /* NOTE(review): host_data stays allocated here - presumably released by the caller; verify */
+    }
+  }
+
+  return 0;
+}
+
+/*
+ * Initializes the internal reduction buffer (one realtype) to the
+ * value given. An existing buffer is reused unless it is smaller than
+ * required; otherwise it is freed and a new one is obtained from the
+ * user allocator, cudaMallocManaged, or malloc + cudaMalloc. The value
+ * is copied with cudaMemcpyAsync. Returns 0 on success, -1 on failure.
+ */
+int InitializeReductionBuffer(N_Vector v, const realtype value)
+{
+  cudaError_t err;
+  size_t bytes = sizeof(realtype);
+  N_PrivateVectorContent_Cuda vcp = NVEC_CUDA_PRIVATE(v);
+
+  /* we allocate if the existing reduction buffer is not large enough */
+  if (vcp->reduce_buffer_allocated_bytes < bytes)
+  {
+    if (vcp->reduce_buffer_allocated_bytes) FreeReductionBuffer(v);
+  }
+  else
+  {
+    err = cudaMemcpyAsync(vcp->reduce_buffer_dev, &value,
+                          bytes, cudaMemcpyHostToDevice, NVEC_CUDA_STREAM(v));
+    if (!SUNDIALS_CUDA_VERIFY(err))
+    {
+      SUNDIALS_DEBUG_PRINT("ERROR in InitializeReductionBuffer: cudaMemcpyAsync failed\n");
+      return -1;
+    }
+    return 0;
+  }
+
+  if (vcp->userallocfn != nullptr)
+  {
+    vcp->reduce_buffer_dev = (realtype*) vcp->userallocfn(bytes);
+    if (vcp->reduce_buffer_dev == NULL)
+    {
+      SUNDIALS_DEBUG_PRINT("ERROR in InitializeReductionBuffer: could not allocate device data with user allocator\n");
+      return -1;
+    }
+    vcp->reduce_buffer_host = vcp->reduce_buffer_dev;
+  }
+  else if (vcp->use_managed_mem)
+  {
+    err = cudaMallocManaged((void**) &vcp->reduce_buffer_dev, bytes);
+    if (!SUNDIALS_CUDA_VERIFY(err))
+    {
+      SUNDIALS_DEBUG_PRINT("ERROR in InitializeReductionBuffer: could not allocate device data with cudaMallocManaged\n");
+      return -1;
+    }
+    vcp->reduce_buffer_host = vcp->reduce_buffer_dev;
+  }
+  else
+  {
+    vcp->reduce_buffer_host = (realtype *) malloc(bytes);
+    if (vcp->reduce_buffer_host == NULL)
+    {
+      SUNDIALS_DEBUG_PRINT("ERROR in InitializeReductionBuffer: could not allocate host data with malloc\n");
+      return -1;
+    }
+    err = cudaMalloc((void**) &vcp->reduce_buffer_dev, bytes);
+    if (!SUNDIALS_CUDA_VERIFY(err))
+    {
+      SUNDIALS_DEBUG_PRINT("ERROR in InitializeReductionBuffer: could not allocate device data with cudaMalloc\n");
+      return -1;
+    }
+  }
+
+  vcp->reduce_buffer_allocated_bytes = bytes; /* recorded only once every allocation above succeeded */
+
+  err = cudaMemcpyAsync(vcp->reduce_buffer_dev, &value,
+                        bytes, cudaMemcpyHostToDevice, NVEC_CUDA_STREAM(v));
+  if (!SUNDIALS_CUDA_VERIFY(err))
+  {
+    SUNDIALS_DEBUG_PRINT("ERROR in InitializeReductionBuffer: cudaMemcpyAsync failed\n");
+    return -1;
+  }
+
+  return 0;
+}
+
+/* Free the reduction buffer of v and reset both buffer pointers to NULL; does nothing if v has no
+ * private content. NOTE(review): reduce_buffer_allocated_bytes is not reset here - confirm callers handle it. */
+void FreeReductionBuffer(N_Vector v)
+{
+  cudaError_t err;
+
+  N_PrivateVectorContent_Cuda vcp = NVEC_CUDA_PRIVATE(v);
+  if (vcp == NULL) return;
+
+  if (vcp->use_managed_mem)
+  {
+    /* managed memory */
+    if (vcp->userfreefn)
+    {
+      if (vcp->reduce_buffer_dev != NULL) 
+        vcp->userfreefn(vcp->reduce_buffer_dev);
+    } 
+    else
+    {
+      if (vcp->reduce_buffer_dev != NULL)
+      {
+        err = cudaFree(vcp->reduce_buffer_dev);
+        SUNDIALS_CUDA_VERIFY(err);
+      }
+    }
+    vcp->reduce_buffer_dev = vcp->reduce_buffer_host = NULL; /* only the dev pointer is freed in this branch */
+  }
+  else
+  {
+    /* unmanaged memory; NOTE(review): userfreefn is not consulted here - presumably a user allocator implies use_managed_mem; verify */
+    if (vcp->reduce_buffer_dev != NULL)
+    {
+      err = cudaFree(vcp->reduce_buffer_dev);
+      SUNDIALS_CUDA_VERIFY(err);
+    }
+    if (vcp->reduce_buffer_host != NULL) free(vcp->reduce_buffer_host);
+    vcp->reduce_buffer_dev = NULL;
+    vcp->reduce_buffer_host = NULL;
+  }
+}
+
+/* Copy n realtype values of the reduction buffer from the device to the host
+ * (managed memory: only synchronize the stream). Returns 0 on success, -1 on a CUDA error. */
+int CopyReductionBufferFromDevice(N_Vector v, size_t n)
+{
+  cudaError_t err;
+
+  /* If using managed memory, then we don't need to do a copy, but we
+      still need to synchronize the device to adhere to the unified
+      memory access rules. */
+  if (NVEC_CUDA_PRIVATE(v)->use_managed_mem)
+  { 
+    err = cudaStreamSynchronize(NVEC_CUDA_STREAM(v));
+  }
+  else
+  {
+    err = cudaMemcpyAsync(NVEC_CUDA_PRIVATE(v)->reduce_buffer_host,
+                          NVEC_CUDA_PRIVATE(v)->reduce_buffer_dev,
+                          n*sizeof(realtype),
+                          cudaMemcpyDeviceToHost,
+                          NVEC_CUDA_STREAM(v)); /* NOTE(review): no explicit synchronization follows this copy - confirm callers may read the host buffer */
+  }
+  return (!SUNDIALS_CUDA_VERIFY(err)) ? -1 : 0;
+}
+
+/* Get the kernel launch parameters (grid, block, shMemSize, stream) from the reduction or stream execution
+ * policy of v, sized for n elements (n == 0 selects the vector length). Returns 0 on success and -1 if the
+ * grid or block size is 0, or if a reduction block size is not a multiple of CUDA_WARP_SIZE. */
+int GetKernelParameters(N_Vector v, booleantype reduction, size_t& grid, size_t& block,
+                        size_t& shMemSize, cudaStream_t& stream, size_t n)
+{
+  n = (n == 0) ? NVEC_CUDA_CONTENT(v)->length : n;
+  if (reduction)
+  {
+    SUNCudaExecPolicy* reduce_exec_policy = NVEC_CUDA_CONTENT(v)->reduce_exec_policy;
+    grid      = reduce_exec_policy->gridSize(n);
+    block     = reduce_exec_policy->blockSize();
+    shMemSize = 0;
+    stream    = reduce_exec_policy->stream();
+    if (block % CUDA_WARP_SIZE)
+    {
+#ifdef SUNDIALS_DEBUG
+      throw std::runtime_error("the block size must be a multiple of the CUDA warp size");
+#endif
+      return -1;
+    }
+  }
+  else
+  {
+    SUNCudaExecPolicy* stream_exec_policy = NVEC_CUDA_CONTENT(v)->stream_exec_policy;
+    grid      = stream_exec_policy->gridSize(n);
+    block     = stream_exec_policy->blockSize();
+    shMemSize = 0;
+    stream    = stream_exec_policy->stream();
+  }
+
+  if (grid == 0)
+  {
+#ifdef SUNDIALS_DEBUG
+    throw std::runtime_error("the grid size must be > 0");
+#endif
+    return -1;
+  }
+  if (block == 0)
+  {
+#ifdef SUNDIALS_DEBUG
+    throw std::runtime_error("the block size must be > 0");
+#endif
+    return -1;
+  }
+
+  return 0;
+}
+
+/* Should be called after a kernel launch.
+ * If SUNDIALS_DEBUG_CUDA_LASTERROR is not defined, then the function does nothing.
+ * If it is defined, the function will synchronize and check the last CUDA error.
+ */
+void PostKernelLaunch()
+{
+#ifdef SUNDIALS_DEBUG_CUDA_LASTERROR
+  cudaDeviceSynchronize();                  /* block the host until the device is idle */
+  SUNDIALS_CUDA_VERIFY(cudaGetLastError()); /* the result of the check is not returned to the caller */
+#endif
+}
+
+
 } // extern "C"
diff --git a/src/nvector/parallel/nvector_parallel.c b/src/nvector/parallel/nvector_parallel.c
index cc93aa3844..468e68b1dd 100644
--- a/src/nvector/parallel/nvector_parallel.c
+++ b/src/nvector/parallel/nvector_parallel.c
@@ -140,7 +140,11 @@ N_Vector N_VNewEmpty_Parallel(MPI_Comm comm,
   v->ops->nvminquotientlocal = N_VMinQuotientLocal_Parallel;
   v->ops->nvwsqrsumlocal     = N_VWSqrSumLocal_Parallel;
   v->ops->nvwsqrsummasklocal = N_VWSqrSumMaskLocal_Parallel;
-  
+
+  /* debugging functions */
+  v->ops->nvprint     = N_VPrint_Parallel;
+  v->ops->nvprintfile = N_VPrintFile_Parallel;
+
   /* Create content */
   content = NULL;
   content = (N_VectorContent_Parallel) malloc(sizeof *content);
@@ -360,7 +364,7 @@ N_Vector N_VCloneEmpty_Parallel(N_Vector w)
 
   /* Attach operations */
   if (N_VCopyOps(w, v)) { N_VDestroy(v); return(NULL); }
-  
+
   /* Create content */
   content = NULL;
   content = (N_VectorContent_Parallel) malloc(sizeof *content);
@@ -710,7 +714,7 @@ realtype N_VMaxNormLocal_Parallel(N_Vector x)
 
   max = ZERO;
 
-  for (i = 0; i < N; i++) 
+  for (i = 0; i < N; i++)
     if (SUNRabs(xd[i]) > max) max = SUNRabs(xd[i]);
 
   return(max);
@@ -794,7 +798,7 @@ realtype N_VMinLocal_Parallel(N_Vector x)
   if (N > 0) {
     xd = NV_DATA_P(x);
     min = xd[0];
-    for (i = 1; i < N; i++) 
+    for (i = 1; i < N; i++)
       if (xd[i] < min) min = xd[i];
   }
   return(min);
@@ -828,7 +832,7 @@ realtype N_VL1NormLocal_Parallel(N_Vector x)
 
   for (i = 0; i<N; i++)
     sum += SUNRabs(xd[i]);
-  
+
   return(sum);
 }
 
@@ -1221,7 +1225,7 @@ int N_VLinearSumVectorArray_Parallel(int nvec,
   /*   (1) a == other, b == 0.0 - user should have called N_VScale */
   /*   (2) a == 0.0, b == other - user should have called N_VScale */
   /*   (3) a,b == other, a !=b, a != -b                            */
-  
+
   /* get vector length */
   N = NV_LOCLENGTH_P(Z[0]);
 
@@ -1986,7 +1990,7 @@ static int VaxpyVectorArray_Parallel(int nvec, realtype a, N_Vector* X, N_Vector
     }
 
     return(0);
-  }    
+  }
 
   for (i=0; i<nvec; i++) {
     xd = NV_DATA_P(X[i]);
diff --git a/src/nvector/serial/nvector_serial.c b/src/nvector/serial/nvector_serial.c
index 5b5dbd6bd4..cf653a8a6e 100644
--- a/src/nvector/serial/nvector_serial.c
+++ b/src/nvector/serial/nvector_serial.c
@@ -123,6 +123,10 @@ N_Vector N_VNewEmpty_Serial(sunindextype length)
   v->ops->nvwsqrsumlocal     = N_VWSqrSumLocal_Serial;
   v->ops->nvwsqrsummasklocal = N_VWSqrSumMaskLocal_Serial;
 
+  /* debugging functions */
+  v->ops->nvprint     = N_VPrint_Serial;
+  v->ops->nvprintfile = N_VPrintFile_Serial;
+
   /* Create content */
   content = NULL;
   content = (N_VectorContent_Serial) malloc(sizeof *content);
@@ -1876,7 +1880,7 @@ static int VaxpyVectorArray_Serial(int nvec, realtype a, N_Vector* X, N_Vector*
     }
 
     return(0);
-  }    
+  }
 
   for (i=0; i<nvec; i++) {
     xd = NV_DATA_S(X[i]);
diff --git a/src/sundials/CMakeLists.txt b/src/sundials/CMakeLists.txt
index 72a0dd496b..f1e4f7e1a0 100644
--- a/src/sundials/CMakeLists.txt
+++ b/src/sundials/CMakeLists.txt
@@ -42,6 +42,10 @@ set(sundials_HEADERS
   sundials_version.h
   )
 
+if(CUDA_ENABLE AND CMAKE_CUDA_COMPILER)
+  list(APPEND sundials_HEADERS sundials_cuda_policies.hpp)  # only listed when CUDA is enabled and a CUDA compiler is set
+endif()
+
 # Add prefix with complete path to the SUNDIALS header files
 add_prefix(${sundials_SOURCE_DIR}/include/sundials/ sundials_HEADERS)
 
diff --git a/src/sundials/fmod/fsundials_linearsolver_mod.f90 b/src/sundials/fmod/fsundials_linearsolver_mod.f90
index dc9802730e..7d5c4126d3 100644
--- a/src/sundials/fmod/fsundials_linearsolver_mod.f90
+++ b/src/sundials/fmod/fsundials_linearsolver_mod.f90
@@ -103,13 +103,15 @@ module fsundials_linearsolver_mod
  integer(C_INT), parameter, public :: SUNLS_MEM_NULL = -801_C_INT
  integer(C_INT), parameter, public :: SUNLS_ILL_INPUT = -802_C_INT
  integer(C_INT), parameter, public :: SUNLS_MEM_FAIL = -803_C_INT
- integer(C_INT), parameter, public :: SUNLS_ATIMES_FAIL_UNREC = -804_C_INT
- integer(C_INT), parameter, public :: SUNLS_PSET_FAIL_UNREC = -805_C_INT
- integer(C_INT), parameter, public :: SUNLS_PSOLVE_FAIL_UNREC = -806_C_INT
- integer(C_INT), parameter, public :: SUNLS_PACKAGE_FAIL_UNREC = -807_C_INT
- integer(C_INT), parameter, public :: SUNLS_GS_FAIL = -808_C_INT
- integer(C_INT), parameter, public :: SUNLS_QRSOL_FAIL = -809_C_INT
- integer(C_INT), parameter, public :: SUNLS_VECTOROP_ERR = -810_C_INT
+ integer(C_INT), parameter, public :: SUNLS_ATIMES_NULL = -804_C_INT
+ integer(C_INT), parameter, public :: SUNLS_ATIMES_FAIL_UNREC = -805_C_INT
+ integer(C_INT), parameter, public :: SUNLS_PSET_FAIL_UNREC = -806_C_INT
+ integer(C_INT), parameter, public :: SUNLS_PSOLVE_NULL = -807_C_INT
+ integer(C_INT), parameter, public :: SUNLS_PSOLVE_FAIL_UNREC = -808_C_INT
+ integer(C_INT), parameter, public :: SUNLS_PACKAGE_FAIL_UNREC = -809_C_INT
+ integer(C_INT), parameter, public :: SUNLS_GS_FAIL = -810_C_INT
+ integer(C_INT), parameter, public :: SUNLS_QRSOL_FAIL = -811_C_INT
+ integer(C_INT), parameter, public :: SUNLS_VECTOROP_ERR = -812_C_INT
  integer(C_INT), parameter, public :: SUNLS_RES_REDUCED = 801_C_INT
  integer(C_INT), parameter, public :: SUNLS_CONV_FAIL = 802_C_INT
  integer(C_INT), parameter, public :: SUNLS_ATIMES_FAIL_REC = 803_C_INT
diff --git a/src/sundials/fmod/fsundials_nonlinearsolver_mod.c b/src/sundials/fmod/fsundials_nonlinearsolver_mod.c
index de8c83860e..b9922ac464 100644
--- a/src/sundials/fmod/fsundials_nonlinearsolver_mod.c
+++ b/src/sundials/fmod/fsundials_nonlinearsolver_mod.c
@@ -188,7 +188,7 @@
 #define SWIG_as_voidptrptr(a) ((void)SWIG_as_voidptr(*a),(void**)(a)) 
 
 
-#include "sundials/sundials_nonlinearsolver.h"  
+#include "sundials/sundials_nonlinearsolver.h"
 
 SWIGEXPORT SUNNonlinearSolver _wrap_FSUNNonlinSolNewEmpty() {
   SUNNonlinearSolver fresult ;
diff --git a/src/sundials/fmod/fsundials_nvector_mod.c b/src/sundials/fmod/fsundials_nvector_mod.c
index 45c0bdac70..8163040b0e 100644
--- a/src/sundials/fmod/fsundials_nvector_mod.c
+++ b/src/sundials/fmod/fsundials_nvector_mod.c
@@ -918,4 +918,22 @@ SWIGEXPORT void _wrap_FN_VSetVecAtIndexVectorArray(void *farg1, int const *farg2
 }
 
 
+SWIGEXPORT void _wrap_FN_VPrint(N_Vector farg1) { /* C shim bound by the Fortran swigc_FN_VPrint interface */
+  N_Vector arg1 = (N_Vector) 0 ;
+  
+  arg1 = (N_Vector)(farg1);
+  N_VPrint(arg1); /* forward to the generic vector print routine */
+}
+
+
+SWIGEXPORT void _wrap_FN_VPrintFile(N_Vector farg1, void *farg2) { /* C shim bound by the Fortran swigc_FN_VPrintFile interface */
+  N_Vector arg1 = (N_Vector) 0 ;
+  FILE *arg2 = (FILE *) 0 ;
+  
+  arg1 = (N_Vector)(farg1);
+  arg2 = (FILE *)(farg2); /* the opaque pointer from Fortran is used as a FILE stream */
+  N_VPrintFile(arg1,arg2); /* forward to the generic vector print-to-file routine */
+}
+
+
 
diff --git a/src/sundials/fmod/fsundials_nvector_mod.f90 b/src/sundials/fmod/fsundials_nvector_mod.f90
index 988163fe29..bf69daeacf 100644
--- a/src/sundials/fmod/fsundials_nvector_mod.f90
+++ b/src/sundials/fmod/fsundials_nvector_mod.f90
@@ -80,6 +80,8 @@ module fsundials_nvector_mod
   type(C_FUNPTR), public :: nvminquotientlocal
   type(C_FUNPTR), public :: nvwsqrsumlocal
   type(C_FUNPTR), public :: nvwsqrsummasklocal
+  type(C_FUNPTR), public :: nvprint
+  type(C_FUNPTR), public :: nvprintfile
  end type N_Vector_Ops
  ! struct struct _generic_N_Vector
  type, bind(C), public :: N_Vector
@@ -140,6 +142,8 @@ module fsundials_nvector_mod
  public :: FN_VDestroyVectorArray
  public :: FN_VGetVecAtIndexVectorArray
  public :: FN_VSetVecAtIndexVectorArray
+ public :: FN_VPrint
+ public :: FN_VPrintFile
 
 ! WRAPPER DECLARATIONS
 interface
@@ -615,6 +619,19 @@ subroutine swigc_FN_VSetVecAtIndexVectorArray(farg1, farg2, farg3) &
 type(C_PTR), value :: farg3
 end subroutine
 
+subroutine swigc_FN_VPrint(farg1) & ! interface to the C symbol _wrap_FN_VPrint
+bind(C, name="_wrap_FN_VPrint")
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR), value :: farg1 ! passed by value as a C pointer
+end subroutine
+
+subroutine swigc_FN_VPrintFile(farg1, farg2) & ! interface to the C symbol _wrap_FN_VPrintFile
+bind(C, name="_wrap_FN_VPrintFile")
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR), value :: farg1 ! passed by value as a C pointer
+type(C_PTR), value :: farg2 ! passed by value as a C pointer
+end subroutine
+
 end interface
 
 
@@ -1472,5 +1489,26 @@ subroutine FN_VSetVecAtIndexVectorArray(vs, index, w)
 call swigc_FN_VSetVecAtIndexVectorArray(farg1, farg2, farg3)
 end subroutine
 
+subroutine FN_VPrint(v) ! public entry point: hands the address of v to the C wrapper
+use, intrinsic :: ISO_C_BINDING
+type(N_Vector), target, intent(inout) :: v
+type(C_PTR) :: farg1 
+
+farg1 = c_loc(v) ! C address of the vector object
+call swigc_FN_VPrint(farg1)
+end subroutine
+
+subroutine FN_VPrintFile(v, outfile) ! public entry point: hands v and outfile to the C wrapper
+use, intrinsic :: ISO_C_BINDING
+type(N_Vector), target, intent(inout) :: v
+type(C_PTR) :: outfile ! forwarded as-is; the C wrapper casts it to FILE *
+type(C_PTR) :: farg1 
+type(C_PTR) :: farg2 
+
+farg1 = c_loc(v) ! C address of the vector object
+farg2 = outfile
+call swigc_FN_VPrintFile(farg1, farg2)
+end subroutine
+
 
 end module
diff --git a/src/sundials/sundials_cuda.h b/src/sundials/sundials_cuda.h
index e629ae423b..fefdf63c33 100644
--- a/src/sundials/sundials_cuda.h
+++ b/src/sundials/sundials_cuda.h
@@ -37,6 +37,7 @@ extern "C" {
  * ---------------------------------------------------------------------------*/
 
 #define CUDA_WARP_SIZE 32
+#define MAX_CUDA_BLOCKSIZE 1024
 
 
 /* ---------------------------------------------------------------------------
diff --git a/src/sundials/sundials_cuda_kernels.cuh b/src/sundials/sundials_cuda_kernels.cuh
new file mode 100644
index 0000000000..80a33d9b3f
--- /dev/null
+++ b/src/sundials/sundials_cuda_kernels.cuh
@@ -0,0 +1,268 @@
+/*
+ * -----------------------------------------------------------------
+ * Programmer(s): Cody J. Balos @ LLNL
+ * -----------------------------------------------------------------
+ * SUNDIALS Copyright Start
+ * Copyright (c) 2002-2020, Lawrence Livermore National Security
+ * and Southern Methodist University.
+ * All rights reserved.
+ *
+ * See the top-level LICENSE and NOTICE files for details.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ * SUNDIALS Copyright End
+ * -----------------------------------------------------------------
+ */
+
+#ifndef _SUNDIALS_CUDA_KERNELS_CUH
+#define _SUNDIALS_CUDA_KERNELS_CUH
+/* Loop header: iter starts at the global x-index and advances by the total x-thread count while iter < (max). */
+#define GRID_STRIDE_XLOOP(type, iter, max)  \
+  for (type iter = blockDim.x * blockIdx.x + threadIdx.x; \
+       iter < (max); \
+       iter += blockDim.x * gridDim.x)
+
+#include "sundials_cuda.h"
+
+namespace sundials
+{
+namespace cuda
+{
+
+/* The atomic functions below are built on the atomic compare-and-swap function
+   atomicCAS(address, assumed, desired), which atomically stores desired in *address
+   only if *address still equals assumed, and always returns the previous *address.
+   Since *address could change between when the value is loaded and the atomicCAS call,
+   the operation is retried until the two agree (or the update is no longer needed). */
+
+typedef enum { RSUM, RMAX, RMIN } BinaryReductionOp;
+
+#if __CUDA_ARCH__ < 600 // CAS-based double atomicAdd; an undefined __CUDA_ARCH__ evaluates as 0 here
+__forceinline__ __device__
+double atomicAdd(double* address, double val)
+{
+  unsigned long long int* address_as_ull = (unsigned long long int*)address; // view the double's storage as a 64-bit integer
+  unsigned long long int old = *address_as_ull, assumed;
+
+  do {
+      assumed = old; // bit pattern the sum below is based on
+      old = atomicCAS(address_as_ull, assumed,
+                      __double_as_longlong(val +
+                              __longlong_as_double(assumed)));
+  // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
+  } while (assumed != old);
+
+  return __longlong_as_double(old); // contents of *address before the add
+}
+#endif
+
+/*
+ * Atomically replace *address with value when value is larger (double precision).
+ * "address" is the address of the reference value which might get updated with the maximum
+ * "value" is the value that is compared to the reference in order to determine the maximum
+ */
+__forceinline__ __device__ 
+void AtomicMax(double* const address, const double value)
+{
+  if (*address >= value) // plain read first: skip the CAS loop when no update is needed
+  {
+    return;
+  }
+
+  unsigned long long * const address_as_i = (unsigned long long *)address; // integer view for atomicCAS
+  unsigned long long old = * address_as_i, assumed;
+
+  do 
+  {
+    assumed = old;
+    if (__longlong_as_double(assumed) >= value) // contents are already >= value; nothing to store
+    {
+      break;
+    }
+    old = atomicCAS(address_as_i, assumed, __double_as_longlong(value)); // returns the contents it found
+  } while (assumed != old); // retry if the contents changed before the swap
+}
+
+/*
+ * Atomically replace *address with value when value is larger (single precision).
+ * "address" is the address of the reference value which might get updated with the maximum
+ * "value" is the value that is compared to the reference in order to determine the maximum
+ */
+ __forceinline__ __device__ 
+void AtomicMax(float* const address, const float value)
+{
+  if (*address >= value) // plain read first: skip the CAS loop when no update is needed
+  {
+    return;
+  }
+
+  unsigned int* const address_as_i = (unsigned int *)address; // integer view for atomicCAS
+  unsigned int old = *address_as_i, assumed;
+
+  do 
+  {
+    assumed = old;
+    if (__int_as_float(assumed) >= value) // contents are already >= value; nothing to store
+    {
+      break;
+    }
+    old = atomicCAS(address_as_i, assumed, __float_as_int(value)); // returns the contents it found
+  } while (assumed != old); // retry if the contents changed before the swap
+}
+
+/*
+ * Atomically replace *address with value when value is smaller (double precision).
+ * "address" is the address of the reference value which might get updated with the minimum
+ * "value" is the value that is compared to the reference in order to determine the minimum
+ */
+__forceinline__ __device__ 
+void AtomicMin(double* const address, const double value)
+{
+  if (*address <= value) // plain read first: skip the CAS loop when no update is needed
+  {
+    return;
+  }
+
+  unsigned long long* const address_as_i = (unsigned long long *)address; // integer view for atomicCAS
+  unsigned long long old = *address_as_i, assumed;
+
+  do 
+  {
+    assumed = old;
+    if (__longlong_as_double(assumed) <= value) // contents are already <= value; nothing to store
+    {
+      break;
+    }
+    old = atomicCAS(address_as_i, assumed, __double_as_longlong(value)); // returns the contents it found
+  } while (assumed != old); // retry if the contents changed before the swap
+}
+
+/*
+ * Atomically replace *address with value when value is smaller (single precision).
+ * "address" is the address of the reference value which might get updated with the minimum
+ * "value" is the value that is compared to the reference in order to determine the minimum
+ */
+__forceinline__ __device__ 
+void AtomicMin(float* const address, const float value)
+{
+  if (*address <= value) // plain read first: skip the CAS loop when no update is needed
+  {
+    return;
+  }
+
+  unsigned int* const address_as_i = (unsigned int *)address; // integer view for atomicCAS
+  unsigned int old = *address_as_i, assumed;
+
+  do 
+  {
+    assumed = old;
+    if (__int_as_float(assumed) <= value) // contents are already <= value; nothing to store
+    {
+      break;
+    }
+    old = atomicCAS(address_as_i, assumed, __float_as_int(value)); // returns the contents it found
+  } while (assumed != old); // retry if the contents changed before the swap
+}
+
+/* Warp-level sum via shuffle-down over halving distances. */
+template <typename T>
+__inline__ __device__
+T warpReduceSum(T val)
+{
+  for (int delta = warpSize >> 1; delta > 0; delta >>= 1) {
+    const T other = __shfl_down_sync(0xFFFFFFFF, val, delta);
+    val += other;
+  }
+  return val;
+}
+
+/* Warp-level maximum via shuffle-down over halving distances. */
+template<typename T>
+__inline__ __device__
+T warpReduceMax(T val)
+{
+  for (int delta = warpSize >> 1; delta > 0; delta >>= 1) {
+    const T other = __shfl_down_sync(0xFFFFFFFF, val, delta);
+    val = max(other, val);
+  }
+  return val;
+}
+
+/* Warp-level minimum via shuffle-down over halving distances. */
+template<typename T>
+__inline__ __device__
+T warpReduceMin(T val)
+{
+  for (int delta = warpSize >> 1; delta > 0; delta >>= 1) {
+    const T other = __shfl_down_sync(0xFFFFFFFF, val, delta);
+    val = min(other, val);
+  }
+  return val;
+}
+
+/*
+ * Reduce val over the thread block with operation op; default_value fills lanes that have no warp partial.
+ */
+template <typename T, BinaryReductionOp op>
+__inline__ __device__
+T blockReduce(T val, T default_value)
+{
+  // One shared-memory slot per warp partial (CUDA_WARP_SIZE slots)
+  static __shared__ T shared[CUDA_WARP_SIZE]; 
+
+  int lane = threadIdx.x % warpSize; // thread lane within warp
+  int wid = threadIdx.x / warpSize;  // warp ID
+
+  // Each warp performs partial reduction
+  switch(op)
+  {
+    case RSUM:
+      val = warpReduceSum<T>(val);
+      break;
+    case RMAX:
+      val = warpReduceMax<T>(val);
+      break;
+    case RMIN:
+      val = warpReduceMin<T>(val);
+      break;
+    default:
+      asm("trap;"); // illegal instruction
+      break;
+  }
+
+  // Write reduced value from each warp to shared memory
+  if (lane == 0) shared[wid] = val; 
+  
+  // Wait for all partial reductions to complete
+  __syncthreads();
+
+  // Read from shared memory only if that warp existed (integer division: a trailing partial warp is not counted)
+  val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : default_value;
+
+  // Final reduce within first warp
+  if (wid == 0)
+  {
+    switch(op)
+    {
+      case RSUM:
+        val = warpReduceSum<T>(val);
+        break;
+      case RMAX:
+        val = warpReduceMax<T>(val);
+        break;
+      case RMIN:
+        val = warpReduceMin<T>(val);
+        break;
+      default:
+        asm("trap;"); // illegal instruction
+        break;
+    }
+  }
+
+  return val;
+}
+
+} // namespace cuda
+} // namespace sundials
+
+#endif // _SUNDIALS_CUDA_KERNELS_CUH
\ No newline at end of file
diff --git a/src/sundials/sundials_nvector.c b/src/sundials/sundials_nvector.c
index d905748a25..3947aa14fb 100644
--- a/src/sundials/sundials_nvector.c
+++ b/src/sundials/sundials_nvector.c
@@ -18,7 +18,9 @@
  * in nvector.h.
  * -----------------------------------------------------------------*/
 
+#include <stdio.h>
 #include <stdlib.h>
+
 #include <sundials/sundials_nvector.h>
 
 /* -----------------------------------------------------------------
@@ -99,6 +101,10 @@ N_Vector N_VNewEmpty()
   ops->nvwsqrsumlocal     = NULL;
   ops->nvwsqrsummasklocal = NULL;
 
+  /* debugging functions (called when SUNDIALS_DEBUG_PRINTVEC is defined) */
+  ops->nvprint     = NULL;
+  ops->nvprintfile = NULL;
+
   /* attach ops and initialize content to NULL */
   v->ops     = ops;
   v->content = NULL;
@@ -194,6 +200,10 @@ int N_VCopyOps(N_Vector w, N_Vector v)
   v->ops->nvwsqrsumlocal     = w->ops->nvwsqrsumlocal;
   v->ops->nvwsqrsummasklocal = w->ops->nvwsqrsummasklocal;
 
+  /* debugging functions (called when SUNDIALS_DEBUG_PRINTVEC is defined) */
+  v->ops->nvprint     = w->ops->nvprint;
+  v->ops->nvprintfile = w->ops->nvprintfile;
+
   return(0);
 }
 
@@ -761,3 +771,37 @@ void N_VSetVecAtIndexVectorArray(N_Vector* vs, int index, N_Vector w)
   else if (index < 0) return;
   else                vs[index] = w;
 }
+
+
+/* -----------------------------------------------------------------
+ * Debugging functions
+ * ----------------------------------------------------------------- */
+
+/* Print v to stdout through its nvprint operation, or a short notice when
+   the vector itself or that operation is missing. */
+void N_VPrint(N_Vector v)
+{
+  if (v == NULL) {
+    printf("NULL Vector\n");
+  } else if (v->ops->nvprint == NULL) {
+    printf("NULL Print Op\n");
+  } else {
+    /* dispatch to the implementation-specific print routine */
+    v->ops->nvprint(v);
+  }
+}
+
+
+/* Print v to outfile via its nvprintfile op; a NULL outfile is a no-op. */
+void N_VPrintFile(N_Vector v, FILE* outfile)
+{
+  if (outfile == NULL) return;   /* no stream to write to; never hand NULL to fprintf */
+
+  if (v == NULL) {
+    fprintf(outfile, "NULL Vector\n");
+  } else if (v->ops->nvprintfile == NULL) {
+    fprintf(outfile, "NULL PrintFile Op\n");
+  } else {
+    v->ops->nvprintfile(v, outfile);   /* implementation-specific print routine */
+  }
+}
diff --git a/src/sunlinsol/cusolversp/sunlinsol_cusolversp_batchqr.cu b/src/sunlinsol/cusolversp/sunlinsol_cusolversp_batchqr.cu
index 905907fa4d..515964f31d 100644
--- a/src/sunlinsol/cusolversp/sunlinsol_cusolversp_batchqr.cu
+++ b/src/sunlinsol/cusolversp/sunlinsol_cusolversp_batchqr.cu
@@ -107,7 +107,7 @@ SUNLinearSolver SUNLinSol_cuSolverSp_batchQR(N_Vector y, SUNMatrix A, cusolverSp
   SUNLinearSolverContent_cuSolverSp_batchQR content;
 
   content = NULL;
-  content = (SUNLinearSolverContent_cuSolverSp_batchQR) malloc(sizeof *content);
+  content = (SUNLinearSolverContent_cuSolverSp_batchQR) malloc(sizeof(*content));
   if (content == NULL)
   {
     SUNLinSolFree(S);
diff --git a/src/sunlinsol/lapackband/CMakeLists.txt b/src/sunlinsol/lapackband/CMakeLists.txt
index 2bb5a4ef06..b736ed09dc 100644
--- a/src/sunlinsol/lapackband/CMakeLists.txt
+++ b/src/sunlinsol/lapackband/CMakeLists.txt
@@ -74,6 +74,9 @@ if(BUILD_SHARED_LIBS)
   target_link_libraries(sundials_sunlinsollapackband_shared
     PUBLIC sundials_sunmatrixband_shared ${LAPACK_LIBRARIES})
 
+  target_compile_definitions(sundials_sunlinsollapackband_shared
+    PUBLIC -DBUILD_SUNDIALS_LIBRARY)
+
   install(TARGETS sundials_sunlinsollapackband_shared
     DESTINATION ${CMAKE_INSTALL_LIBDIR})
 
diff --git a/src/sunlinsol/pcg/CMakeLists.txt b/src/sunlinsol/pcg/CMakeLists.txt
index ecb7cb356a..a60f9e38ca 100644
--- a/src/sunlinsol/pcg/CMakeLists.txt
+++ b/src/sunlinsol/pcg/CMakeLists.txt
@@ -49,6 +49,9 @@ if(BUILD_STATIC_LIBS)
       PRIVATE m)
   endif()
 
+  target_include_directories(sundials_sunlinsolpcg_static
+    PRIVATE ${sundials_SOURCE_DIR}/src/sundials)
+
   target_compile_definitions(sundials_sunlinsolpcg_static
     PUBLIC -DBUILD_SUNDIALS_LIBRARY)
 
@@ -79,6 +82,9 @@ if(BUILD_SHARED_LIBS)
       PRIVATE m)
   endif()
 
+  target_include_directories(sundials_sunlinsolpcg_shared
+    PRIVATE ${sundials_SOURCE_DIR}/src/sundials)
+
   target_compile_definitions(sundials_sunlinsolpcg_shared
     PUBLIC -DBUILD_SUNDIALS_LIBRARY)
 
diff --git a/src/sunlinsol/pcg/fmod/fsunlinsol_pcg_mod.c b/src/sunlinsol/pcg/fmod/fsunlinsol_pcg_mod.c
index 84a66ec975..d4fb5d62e0 100644
--- a/src/sunlinsol/pcg/fmod/fsunlinsol_pcg_mod.c
+++ b/src/sunlinsol/pcg/fmod/fsunlinsol_pcg_mod.c
@@ -492,4 +492,32 @@ SWIGEXPORT int _wrap_FSUNLinSolFree_PCG(SUNLinearSolver farg1) {
 }
 
 
+SWIGEXPORT int _wrap_FSUNLinSolSetInfoFile_PCG(SUNLinearSolver farg1, void *farg2) { /* C shim bound by swigc_FSUNLinSolSetInfoFile_PCG */
+  int fresult ;
+  SUNLinearSolver arg1 = (SUNLinearSolver) 0 ;
+  FILE *arg2 = (FILE *) 0 ;
+  int result;
+  
+  arg1 = (SUNLinearSolver)(farg1);
+  arg2 = (FILE *)(farg2); /* the opaque pointer from Fortran is used as a FILE stream */
+  result = (int)SUNLinSolSetInfoFile_PCG(arg1,arg2);
+  fresult = (int)(result); /* return code is handed back unchanged */
+  return fresult;
+}
+
+
+SWIGEXPORT int _wrap_FSUNLinSolSetPrintLevel_PCG(SUNLinearSolver farg1, int const *farg2) { /* C shim bound by swigc_FSUNLinSolSetPrintLevel_PCG */
+  int fresult ;
+  SUNLinearSolver arg1 = (SUNLinearSolver) 0 ;
+  int arg2 ;
+  int result;
+  
+  arg1 = (SUNLinearSolver)(farg1);
+  arg2 = (int)(*farg2); /* the print level arrives by reference */
+  result = (int)SUNLinSolSetPrintLevel_PCG(arg1,arg2);
+  fresult = (int)(result); /* return code is handed back unchanged */
+  return fresult;
+}
+
+
 
diff --git a/src/sunlinsol/pcg/fmod/fsunlinsol_pcg_mod.f90 b/src/sunlinsol/pcg/fmod/fsunlinsol_pcg_mod.f90
index ad98912e41..a050b2aee4 100644
--- a/src/sunlinsol/pcg/fmod/fsunlinsol_pcg_mod.f90
+++ b/src/sunlinsol/pcg/fmod/fsunlinsol_pcg_mod.f90
@@ -52,6 +52,8 @@ module fsunlinsol_pcg_mod
  public :: FSUNLinSolLastFlag_PCG
  public :: FSUNLinSolSpace_PCG
  public :: FSUNLinSolFree_PCG
+ public :: FSUNLinSolSetInfoFile_PCG
+ public :: FSUNLinSolSetPrintLevel_PCG
 
 ! WRAPPER DECLARATIONS
 interface
@@ -237,6 +239,24 @@ function swigc_FSUNLinSolFree_PCG(farg1) &
 integer(C_INT) :: fresult
 end function
 
+function swigc_FSUNLinSolSetInfoFile_PCG(farg1, farg2) & ! interface to the C symbol _wrap_FSUNLinSolSetInfoFile_PCG
+bind(C, name="_wrap_FSUNLinSolSetInfoFile_PCG") &
+result(fresult)
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR), value :: farg1 ! passed by value as a C pointer
+type(C_PTR), value :: farg2 ! passed by value as a C pointer
+integer(C_INT) :: fresult
+end function
+
+function swigc_FSUNLinSolSetPrintLevel_PCG(farg1, farg2) & ! interface to the C symbol _wrap_FSUNLinSolSetPrintLevel_PCG
+bind(C, name="_wrap_FSUNLinSolSetPrintLevel_PCG") &
+result(fresult)
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR), value :: farg1 ! passed by value as a C pointer
+integer(C_INT), intent(in) :: farg2 ! no VALUE attribute: the C side receives an int const *
+integer(C_INT) :: fresult
+end function
+
 end interface
 
 
@@ -568,5 +588,37 @@ function FSUNLinSolFree_PCG(s) &
 swig_result = fresult
 end function
 
+function FSUNLinSolSetInfoFile_PCG(ls, info_file) & ! public entry point: forwards ls and info_file to the C wrapper
+result(swig_result)
+use, intrinsic :: ISO_C_BINDING
+integer(C_INT) :: swig_result
+type(SUNLinearSolver), target, intent(inout) :: ls
+type(C_PTR) :: info_file ! forwarded as-is; the C wrapper casts it to FILE *
+integer(C_INT) :: fresult 
+type(C_PTR) :: farg1 
+type(C_PTR) :: farg2 
+
+farg1 = c_loc(ls) ! C address of the solver object
+farg2 = info_file
+fresult = swigc_FSUNLinSolSetInfoFile_PCG(farg1, farg2)
+swig_result = fresult ! return code from the C routine, unchanged
+end function
+
+function FSUNLinSolSetPrintLevel_PCG(ls, print_level) & ! public entry point: forwards ls and print_level to the C wrapper
+result(swig_result)
+use, intrinsic :: ISO_C_BINDING
+integer(C_INT) :: swig_result
+type(SUNLinearSolver), target, intent(inout) :: ls
+integer(C_INT), intent(in) :: print_level
+integer(C_INT) :: fresult 
+type(C_PTR) :: farg1 
+integer(C_INT) :: farg2 
+
+farg1 = c_loc(ls) ! C address of the solver object
+farg2 = print_level ! local copy handed to the wrapper
+fresult = swigc_FSUNLinSolSetPrintLevel_PCG(farg1, farg2)
+swig_result = fresult ! return code from the C routine, unchanged
+end function
+
 
 end module
diff --git a/src/sunlinsol/pcg/sunlinsol_pcg.c b/src/sunlinsol/pcg/sunlinsol_pcg.c
index b7239fb471..335ed07c95 100644
--- a/src/sunlinsol/pcg/sunlinsol_pcg.c
+++ b/src/sunlinsol/pcg/sunlinsol_pcg.c
@@ -22,6 +22,8 @@
 #include <sunlinsol/sunlinsol_pcg.h>
 #include <sundials/sundials_math.h>
 
+#include "sundials_debug.h"
+
 #define ZERO RCONST(0.0)
 #define ONE  RCONST(1.0)
 
@@ -102,21 +104,23 @@ SUNLinearSolver SUNLinSol_PCG(N_Vector y, int pretype, int maxl)
   S->content = content;
 
   /* Fill content */
-  content->last_flag = 0;
-  content->maxl      = maxl;
-  content->pretype   = pretype;
-  content->numiters  = 0;
-  content->resnorm   = ZERO;
-  content->r         = NULL;
-  content->p         = NULL;
-  content->z         = NULL;
-  content->Ap        = NULL;
-  content->s         = NULL;
-  content->ATimes    = NULL;
-  content->ATData    = NULL;
-  content->Psetup    = NULL;
-  content->Psolve    = NULL;
-  content->PData     = NULL;
+  content->last_flag   = 0;
+  content->maxl        = maxl;
+  content->pretype     = pretype;
+  content->numiters    = 0;
+  content->resnorm     = ZERO;
+  content->r           = NULL;
+  content->p           = NULL;
+  content->z           = NULL;
+  content->Ap          = NULL;
+  content->s           = NULL;
+  content->ATimes      = NULL;
+  content->ATData      = NULL;
+  content->Psetup      = NULL;
+  content->Psolve      = NULL;
+  content->PData       = NULL;
+  content->print_level = 0;
+  content->info_file   = stdout;
 
   /* Allocate content */
   content->r = N_VClone(y);
@@ -197,12 +201,24 @@ int SUNLinSolInitialize_PCG(SUNLinearSolver S)
 {
   /* ensure valid options */
   if (S == NULL) return(SUNLS_MEM_NULL);
+
+  if (PCG_CONTENT(S)->maxl <= 0)
+    PCG_CONTENT(S)->maxl = SUNPCG_MAXL_DEFAULT;
+
+  if (PCG_CONTENT(S)->ATimes == NULL) {
+    LASTFLAG(S) = SUNLS_ATIMES_NULL;
+    return(LASTFLAG(S));
+  }
+
   if ( (PRETYPE(S) != PREC_LEFT) &&
        (PRETYPE(S) != PREC_RIGHT) &&
        (PRETYPE(S) != PREC_BOTH) )
     PRETYPE(S) = PREC_NONE;
-  if (PCG_CONTENT(S)->maxl <= 0)
-    PCG_CONTENT(S)->maxl = SUNPCG_MAXL_DEFAULT;
+
+  if ((PRETYPE(S) != PREC_NONE) && (PCG_CONTENT(S)->Psolve == NULL)) {
+    LASTFLAG(S) = SUNLS_PSOLVE_NULL;
+    return(LASTFLAG(S));
+  }
 
   /* no additional memory to allocate */
 
@@ -319,6 +335,23 @@ int SUNLinSolSolve_PCG(SUNLinearSolver S, SUNMatrix nul, N_Vector x,
               (pretype == PREC_RIGHT) );
   UseScaling = (w != NULL);
 
+#ifdef SUNDIALS_BUILD_WITH_MONITORING
+  if (PCG_CONTENT(S)->print_level && PCG_CONTENT(S)->info_file)
+    fprintf(PCG_CONTENT(S)->info_file, "SUNLINSOL_PCG:\n");
+#endif
+
+  /* Check if Atimes function has been set */
+  if (atimes == NULL) {
+    LASTFLAG(S) = SUNLS_ATIMES_NULL;
+    return(LASTFLAG(S));
+  }
+
+  /* If preconditioning, check if psolve has been set */
+  if (UsePrec && psolve == NULL) {
+    LASTFLAG(S) = SUNLS_PSOLVE_NULL;
+    return(LASTFLAG(S));
+  }
+
   /* Set r to initial residual r_0 = b - A*x_0 */
   if (N_VDotProd(x, x) == ZERO)  N_VScale(ONE, b, r);
   else {
@@ -335,6 +368,17 @@ int SUNLinSolSolve_PCG(SUNLinearSolver S, SUNMatrix nul, N_Vector x,
   if (UseScaling)  N_VProd(r, w, Ap);
   else N_VScale(ONE, r, Ap);
   *res_norm = r0_norm = rho = SUNRsqrt(N_VDotProd(Ap, Ap));
+
+#ifdef SUNDIALS_BUILD_WITH_MONITORING
+    /* print initial residual */
+    if (PCG_CONTENT(S)->print_level && PCG_CONTENT(S)->info_file)
+    {
+      fprintf(PCG_CONTENT(S)->info_file,
+              SUNLS_MSG_RESIDUAL,
+              (long int) 0, *res_norm);
+    }
+#endif
+
   if (rho <= delta) {
     LASTFLAG(S) = SUNLS_SUCCESS;
     return(LASTFLAG(S));
@@ -384,6 +428,17 @@ int SUNLinSolSolve_PCG(SUNLinearSolver S, SUNMatrix nul, N_Vector x,
     if (UseScaling)  N_VProd(r, w, Ap);
     else N_VScale(ONE, r, Ap);
     *res_norm = rho = SUNRsqrt(N_VDotProd(Ap, Ap));
+
+#ifdef SUNDIALS_BUILD_WITH_MONITORING
+      /* print current iteration number and the residual */
+      if (PCG_CONTENT(S)->print_level && PCG_CONTENT(S)->info_file)
+      {
+        fprintf(PCG_CONTENT(S)->info_file,
+                SUNLS_MSG_RESIDUAL,
+                (long int) *nli, *res_norm);
+      }
+#endif
+
     if (rho <= delta) {
       converged = SUNTRUE;
       break;
@@ -495,3 +550,43 @@ int SUNLinSolFree_PCG(SUNLinearSolver S)
   free(S); S = NULL;
   return(SUNLS_SUCCESS);
 }
+
+
+/* Select the stream used for PCG monitoring output (monitoring builds only). */
+int SUNLinSolSetInfoFile_PCG(SUNLinearSolver S, FILE* info_file)
+{
+#ifndef SUNDIALS_BUILD_WITH_MONITORING
+  /* monitoring support is compiled out: emit the debug message and reject */
+  SUNDIALS_DEBUG_PRINT("ERROR in SUNLinSolSetInfoFile_PCG: SUNDIALS was not built with monitoring\n");
+  return(SUNLS_ILL_INPUT);
+#else
+  /* a solver object is needed to hold the setting */
+  if (S == NULL) return(SUNLS_MEM_NULL);
+
+  PCG_CONTENT(S)->info_file = info_file;
+
+  return(SUNLS_SUCCESS);
+#endif
+}
+
+
+/* Set the PCG monitoring print level; only 0 and 1 are accepted
+   (monitoring builds only). */
+int SUNLinSolSetPrintLevel_PCG(SUNLinearSolver S, int print_level)
+{
+#ifndef SUNDIALS_BUILD_WITH_MONITORING
+  /* monitoring support is compiled out: emit the debug message and reject */
+  SUNDIALS_DEBUG_PRINT("ERROR in SUNLinSolSetPrintLevel_PCG: SUNDIALS was not built with monitoring\n");
+  return(SUNLS_ILL_INPUT);
+#else
+  /* a solver object is needed to hold the setting */
+  if (S == NULL) return(SUNLS_MEM_NULL);
+
+  /* anything outside [0, 1] is rejected */
+  if (!(print_level >= 0 && print_level <= 1)) return(SUNLS_ILL_INPUT);
+
+  PCG_CONTENT(S)->print_level = print_level;
+
+  return(SUNLS_SUCCESS);
+#endif
+}
diff --git a/src/sunlinsol/spbcgs/CMakeLists.txt b/src/sunlinsol/spbcgs/CMakeLists.txt
index bc357d066e..7419e95938 100644
--- a/src/sunlinsol/spbcgs/CMakeLists.txt
+++ b/src/sunlinsol/spbcgs/CMakeLists.txt
@@ -49,6 +49,9 @@ if(BUILD_STATIC_LIBS)
       PRIVATE m)
   endif()
 
+  target_include_directories(sundials_sunlinsolspbcgs_static
+    PRIVATE ${sundials_SOURCE_DIR}/src/sundials)
+
   target_compile_definitions(sundials_sunlinsolspbcgs_static
     PUBLIC -DBUILD_SUNDIALS_LIBRARY)
 
@@ -79,6 +82,9 @@ if(BUILD_SHARED_LIBS)
       PRIVATE m)
   endif()
 
+  target_include_directories(sundials_sunlinsolspbcgs_shared
+    PRIVATE ${sundials_SOURCE_DIR}/src/sundials)
+
   target_compile_definitions(sundials_sunlinsolspbcgs_shared
     PUBLIC -DBUILD_SUNDIALS_LIBRARY)
 
diff --git a/src/sunlinsol/spbcgs/fmod/fsunlinsol_spbcgs_mod.c b/src/sunlinsol/spbcgs/fmod/fsunlinsol_spbcgs_mod.c
index 932a0ebcdc..030b856c80 100644
--- a/src/sunlinsol/spbcgs/fmod/fsunlinsol_spbcgs_mod.c
+++ b/src/sunlinsol/spbcgs/fmod/fsunlinsol_spbcgs_mod.c
@@ -492,4 +492,32 @@ SWIGEXPORT int _wrap_FSUNLinSolFree_SPBCGS(SUNLinearSolver farg1) {
 }
 
 
+SWIGEXPORT int _wrap_FSUNLinSolSetInfoFile_SPBCGS(SUNLinearSolver farg1, void *farg2) { /* C shim bound by swigc_FSUNLinSolSetInfoFile_SPBCGS */
+  int fresult ;
+  SUNLinearSolver arg1 = (SUNLinearSolver) 0 ;
+  FILE *arg2 = (FILE *) 0 ;
+  int result;
+  
+  arg1 = (SUNLinearSolver)(farg1);
+  arg2 = (FILE *)(farg2); /* the opaque pointer from Fortran is used as a FILE stream */
+  result = (int)SUNLinSolSetInfoFile_SPBCGS(arg1,arg2);
+  fresult = (int)(result); /* return code is handed back unchanged */
+  return fresult;
+}
+
+
+SWIGEXPORT int _wrap_FSUNLinSolSetPrintLevel_SPBCGS(SUNLinearSolver farg1, int const *farg2) { /* C shim bound by swigc_FSUNLinSolSetPrintLevel_SPBCGS */
+  int fresult ;
+  SUNLinearSolver arg1 = (SUNLinearSolver) 0 ;
+  int arg2 ;
+  int result;
+  
+  arg1 = (SUNLinearSolver)(farg1);
+  arg2 = (int)(*farg2); /* the print level arrives by reference */
+  result = (int)SUNLinSolSetPrintLevel_SPBCGS(arg1,arg2);
+  fresult = (int)(result); /* return code is handed back unchanged */
+  return fresult;
+}
+
+
 
diff --git a/src/sunlinsol/spbcgs/fmod/fsunlinsol_spbcgs_mod.f90 b/src/sunlinsol/spbcgs/fmod/fsunlinsol_spbcgs_mod.f90
index 6ea416a185..dc8f2f2f90 100644
--- a/src/sunlinsol/spbcgs/fmod/fsunlinsol_spbcgs_mod.f90
+++ b/src/sunlinsol/spbcgs/fmod/fsunlinsol_spbcgs_mod.f90
@@ -52,6 +52,8 @@ module fsunlinsol_spbcgs_mod
  public :: FSUNLinSolLastFlag_SPBCGS
  public :: FSUNLinSolSpace_SPBCGS
  public :: FSUNLinSolFree_SPBCGS
+ public :: FSUNLinSolSetInfoFile_SPBCGS
+ public :: FSUNLinSolSetPrintLevel_SPBCGS
 
 ! WRAPPER DECLARATIONS
 interface
@@ -237,6 +239,24 @@ function swigc_FSUNLinSolFree_SPBCGS(farg1) &
 integer(C_INT) :: fresult
 end function
 
+function swigc_FSUNLinSolSetInfoFile_SPBCGS(farg1, farg2) & ! interface to the C symbol _wrap_FSUNLinSolSetInfoFile_SPBCGS
+bind(C, name="_wrap_FSUNLinSolSetInfoFile_SPBCGS") &
+result(fresult)
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR), value :: farg1 ! passed by value as a C pointer
+type(C_PTR), value :: farg2 ! passed by value as a C pointer
+integer(C_INT) :: fresult
+end function
+
+function swigc_FSUNLinSolSetPrintLevel_SPBCGS(farg1, farg2) & ! interface to the C symbol _wrap_FSUNLinSolSetPrintLevel_SPBCGS
+bind(C, name="_wrap_FSUNLinSolSetPrintLevel_SPBCGS") &
+result(fresult)
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR), value :: farg1 ! passed by value as a C pointer
+integer(C_INT), intent(in) :: farg2 ! no VALUE attribute: the C side receives an int const *
+integer(C_INT) :: fresult
+end function
+
 end interface
 
 
@@ -568,5 +588,37 @@ function FSUNLinSolFree_SPBCGS(s) &
 swig_result = fresult
 end function
 
+function FSUNLinSolSetInfoFile_SPBCGS(s, info_file) & ! public entry point: forwards s and info_file to the C wrapper
+result(swig_result)
+use, intrinsic :: ISO_C_BINDING
+integer(C_INT) :: swig_result
+type(SUNLinearSolver), target, intent(inout) :: s
+type(C_PTR) :: info_file ! forwarded as-is; the C wrapper casts it to FILE *
+integer(C_INT) :: fresult 
+type(C_PTR) :: farg1 
+type(C_PTR) :: farg2 
+
+farg1 = c_loc(s) ! C address of the solver object
+farg2 = info_file
+fresult = swigc_FSUNLinSolSetInfoFile_SPBCGS(farg1, farg2)
+swig_result = fresult ! return code from the C routine, unchanged
+end function
+
+function FSUNLinSolSetPrintLevel_SPBCGS(s, print_level) & ! public entry point: forwards s and print_level to the C wrapper
+result(swig_result)
+use, intrinsic :: ISO_C_BINDING
+integer(C_INT) :: swig_result
+type(SUNLinearSolver), target, intent(inout) :: s
+integer(C_INT), intent(in) :: print_level
+integer(C_INT) :: fresult 
+type(C_PTR) :: farg1 
+integer(C_INT) :: farg2 
+
+farg1 = c_loc(s) ! C address of the solver object
+farg2 = print_level ! local copy handed to the wrapper
+fresult = swigc_FSUNLinSolSetPrintLevel_SPBCGS(farg1, farg2)
+swig_result = fresult ! return code from the C routine, unchanged
+end function
+
 
 end module
diff --git a/src/sunlinsol/spbcgs/sunlinsol_spbcgs.c b/src/sunlinsol/spbcgs/sunlinsol_spbcgs.c
index 625a0dcb09..055195254b 100644
--- a/src/sunlinsol/spbcgs/sunlinsol_spbcgs.c
+++ b/src/sunlinsol/spbcgs/sunlinsol_spbcgs.c
@@ -23,6 +23,8 @@
 #include <sunlinsol/sunlinsol_spbcgs.h>
 #include <sundials/sundials_math.h>
 
+#include "sundials_debug.h"
+
 #define ZERO RCONST(0.0)
 #define ONE  RCONST(1.0)
 
@@ -110,25 +112,27 @@ SUNLinearSolver SUNLinSol_SPBCGS(N_Vector y, int pretype, int maxl)
   S->content = content;
 
   /* Fill content */
-  content->last_flag = 0;
-  content->maxl      = maxl;
-  content->pretype   = pretype;
-  content->numiters  = 0;
-  content->resnorm   = ZERO;
-  content->r_star    = NULL;
-  content->r         = NULL;
-  content->p         = NULL;
-  content->q         = NULL;
-  content->u         = NULL;
-  content->Ap        = NULL;
-  content->vtemp     = NULL;
-  content->s1        = NULL;
-  content->s2        = NULL;
-  content->ATimes    = NULL;
-  content->ATData    = NULL;
-  content->Psetup    = NULL;
-  content->Psolve    = NULL;
-  content->PData     = NULL;
+  content->last_flag   = 0;
+  content->maxl        = maxl;
+  content->pretype     = pretype;
+  content->numiters    = 0;
+  content->resnorm     = ZERO;
+  content->r_star      = NULL;
+  content->r           = NULL;
+  content->p           = NULL;
+  content->q           = NULL;
+  content->u           = NULL;
+  content->Ap          = NULL;
+  content->vtemp       = NULL;
+  content->s1          = NULL;
+  content->s2          = NULL;
+  content->ATimes      = NULL;
+  content->ATData      = NULL;
+  content->Psetup      = NULL;
+  content->Psolve      = NULL;
+  content->PData       = NULL;
+  content->print_level = 0;
+  content->info_file   = stdout;
 
   /* Allocate content */
   content->r_star = N_VClone(y);
@@ -218,12 +222,24 @@ int SUNLinSolInitialize_SPBCGS(SUNLinearSolver S)
 {
   /* ensure valid options */
   if (S == NULL) return(SUNLS_MEM_NULL);
+
+  if (SPBCGS_CONTENT(S)->maxl <= 0)
+    SPBCGS_CONTENT(S)->maxl = SUNSPBCGS_MAXL_DEFAULT;
+
+  if (SPBCGS_CONTENT(S)->ATimes == NULL) {
+    LASTFLAG(S) = SUNLS_ATIMES_NULL;
+    return(LASTFLAG(S));
+  }
+
   if ( (PRETYPE(S) != PREC_LEFT) &&
        (PRETYPE(S) != PREC_RIGHT) &&
        (PRETYPE(S) != PREC_BOTH) )
     PRETYPE(S) = PREC_NONE;
-  if (SPBCGS_CONTENT(S)->maxl <= 0)
-    SPBCGS_CONTENT(S)->maxl = SUNSPBCGS_MAXL_DEFAULT;
+
+  if ((PRETYPE(S) != PREC_NONE) && (SPBCGS_CONTENT(S)->Psolve == NULL)) {
+    LASTFLAG(S) = SUNLS_PSOLVE_NULL;
+    return(LASTFLAG(S));
+  }
 
   /* no additional memory to allocate */
 
@@ -351,6 +367,23 @@ int SUNLinSolSolve_SPBCGS(SUNLinearSolver S, SUNMatrix A, N_Vector x,
   scale_x = (sx != NULL);
   scale_b = (sb != NULL);
 
+#ifdef SUNDIALS_BUILD_WITH_MONITORING
+  if (SPBCGS_CONTENT(S)->print_level && SPBCGS_CONTENT(S)->info_file)
+    fprintf(SPBCGS_CONTENT(S)->info_file, "SUNLINSOL_SPBCGS:\n");
+#endif
+
+  /* Check if Atimes function has been set */
+  if (atimes == NULL) {
+    LASTFLAG(S) = SUNLS_ATIMES_NULL;
+    return(LASTFLAG(S));
+  }
+
+  /* If preconditioning, check if psolve has been set */
+  if ((preOnLeft || preOnRight) && psolve == NULL) {
+    LASTFLAG(S) = SUNLS_PSOLVE_NULL;
+    return(LASTFLAG(S));
+  }
+
   /* Set r_star to initial (unscaled) residual r_0 = b - A*x_0 */
 
   if (N_VDotProd(x, x) == ZERO) N_VScale(ONE, b, r_star);
@@ -387,6 +420,17 @@ int SUNLinSolSolve_SPBCGS(SUNLinearSolver S, SUNMatrix A, N_Vector x,
      return if small */
 
   *res_norm = r_norm = rho = SUNRsqrt(beta_denom);
+
+#ifdef SUNDIALS_BUILD_WITH_MONITORING
+  /* print the initial residual */
+  if (SPBCGS_CONTENT(S)->print_level && SPBCGS_CONTENT(S)->info_file)
+  {
+    fprintf(SPBCGS_CONTENT(S)->info_file,
+            SUNLS_MSG_RESIDUAL,
+            (long int) 0, *res_norm);
+  }
+#endif
+
   if (r_norm <= delta) {
     LASTFLAG(S) = SUNLS_SUCCESS;
     return(LASTFLAG(S));
@@ -529,6 +573,17 @@ int SUNLinSolSolve_SPBCGS(SUNLinearSolver S, SUNMatrix A, N_Vector x,
     /* Set rho = norm(r) and check convergence */
 
     *res_norm = rho = SUNRsqrt(N_VDotProd(r, r));
+
+#ifdef SUNDIALS_BUILD_WITH_MONITORING
+      /* print current iteration number and the residual */
+      if (SPBCGS_CONTENT(S)->print_level && SPBCGS_CONTENT(S)->info_file)
+      {
+        fprintf(SPBCGS_CONTENT(S)->info_file,
+                SUNLS_MSG_RESIDUAL,
+                (long int) *nli, *res_norm);
+      }
+#endif
+
     if (rho <= delta) {
       converged = SUNTRUE;
       break;
@@ -674,3 +729,43 @@ int SUNLinSolFree_SPBCGS(SUNLinearSolver S)
   free(S); S = NULL;
   return(SUNLS_SUCCESS);
 }
+
+
+int SUNLinSolSetInfoFile_SPBCGS(SUNLinearSolver S,
+                                FILE* info_file)
+{
+#ifdef SUNDIALS_BUILD_WITH_MONITORING
+  /* check that the linear solver is non-null */
+  if (S == NULL)
+    return(SUNLS_MEM_NULL);
+
+  SPBCGS_CONTENT(S)->info_file = info_file;
+
+  return(SUNLS_SUCCESS);
+#else
+  SUNDIALS_DEBUG_PRINT("ERROR in SUNLinSolSetInfoFile_SPBCGS: SUNDIALS was not built with monitoring\n");
+  return(SUNLS_ILL_INPUT);
+#endif
+}
+
+
+int SUNLinSolSetPrintLevel_SPBCGS(SUNLinearSolver S,
+                                  int print_level)
+{
+#ifdef SUNDIALS_BUILD_WITH_MONITORING
+  /* check that the linear solver is non-null */
+  if (S == NULL)
+    return(SUNLS_MEM_NULL);
+
+  /* check for valid print level */
+  if (print_level < 0 || print_level > 1)
+    return(SUNLS_ILL_INPUT);
+
+  SPBCGS_CONTENT(S)->print_level = print_level;
+
+  return(SUNLS_SUCCESS);
+#else
+  SUNDIALS_DEBUG_PRINT("ERROR in SUNLinSolSetPrintLevel_SPBCGS: SUNDIALS was not built with monitoring\n");
+  return(SUNLS_ILL_INPUT);
+#endif
+}
diff --git a/src/sunlinsol/spfgmr/CMakeLists.txt b/src/sunlinsol/spfgmr/CMakeLists.txt
index e0226c251a..8b13595879 100644
--- a/src/sunlinsol/spfgmr/CMakeLists.txt
+++ b/src/sunlinsol/spfgmr/CMakeLists.txt
@@ -49,6 +49,9 @@ if(BUILD_STATIC_LIBS)
       PRIVATE m)
   endif()
 
+  target_include_directories(sundials_sunlinsolspfgmr_static
+    PRIVATE ${sundials_SOURCE_DIR}/src/sundials)
+
   target_compile_definitions(sundials_sunlinsolspfgmr_static
     PUBLIC -DBUILD_SUNDIALS_LIBRARY)
 
@@ -79,6 +82,9 @@ if(BUILD_SHARED_LIBS)
       PRIVATE m)
   endif()
 
+  target_include_directories(sundials_sunlinsolspfgmr_shared
+    PRIVATE ${sundials_SOURCE_DIR}/src/sundials)
+
   target_compile_definitions(sundials_sunlinsolspfgmr_shared
     PUBLIC -DBUILD_SUNDIALS_LIBRARY)
 
diff --git a/src/sunlinsol/spfgmr/fmod/fsunlinsol_spfgmr_mod.c b/src/sunlinsol/spfgmr/fmod/fsunlinsol_spfgmr_mod.c
index 66276be718..94ee1f1289 100644
--- a/src/sunlinsol/spfgmr/fmod/fsunlinsol_spfgmr_mod.c
+++ b/src/sunlinsol/spfgmr/fmod/fsunlinsol_spfgmr_mod.c
@@ -520,4 +520,32 @@ SWIGEXPORT int _wrap_FSUNLinSolFree_SPFGMR(SUNLinearSolver farg1) {
 }
 
 
+SWIGEXPORT int _wrap_FSUNLinSolSetInfoFile_SPFGMR(SUNLinearSolver farg1, void *farg2) {
+  int fresult ;
+  SUNLinearSolver arg1 = (SUNLinearSolver) 0 ;
+  FILE *arg2 = (FILE *) 0 ;
+  int result;
+  
+  arg1 = (SUNLinearSolver)(farg1);
+  arg2 = (FILE *)(farg2);
+  result = (int)SUNLinSolSetInfoFile_SPFGMR(arg1,arg2);
+  fresult = (int)(result);
+  return fresult;
+}
+
+
+SWIGEXPORT int _wrap_FSUNLinSolSetPrintLevel_SPFGMR(SUNLinearSolver farg1, int const *farg2) {
+  int fresult ;
+  SUNLinearSolver arg1 = (SUNLinearSolver) 0 ;
+  int arg2 ;
+  int result;
+  
+  arg1 = (SUNLinearSolver)(farg1);
+  arg2 = (int)(*farg2);
+  result = (int)SUNLinSolSetPrintLevel_SPFGMR(arg1,arg2);
+  fresult = (int)(result);
+  return fresult;
+}
+
+
 
diff --git a/src/sunlinsol/spfgmr/fmod/fsunlinsol_spfgmr_mod.f90 b/src/sunlinsol/spfgmr/fmod/fsunlinsol_spfgmr_mod.f90
index 64206fcaf4..37cbd383f4 100644
--- a/src/sunlinsol/spfgmr/fmod/fsunlinsol_spfgmr_mod.f90
+++ b/src/sunlinsol/spfgmr/fmod/fsunlinsol_spfgmr_mod.f90
@@ -55,6 +55,8 @@ module fsunlinsol_spfgmr_mod
  public :: FSUNLinSolLastFlag_SPFGMR
  public :: FSUNLinSolSpace_SPFGMR
  public :: FSUNLinSolFree_SPFGMR
+ public :: FSUNLinSolSetInfoFile_SPFGMR
+ public :: FSUNLinSolSetPrintLevel_SPFGMR
 
 ! WRAPPER DECLARATIONS
 interface
@@ -258,6 +260,24 @@ function swigc_FSUNLinSolFree_SPFGMR(farg1) &
 integer(C_INT) :: fresult
 end function
 
+function swigc_FSUNLinSolSetInfoFile_SPFGMR(farg1, farg2) &
+bind(C, name="_wrap_FSUNLinSolSetInfoFile_SPFGMR") &
+result(fresult)
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR), value :: farg1
+type(C_PTR), value :: farg2
+integer(C_INT) :: fresult
+end function
+
+function swigc_FSUNLinSolSetPrintLevel_SPFGMR(farg1, farg2) &
+bind(C, name="_wrap_FSUNLinSolSetPrintLevel_SPFGMR") &
+result(fresult)
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR), value :: farg1
+integer(C_INT), intent(in) :: farg2
+integer(C_INT) :: fresult
+end function
+
 end interface
 
 
@@ -621,5 +641,37 @@ function FSUNLinSolFree_SPFGMR(s) &
 swig_result = fresult
 end function
 
+function FSUNLinSolSetInfoFile_SPFGMR(ls, info_file) &
+result(swig_result)
+use, intrinsic :: ISO_C_BINDING
+integer(C_INT) :: swig_result
+type(SUNLinearSolver), target, intent(inout) :: ls
+type(C_PTR) :: info_file
+integer(C_INT) :: fresult 
+type(C_PTR) :: farg1 
+type(C_PTR) :: farg2 
+
+farg1 = c_loc(ls)
+farg2 = info_file
+fresult = swigc_FSUNLinSolSetInfoFile_SPFGMR(farg1, farg2)
+swig_result = fresult
+end function
+
+function FSUNLinSolSetPrintLevel_SPFGMR(ls, print_level) &
+result(swig_result)
+use, intrinsic :: ISO_C_BINDING
+integer(C_INT) :: swig_result
+type(SUNLinearSolver), target, intent(inout) :: ls
+integer(C_INT), intent(in) :: print_level
+integer(C_INT) :: fresult 
+type(C_PTR) :: farg1 
+integer(C_INT) :: farg2 
+
+farg1 = c_loc(ls)
+farg2 = print_level
+fresult = swigc_FSUNLinSolSetPrintLevel_SPFGMR(farg1, farg2)
+swig_result = fresult
+end function
+
 
 end module
diff --git a/src/sunlinsol/spfgmr/sunlinsol_spfgmr.c b/src/sunlinsol/spfgmr/sunlinsol_spfgmr.c
index cbf1b81b7f..e2c6d6526e 100644
--- a/src/sunlinsol/spfgmr/sunlinsol_spfgmr.c
+++ b/src/sunlinsol/spfgmr/sunlinsol_spfgmr.c
@@ -23,6 +23,8 @@
 #include <sunlinsol/sunlinsol_spfgmr.h>
 #include <sundials/sundials_math.h>
 
+#include "sundials_debug.h"
+
 #define ZERO RCONST(0.0)
 #define ONE  RCONST(1.0)
 
@@ -137,6 +139,8 @@ SUNLinearSolver SUNLinSol_SPFGMR(N_Vector y, int pretype, int maxl)
   content->yg           = NULL;
   content->cv           = NULL;
   content->Xv           = NULL;
+  content->print_level  = 0;
+  content->info_file    = stdout;
 
   /* Allocate content */
   content->xcor = N_VClone(y);
@@ -239,11 +243,21 @@ int SUNLinSolInitialize_SPFGMR(SUNLinearSolver S)
   /* ensure valid options */
   if (content->max_restarts < 0)
     content->max_restarts = SUNSPFGMR_MAXRS_DEFAULT;
+
+  if (content->ATimes == NULL) {
+    LASTFLAG(S) = SUNLS_ATIMES_NULL;
+    return(LASTFLAG(S));
+  }
+
   if ( (content->pretype != PREC_LEFT) &&
        (content->pretype != PREC_RIGHT) &&
        (content->pretype != PREC_BOTH) )
     content->pretype = PREC_NONE;
 
+  if ((content->pretype != PREC_NONE) && (content->Psolve == NULL)) {
+    LASTFLAG(S) = SUNLS_PSOLVE_NULL;
+    return(LASTFLAG(S));
+  }
 
   /* allocate solver-specific memory (where the size depends on the
      choice of maxl) here */
@@ -252,7 +266,6 @@ int SUNLinSolInitialize_SPFGMR(SUNLinearSolver S)
   if (content->V == NULL) {
     content->V = N_VCloneVectorArray(content->maxl+1, content->vtemp);
     if (content->V == NULL) {
-      SUNLinSolFree(S);
       content->last_flag = SUNLS_MEM_FAIL;
       return(SUNLS_MEM_FAIL);
     }
@@ -262,7 +275,6 @@ int SUNLinSolInitialize_SPFGMR(SUNLinearSolver S)
   if (content->Z == NULL) {
     content->Z = N_VCloneVectorArray(content->maxl+1, content->vtemp);
     if (content->Z == NULL) {
-      SUNLinSolFree(S);
       content->last_flag = SUNLS_MEM_FAIL;
       return(SUNLS_MEM_FAIL);
     }
@@ -272,7 +284,6 @@ int SUNLinSolInitialize_SPFGMR(SUNLinearSolver S)
   if (content->Hes == NULL) {
     content->Hes = (realtype **) malloc((content->maxl+1)*sizeof(realtype *));
     if (content->Hes == NULL) {
-      SUNLinSolFree(S);
       content->last_flag = SUNLS_MEM_FAIL;
       return(SUNLS_MEM_FAIL);
     }
@@ -281,7 +292,6 @@ int SUNLinSolInitialize_SPFGMR(SUNLinearSolver S)
       content->Hes[k] = NULL;
       content->Hes[k] = (realtype *) malloc(content->maxl*sizeof(realtype));
       if (content->Hes[k] == NULL) {
-        SUNLinSolFree(S);
         content->last_flag = SUNLS_MEM_FAIL;
         return(SUNLS_MEM_FAIL);
       }
@@ -292,7 +302,6 @@ int SUNLinSolInitialize_SPFGMR(SUNLinearSolver S)
   if (content->givens == NULL) {
     content->givens = (realtype *) malloc(2*content->maxl*sizeof(realtype));
     if (content->givens == NULL) {
-      SUNLinSolFree(S);
       content->last_flag = SUNLS_MEM_FAIL;
       return(SUNLS_MEM_FAIL);
     }
@@ -302,7 +311,6 @@ int SUNLinSolInitialize_SPFGMR(SUNLinearSolver S)
   if (content->yg == NULL) {
     content->yg = (realtype *) malloc((content->maxl+1)*sizeof(realtype));
     if (content->yg == NULL) {
-      SUNLinSolFree(S);
       content->last_flag = SUNLS_MEM_FAIL;
       return(SUNLS_MEM_FAIL);
     }
@@ -312,7 +320,6 @@ int SUNLinSolInitialize_SPFGMR(SUNLinearSolver S)
   if (content->cv == NULL) {
     content->cv = (realtype *) malloc((content->maxl+1)*sizeof(realtype));
     if (content->cv == NULL) {
-      SUNLinSolFree(S);
       content->last_flag = SUNLS_MEM_FAIL;
       return(SUNLS_MEM_FAIL);
     }
@@ -322,7 +329,6 @@ int SUNLinSolInitialize_SPFGMR(SUNLinearSolver S)
   if (content->Xv == NULL) {
     content->Xv = (N_Vector *) malloc((content->maxl+1)*sizeof(N_Vector));
     if (content->Xv == NULL) {
-      SUNLinSolFree(S);
       content->last_flag = SUNLS_MEM_FAIL;
       return(SUNLS_MEM_FAIL);
     }
@@ -449,13 +455,30 @@ int SUNLinSolSolve_SPFGMR(SUNLinearSolver S, SUNMatrix A, N_Vector x,
   *nli = 0;
   converged = SUNFALSE;
 
-  /* set booleantype flags for internal solver options */
+  /* Set booleantype flags for internal solver options */
   preOnRight = ( (SPFGMR_CONTENT(S)->pretype == PREC_LEFT) ||
                  (SPFGMR_CONTENT(S)->pretype == PREC_RIGHT) ||
                  (SPFGMR_CONTENT(S)->pretype == PREC_BOTH) );
   scale1 = (s1 != NULL);
   scale2 = (s2 != NULL);
 
+#ifdef SUNDIALS_BUILD_WITH_MONITORING
+  if (SPFGMR_CONTENT(S)->print_level && SPFGMR_CONTENT(S)->info_file)
+    fprintf(SPFGMR_CONTENT(S)->info_file, "SUNLINSOL_SPFGMR:\n");
+#endif
+
+  /* Check if Atimes function has been set */
+  if (atimes == NULL) {
+    LASTFLAG(S) = SUNLS_ATIMES_NULL;
+    return(LASTFLAG(S));
+  }
+
+  /* If preconditioning, check if psolve has been set */
+  if (preOnRight && psolve == NULL) {
+    LASTFLAG(S) = SUNLS_PSOLVE_NULL;
+    return(LASTFLAG(S));
+  }
+
   /* Set vtemp and V[0] to initial (unscaled) residual r_0 = b - A*x_0 */
   if (N_VDotProd(x, x) == ZERO) {
     N_VScale(ONE, b, vtemp);
@@ -478,6 +501,17 @@ int SUNLinSolSolve_SPFGMR(SUNLinearSolver S, SUNMatrix A, N_Vector x,
 
   /* Set r_norm = beta to L2 norm of V[0] = s1 r_0, and return if small */
   *res_norm = r_norm = beta = SUNRsqrt(N_VDotProd(V[0], V[0]));
+
+#ifdef SUNDIALS_BUILD_WITH_MONITORING
+  /* print initial residual */
+  if (SPFGMR_CONTENT(S)->print_level && SPFGMR_CONTENT(S)->info_file)
+  {
+    fprintf(SPFGMR_CONTENT(S)->info_file,
+            SUNLS_MSG_RESIDUAL,
+            (long int) 0, *res_norm);
+  }
+#endif
+
   if (r_norm <= delta) {
     LASTFLAG(S) = SUNLS_SUCCESS;
     return(LASTFLAG(S));
@@ -558,6 +592,17 @@ int SUNLinSolSolve_SPFGMR(SUNLinearSolver S, SUNMatrix A, N_Vector x,
       /* Update residual norm estimate; break if convergence test passes. */
       rotation_product *= givens[2*l+1];
       *res_norm = rho = SUNRabs(rotation_product*r_norm);
+
+#ifdef SUNDIALS_BUILD_WITH_MONITORING
+      /* print current iteration number and the residual */
+      if (SPFGMR_CONTENT(S)->print_level && SPFGMR_CONTENT(S)->info_file)
+      {
+        fprintf(SPFGMR_CONTENT(S)->info_file,
+                SUNLS_MSG_RESIDUAL,
+                (long int) *nli, *res_norm);
+      }
+#endif
+
       if (rho <= delta) { converged = SUNTRUE; break; }
 
       /* Normalize V[l+1] with norm value from the Gram-Schmidt routine. */
@@ -739,3 +784,44 @@ int SUNLinSolFree_SPFGMR(SUNLinearSolver S)
   free(S); S = NULL;
   return(SUNLS_SUCCESS);
 }
+
+
+int SUNLinSolSetInfoFile_SPFGMR(SUNLinearSolver S,
+                                FILE* info_file)
+{
+#ifdef SUNDIALS_BUILD_WITH_MONITORING
+  /* check that the linear solver is non-null */
+  if (S == NULL)
+    return(SUNLS_MEM_NULL);
+
+  SPFGMR_CONTENT(S)->info_file = info_file;
+
+  return(SUNLS_SUCCESS);
+#else
+  SUNDIALS_DEBUG_PRINT("ERROR in SUNLinSolSetInfoFile_SPFGMR: SUNDIALS was not built with monitoring\n");
+  return(SUNLS_ILL_INPUT);
+#endif
+}
+
+
+int SUNLinSolSetPrintLevel_SPFGMR(SUNLinearSolver S,
+                                  int print_level)
+{
+#ifdef SUNDIALS_BUILD_WITH_MONITORING
+  /* check that the linear solver is non-null */
+  if (S == NULL)
+    return(SUNLS_MEM_NULL);
+
+  /* check for valid print level */
+  if (print_level < 0 || print_level > 1)
+    return(SUNLS_ILL_INPUT);
+
+  SPFGMR_CONTENT(S)->print_level = print_level;
+
+  return(SUNLS_SUCCESS);
+#else
+  SUNDIALS_DEBUG_PRINT("ERROR in SUNLinSolSetPrintLevel_SPFGMR: SUNDIALS was not built with monitoring\n");
+  return(SUNLS_ILL_INPUT);
+#endif
+}
+
diff --git a/src/sunlinsol/spgmr/CMakeLists.txt b/src/sunlinsol/spgmr/CMakeLists.txt
index 1a0c436d3c..9668e76699 100644
--- a/src/sunlinsol/spgmr/CMakeLists.txt
+++ b/src/sunlinsol/spgmr/CMakeLists.txt
@@ -49,6 +49,9 @@ if(BUILD_STATIC_LIBS)
       PRIVATE m)
   endif()
 
+  target_include_directories(sundials_sunlinsolspgmr_static
+    PRIVATE ${sundials_SOURCE_DIR}/src/sundials)
+
   target_compile_definitions(sundials_sunlinsolspgmr_static
     PUBLIC -DBUILD_SUNDIALS_LIBRARY)
 
@@ -79,6 +82,9 @@ if(BUILD_SHARED_LIBS)
       PRIVATE m)
   endif()
 
+  target_include_directories(sundials_sunlinsolspgmr_shared
+    PRIVATE ${sundials_SOURCE_DIR}/src/sundials)
+
   target_compile_definitions(sundials_sunlinsolspgmr_shared
     PUBLIC -DBUILD_SUNDIALS_LIBRARY)
 
diff --git a/src/sunlinsol/spgmr/fmod/fsunlinsol_spgmr_mod.c b/src/sunlinsol/spgmr/fmod/fsunlinsol_spgmr_mod.c
index 00d334c5aa..e877d55ea1 100644
--- a/src/sunlinsol/spgmr/fmod/fsunlinsol_spgmr_mod.c
+++ b/src/sunlinsol/spgmr/fmod/fsunlinsol_spgmr_mod.c
@@ -520,4 +520,32 @@ SWIGEXPORT int _wrap_FSUNLinSolFree_SPGMR(SUNLinearSolver farg1) {
 }
 
 
+SWIGEXPORT int _wrap_FSUNLinSolSetInfoFile_SPGMR(SUNLinearSolver farg1, void *farg2) {
+  int fresult ;
+  SUNLinearSolver arg1 = (SUNLinearSolver) 0 ;
+  FILE *arg2 = (FILE *) 0 ;
+  int result;
+  
+  arg1 = (SUNLinearSolver)(farg1);
+  arg2 = (FILE *)(farg2);
+  result = (int)SUNLinSolSetInfoFile_SPGMR(arg1,arg2);
+  fresult = (int)(result);
+  return fresult;
+}
+
+
+SWIGEXPORT int _wrap_FSUNLinSolSetPrintLevel_SPGMR(SUNLinearSolver farg1, int const *farg2) {
+  int fresult ;
+  SUNLinearSolver arg1 = (SUNLinearSolver) 0 ;
+  int arg2 ;
+  int result;
+  
+  arg1 = (SUNLinearSolver)(farg1);
+  arg2 = (int)(*farg2);
+  result = (int)SUNLinSolSetPrintLevel_SPGMR(arg1,arg2);
+  fresult = (int)(result);
+  return fresult;
+}
+
+
 
diff --git a/src/sunlinsol/spgmr/fmod/fsunlinsol_spgmr_mod.f90 b/src/sunlinsol/spgmr/fmod/fsunlinsol_spgmr_mod.f90
index e0bae5ae70..fd0856e81f 100644
--- a/src/sunlinsol/spgmr/fmod/fsunlinsol_spgmr_mod.f90
+++ b/src/sunlinsol/spgmr/fmod/fsunlinsol_spgmr_mod.f90
@@ -55,6 +55,8 @@ module fsunlinsol_spgmr_mod
  public :: FSUNLinSolLastFlag_SPGMR
  public :: FSUNLinSolSpace_SPGMR
  public :: FSUNLinSolFree_SPGMR
+ public :: FSUNLinSolSetInfoFile_SPGMR
+ public :: FSUNLinSolSetPrintLevel_SPGMR
 
 ! WRAPPER DECLARATIONS
 interface
@@ -258,6 +260,24 @@ function swigc_FSUNLinSolFree_SPGMR(farg1) &
 integer(C_INT) :: fresult
 end function
 
+function swigc_FSUNLinSolSetInfoFile_SPGMR(farg1, farg2) &
+bind(C, name="_wrap_FSUNLinSolSetInfoFile_SPGMR") &
+result(fresult)
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR), value :: farg1
+type(C_PTR), value :: farg2
+integer(C_INT) :: fresult
+end function
+
+function swigc_FSUNLinSolSetPrintLevel_SPGMR(farg1, farg2) &
+bind(C, name="_wrap_FSUNLinSolSetPrintLevel_SPGMR") &
+result(fresult)
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR), value :: farg1
+integer(C_INT), intent(in) :: farg2
+integer(C_INT) :: fresult
+end function
+
 end interface
 
 
@@ -621,5 +641,37 @@ function FSUNLinSolFree_SPGMR(s) &
 swig_result = fresult
 end function
 
+function FSUNLinSolSetInfoFile_SPGMR(ls, info_file) &
+result(swig_result)
+use, intrinsic :: ISO_C_BINDING
+integer(C_INT) :: swig_result
+type(SUNLinearSolver), target, intent(inout) :: ls
+type(C_PTR) :: info_file
+integer(C_INT) :: fresult 
+type(C_PTR) :: farg1 
+type(C_PTR) :: farg2 
+
+farg1 = c_loc(ls)
+farg2 = info_file
+fresult = swigc_FSUNLinSolSetInfoFile_SPGMR(farg1, farg2)
+swig_result = fresult
+end function
+
+function FSUNLinSolSetPrintLevel_SPGMR(ls, print_level) &
+result(swig_result)
+use, intrinsic :: ISO_C_BINDING
+integer(C_INT) :: swig_result
+type(SUNLinearSolver), target, intent(inout) :: ls
+integer(C_INT), intent(in) :: print_level
+integer(C_INT) :: fresult 
+type(C_PTR) :: farg1 
+integer(C_INT) :: farg2 
+
+farg1 = c_loc(ls)
+farg2 = print_level
+fresult = swigc_FSUNLinSolSetPrintLevel_SPGMR(farg1, farg2)
+swig_result = fresult
+end function
+
 
 end module
diff --git a/src/sunlinsol/spgmr/sunlinsol_spgmr.c b/src/sunlinsol/spgmr/sunlinsol_spgmr.c
index 5ae129dbdc..1cc39523bc 100644
--- a/src/sunlinsol/spgmr/sunlinsol_spgmr.c
+++ b/src/sunlinsol/spgmr/sunlinsol_spgmr.c
@@ -23,6 +23,8 @@
 #include <sunlinsol/sunlinsol_spgmr.h>
 #include <sundials/sundials_math.h>
 
+#include "sundials_debug.h"
+
 #define ZERO RCONST(0.0)
 #define ONE  RCONST(1.0)
 
@@ -133,6 +135,8 @@ SUNLinearSolver SUNLinSol_SPGMR(N_Vector y, int pretype, int maxl)
   content->yg           = NULL;
   content->cv           = NULL;
   content->Xv           = NULL;
+  content->print_level  = 0;
+  content->info_file    = stdout;
 
   /* Allocate content */
   content->xcor = N_VClone(y);
@@ -235,11 +239,21 @@ int SUNLinSolInitialize_SPGMR(SUNLinearSolver S)
   /* ensure valid options */
   if (content->max_restarts < 0)
     content->max_restarts = SUNSPGMR_MAXRS_DEFAULT;
+
+  if (content->ATimes == NULL) {
+    LASTFLAG(S) = SUNLS_ATIMES_NULL;
+    return(LASTFLAG(S));
+  }
+
   if ( (content->pretype != PREC_LEFT) &&
        (content->pretype != PREC_RIGHT) &&
        (content->pretype != PREC_BOTH) )
     content->pretype = PREC_NONE;
 
+  if ((content->pretype != PREC_NONE) && (content->Psolve == NULL)) {
+    LASTFLAG(S) = SUNLS_PSOLVE_NULL;
+    return(LASTFLAG(S));
+  }
 
   /* allocate solver-specific memory (where the size depends on the
      choice of maxl) here */
@@ -248,7 +262,6 @@ int SUNLinSolInitialize_SPGMR(SUNLinearSolver S)
   if (content->V == NULL) {
     content->V = N_VCloneVectorArray(content->maxl+1, content->vtemp);
     if (content->V == NULL) {
-      SUNLinSolFree(S);
       content->last_flag = SUNLS_MEM_FAIL;
       return(SUNLS_MEM_FAIL);
     }
@@ -258,7 +271,6 @@ int SUNLinSolInitialize_SPGMR(SUNLinearSolver S)
   if (content->Hes == NULL) {
     content->Hes = (realtype **) malloc((content->maxl+1)*sizeof(realtype *));
     if (content->Hes == NULL) {
-      SUNLinSolFree(S);
       content->last_flag = SUNLS_MEM_FAIL;
       return(SUNLS_MEM_FAIL);
     }
@@ -267,7 +279,6 @@ int SUNLinSolInitialize_SPGMR(SUNLinearSolver S)
       content->Hes[k] = NULL;
       content->Hes[k] = (realtype *) malloc(content->maxl*sizeof(realtype));
       if (content->Hes[k] == NULL) {
-        SUNLinSolFree(S);
         content->last_flag = SUNLS_MEM_FAIL;
         return(SUNLS_MEM_FAIL);
       }
@@ -278,7 +289,6 @@ int SUNLinSolInitialize_SPGMR(SUNLinearSolver S)
   if (content->givens == NULL) {
     content->givens = (realtype *) malloc(2*content->maxl*sizeof(realtype));
     if (content->givens == NULL) {
-      SUNLinSolFree(S);
       content->last_flag = SUNLS_MEM_FAIL;
       return(SUNLS_MEM_FAIL);
     }
@@ -288,7 +298,6 @@ int SUNLinSolInitialize_SPGMR(SUNLinearSolver S)
   if (content->yg == NULL) {
     content->yg = (realtype *) malloc((content->maxl+1)*sizeof(realtype));
     if (content->yg == NULL) {
-      SUNLinSolFree(S);
       content->last_flag = SUNLS_MEM_FAIL;
       return(SUNLS_MEM_FAIL);
     }
@@ -298,7 +307,6 @@ int SUNLinSolInitialize_SPGMR(SUNLinearSolver S)
   if (content->cv == NULL) {
     content->cv = (realtype *) malloc((content->maxl+1)*sizeof(realtype));
     if (content->cv == NULL) {
-      SUNLinSolFree(S);
       content->last_flag = SUNLS_MEM_FAIL;
       return(SUNLS_MEM_FAIL);
     }
@@ -308,7 +316,6 @@ int SUNLinSolInitialize_SPGMR(SUNLinearSolver S)
   if (content->Xv == NULL) {
     content->Xv = (N_Vector *) malloc((content->maxl+1)*sizeof(N_Vector));
     if (content->Xv == NULL) {
-      SUNLinSolFree(S);
       content->last_flag = SUNLS_MEM_FAIL;
       return(SUNLS_MEM_FAIL);
     }
@@ -435,7 +442,7 @@ int SUNLinSolSolve_SPGMR(SUNLinearSolver S, SUNMatrix A, N_Vector x,
   *nli = 0;
   converged = SUNFALSE;
 
-  /* set booleantype flags for internal solver options */
+  /* Set booleantype flags for internal solver options */
   preOnLeft  = ( (SPGMR_CONTENT(S)->pretype == PREC_LEFT) ||
                  (SPGMR_CONTENT(S)->pretype == PREC_BOTH) );
   preOnRight = ( (SPGMR_CONTENT(S)->pretype == PREC_RIGHT) ||
@@ -443,6 +450,23 @@ int SUNLinSolSolve_SPGMR(SUNLinearSolver S, SUNMatrix A, N_Vector x,
   scale1 = (s1 != NULL);
   scale2 = (s2 != NULL);
 
+#ifdef SUNDIALS_BUILD_WITH_MONITORING
+  if (SPGMR_CONTENT(S)->print_level && SPGMR_CONTENT(S)->info_file)
+    fprintf(SPGMR_CONTENT(S)->info_file, "SUNLINSOL_SPGMR:\n");
+#endif
+
+  /* Check if Atimes function has been set */
+  if (atimes == NULL) {
+    LASTFLAG(S) = SUNLS_ATIMES_NULL;
+    return(LASTFLAG(S));
+  }
+
+  /* If preconditioning, check if psolve has been set */
+  if ((preOnLeft || preOnRight) && psolve == NULL) {
+    LASTFLAG(S) = SUNLS_PSOLVE_NULL;
+    return(LASTFLAG(S));
+  }
+
   /* Set vtemp and V[0] to initial (unscaled) residual r_0 = b - A*x_0 */
   if (N_VDotProd(x, x) == ZERO) {
     N_VScale(ONE, b, vtemp);
@@ -478,6 +502,17 @@ int SUNLinSolSolve_SPGMR(SUNLinearSolver S, SUNMatrix A, N_Vector x,
   /* Set r_norm = beta to L2 norm of V[0] = s1 P1_inv r_0, and
      return if small  */
   *res_norm = r_norm = beta = SUNRsqrt(N_VDotProd(V[0], V[0]));
+
+#ifdef SUNDIALS_BUILD_WITH_MONITORING
+  /* print initial residual */
+  if (SPGMR_CONTENT(S)->print_level && SPGMR_CONTENT(S)->info_file)
+  {
+    fprintf(SPGMR_CONTENT(S)->info_file,
+            SUNLS_MSG_RESIDUAL,
+            (long int) 0, *res_norm);
+  }
+#endif
+
   if (r_norm <= delta) {
     LASTFLAG(S) = SUNLS_SUCCESS;
     return(LASTFLAG(S));
@@ -574,6 +609,16 @@ int SUNLinSolSolve_SPGMR(SUNLinearSolver S, SUNMatrix A, N_Vector x,
       rotation_product *= givens[2*l+1];
       *res_norm = rho = SUNRabs(rotation_product*r_norm);
 
+#ifdef SUNDIALS_BUILD_WITH_MONITORING
+      /* print current iteration number and the residual */
+      if (SPGMR_CONTENT(S)->print_level && SPGMR_CONTENT(S)->info_file)
+      {
+        fprintf(SPGMR_CONTENT(S)->info_file,
+                SUNLS_MSG_RESIDUAL,
+                (long int) *nli, *res_norm);
+      }
+#endif
+
       if (rho <= delta) { converged = SUNTRUE; break; }
 
       /* Normalize V[l+1] with norm value from the Gram-Schmidt routine */
@@ -781,3 +826,43 @@ int SUNLinSolFree_SPGMR(SUNLinearSolver S)
   free(S); S = NULL;
   return(SUNLS_SUCCESS);
 }
+
+
+int SUNLinSolSetInfoFile_SPGMR(SUNLinearSolver S,
+                               FILE* info_file)
+{
+#ifdef SUNDIALS_BUILD_WITH_MONITORING
+  /* check that the linear solver is non-null */
+  if (S == NULL)
+    return(SUNLS_MEM_NULL);
+
+  SPGMR_CONTENT(S)->info_file = info_file;
+
+  return(SUNLS_SUCCESS);
+#else
+  SUNDIALS_DEBUG_PRINT("ERROR in SUNLinSolSetInfoFile_SPGMR: SUNDIALS was not built with monitoring\n");
+  return(SUNLS_ILL_INPUT);
+#endif
+}
+
+
+int SUNLinSolSetPrintLevel_SPGMR(SUNLinearSolver S,
+                                 int print_level)
+{
+#ifdef SUNDIALS_BUILD_WITH_MONITORING
+  /* check that the linear solver is non-null */
+  if (S == NULL)
+    return(SUNLS_MEM_NULL);
+
+  /* check for valid print level */
+  if (print_level < 0 || print_level > 1)
+    return(SUNLS_ILL_INPUT);
+
+  SPGMR_CONTENT(S)->print_level = print_level;
+
+  return(SUNLS_SUCCESS);
+#else
+  SUNDIALS_DEBUG_PRINT("ERROR in SUNLinSolSetPrintLevel_SPGMR: SUNDIALS was not built with monitoring\n");
+  return(SUNLS_ILL_INPUT);
+#endif
+}
diff --git a/src/sunlinsol/sptfqmr/CMakeLists.txt b/src/sunlinsol/sptfqmr/CMakeLists.txt
index 0a6b41994f..55fe91f53b 100644
--- a/src/sunlinsol/sptfqmr/CMakeLists.txt
+++ b/src/sunlinsol/sptfqmr/CMakeLists.txt
@@ -49,6 +49,9 @@ if(BUILD_STATIC_LIBS)
       PRIVATE m)
   endif()
 
+  target_include_directories(sundials_sunlinsolsptfqmr_static
+    PRIVATE ${sundials_SOURCE_DIR}/src/sundials)
+
   target_compile_definitions(sundials_sunlinsolsptfqmr_static
     PUBLIC -DBUILD_SUNDIALS_LIBRARY)
 
@@ -79,6 +82,9 @@ if(BUILD_SHARED_LIBS)
       PRIVATE m)
   endif()
 
+  target_include_directories(sundials_sunlinsolsptfqmr_shared
+    PRIVATE ${sundials_SOURCE_DIR}/src/sundials)
+
   target_compile_definitions(sundials_sunlinsolsptfqmr_shared
     PUBLIC -DBUILD_SUNDIALS_LIBRARY)
 
diff --git a/src/sunlinsol/sptfqmr/fmod/fsunlinsol_sptfqmr_mod.c b/src/sunlinsol/sptfqmr/fmod/fsunlinsol_sptfqmr_mod.c
index a48255e917..2d6755e3b3 100644
--- a/src/sunlinsol/sptfqmr/fmod/fsunlinsol_sptfqmr_mod.c
+++ b/src/sunlinsol/sptfqmr/fmod/fsunlinsol_sptfqmr_mod.c
@@ -492,4 +492,32 @@ SWIGEXPORT int _wrap_FSUNLinSolFree_SPTFQMR(SUNLinearSolver farg1) {
 }
 
 
+SWIGEXPORT int _wrap_FSUNLinSolSetInfoFile_SPTFQMR(SUNLinearSolver farg1, void *farg2) {
+  int fresult ;
+  SUNLinearSolver arg1 = (SUNLinearSolver) 0 ;
+  FILE *arg2 = (FILE *) 0 ;
+  int result;
+  
+  arg1 = (SUNLinearSolver)(farg1);
+  arg2 = (FILE *)(farg2);
+  result = (int)SUNLinSolSetInfoFile_SPTFQMR(arg1,arg2);
+  fresult = (int)(result);
+  return fresult;
+}
+
+
+SWIGEXPORT int _wrap_FSUNLinSolSetPrintLevel_SPTFQMR(SUNLinearSolver farg1, int const *farg2) {
+  int fresult ;
+  SUNLinearSolver arg1 = (SUNLinearSolver) 0 ;
+  int arg2 ;
+  int result;
+  
+  arg1 = (SUNLinearSolver)(farg1);
+  arg2 = (int)(*farg2);
+  result = (int)SUNLinSolSetPrintLevel_SPTFQMR(arg1,arg2);
+  fresult = (int)(result);
+  return fresult;
+}
+
+
 
diff --git a/src/sunlinsol/sptfqmr/fmod/fsunlinsol_sptfqmr_mod.f90 b/src/sunlinsol/sptfqmr/fmod/fsunlinsol_sptfqmr_mod.f90
index 218d2e1c85..3657f87f2e 100644
--- a/src/sunlinsol/sptfqmr/fmod/fsunlinsol_sptfqmr_mod.f90
+++ b/src/sunlinsol/sptfqmr/fmod/fsunlinsol_sptfqmr_mod.f90
@@ -52,6 +52,8 @@ module fsunlinsol_sptfqmr_mod
  public :: FSUNLinSolLastFlag_SPTFQMR
  public :: FSUNLinSolSpace_SPTFQMR
  public :: FSUNLinSolFree_SPTFQMR
+ public :: FSUNLinSolSetInfoFile_SPTFQMR
+ public :: FSUNLinSolSetPrintLevel_SPTFQMR
 
 ! WRAPPER DECLARATIONS
 interface
@@ -237,6 +239,24 @@ function swigc_FSUNLinSolFree_SPTFQMR(farg1) &
 integer(C_INT) :: fresult
 end function
 
+function swigc_FSUNLinSolSetInfoFile_SPTFQMR(farg1, farg2) &
+bind(C, name="_wrap_FSUNLinSolSetInfoFile_SPTFQMR") &
+result(fresult)
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR), value :: farg1
+type(C_PTR), value :: farg2
+integer(C_INT) :: fresult
+end function
+
+function swigc_FSUNLinSolSetPrintLevel_SPTFQMR(farg1, farg2) &
+bind(C, name="_wrap_FSUNLinSolSetPrintLevel_SPTFQMR") &
+result(fresult)
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR), value :: farg1
+integer(C_INT), intent(in) :: farg2
+integer(C_INT) :: fresult
+end function
+
 end interface
 
 
@@ -568,5 +588,37 @@ function FSUNLinSolFree_SPTFQMR(s) &
 swig_result = fresult
 end function
 
+function FSUNLinSolSetInfoFile_SPTFQMR(ls, info_file) &
+result(swig_result)
+use, intrinsic :: ISO_C_BINDING
+integer(C_INT) :: swig_result
+type(SUNLinearSolver), target, intent(inout) :: ls
+type(C_PTR) :: info_file
+integer(C_INT) :: fresult 
+type(C_PTR) :: farg1 
+type(C_PTR) :: farg2 
+
+farg1 = c_loc(ls)
+farg2 = info_file
+fresult = swigc_FSUNLinSolSetInfoFile_SPTFQMR(farg1, farg2)
+swig_result = fresult
+end function
+
+function FSUNLinSolSetPrintLevel_SPTFQMR(ls, print_level) &
+result(swig_result)
+use, intrinsic :: ISO_C_BINDING
+integer(C_INT) :: swig_result
+type(SUNLinearSolver), target, intent(inout) :: ls
+integer(C_INT), intent(in) :: print_level
+integer(C_INT) :: fresult 
+type(C_PTR) :: farg1 
+integer(C_INT) :: farg2 
+
+farg1 = c_loc(ls)
+farg2 = print_level
+fresult = swigc_FSUNLinSolSetPrintLevel_SPTFQMR(farg1, farg2)
+swig_result = fresult
+end function
+
 
 end module
diff --git a/src/sunlinsol/sptfqmr/sunlinsol_sptfqmr.c b/src/sunlinsol/sptfqmr/sunlinsol_sptfqmr.c
index 4b64b2e4e4..43d07c8d52 100644
--- a/src/sunlinsol/sptfqmr/sunlinsol_sptfqmr.c
+++ b/src/sunlinsol/sptfqmr/sunlinsol_sptfqmr.c
@@ -22,6 +22,8 @@
 #include <sunlinsol/sunlinsol_sptfqmr.h>
 #include <sundials/sundials_math.h>
 
+#include "sundials_debug.h"
+
 #define ZERO RCONST(0.0)
 #define ONE  RCONST(1.0)
 
@@ -108,28 +110,30 @@ SUNLinearSolver SUNLinSol_SPTFQMR(N_Vector y, int pretype, int maxl)
   S->content = content;
 
   /* Fill content */
-  content->last_flag = 0;
-  content->maxl      = maxl;
-  content->pretype   = pretype;
-  content->numiters  = 0;
-  content->resnorm   = ZERO;
-  content->r_star    = NULL;
-  content->q         = NULL;
-  content->d         = NULL;
-  content->v         = NULL;
-  content->p         = NULL;
-  content->r         = NULL;
-  content->u         = NULL;
-  content->vtemp1    = NULL;
-  content->vtemp2    = NULL;
-  content->vtemp3    = NULL;
-  content->s1        = NULL;
-  content->s2        = NULL;
-  content->ATimes    = NULL;
-  content->ATData    = NULL;
-  content->Psetup    = NULL;
-  content->Psolve    = NULL;
-  content->PData     = NULL;
+  content->last_flag   = 0;
+  content->maxl        = maxl;
+  content->pretype     = pretype;
+  content->numiters    = 0;
+  content->resnorm     = ZERO;
+  content->r_star      = NULL;
+  content->q           = NULL;
+  content->d           = NULL;
+  content->v           = NULL;
+  content->p           = NULL;
+  content->r           = NULL;
+  content->u           = NULL;
+  content->vtemp1      = NULL;
+  content->vtemp2      = NULL;
+  content->vtemp3      = NULL;
+  content->s1          = NULL;
+  content->s2          = NULL;
+  content->ATimes      = NULL;
+  content->ATData      = NULL;
+  content->Psetup      = NULL;
+  content->Psolve      = NULL;
+  content->PData       = NULL;
+  content->print_level = 0;
+  content->info_file   = stdout;
 
   /* Allocate content */
   content->r_star = N_VClone(y);
@@ -233,12 +237,23 @@ int SUNLinSolInitialize_SPTFQMR(SUNLinearSolver S)
   content = SPTFQMR_CONTENT(S);
 
   /* ensure valid options */
+  if (content->maxl <= 0)
+    content->maxl = SUNSPTFQMR_MAXL_DEFAULT;
+
+  if (content->ATimes == NULL) {
+    LASTFLAG(S) = SUNLS_ATIMES_NULL;
+    return(LASTFLAG(S));
+  }
+
   if ( (content->pretype != PREC_LEFT) &&
        (content->pretype != PREC_RIGHT) &&
        (content->pretype != PREC_BOTH) )
     content->pretype = PREC_NONE;
-  if (content->maxl <= 0)
-    content->maxl = SUNSPTFQMR_MAXL_DEFAULT;
+
+  if ((content->pretype != PREC_NONE) && (content->Psolve == NULL)) {
+    LASTFLAG(S) = SUNLS_PSOLVE_NULL;
+    return(LASTFLAG(S));
+  }
 
   /* no additional memory to allocate */
 
@@ -374,6 +389,23 @@ int SUNLinSolSolve_SPTFQMR(SUNLinearSolver S, SUNMatrix A, N_Vector x,
   scale_x = (sx != NULL);
   scale_b = (sb != NULL);
 
+#ifdef SUNDIALS_BUILD_WITH_MONITORING
+  if (SPTFQMR_CONTENT(S)->print_level && SPTFQMR_CONTENT(S)->info_file)
+    fprintf(SPTFQMR_CONTENT(S)->info_file, "SUNLINSOL_SPTFQMR:\n");
+#endif
+
+  /* Check if Atimes function has been set */
+  if (atimes == NULL) {
+    LASTFLAG(S) = SUNLS_ATIMES_NULL;
+    return(LASTFLAG(S));
+  }
+
+  /* If preconditioning, check if psolve has been set */
+  if ((preOnLeft || preOnRight) && psolve == NULL) {
+    LASTFLAG(S) = SUNLS_PSOLVE_NULL;
+    return(LASTFLAG(S));
+  }
+
   /* Set r_star to initial (unscaled) residual r_star = r_0 = b - A*x_0 */
   /* NOTE: if x == 0 then just set residual to b and continue */
   if (N_VDotProd(x, x) == ZERO) N_VScale(ONE, b, r_star);
@@ -409,6 +441,17 @@ int SUNLinSolSolve_SPTFQMR(SUNLinearSolver S, SUNMatrix A, N_Vector x,
   /* Compute norm of initial residual (r_0) to see if we really need
      to do anything */
   *res_norm = r_init_norm = SUNRsqrt(rho[0]);
+
+#ifdef SUNDIALS_BUILD_WITH_MONITORING
+  /* print initial residual */
+  if (SPTFQMR_CONTENT(S)->print_level && SPTFQMR_CONTENT(S)->info_file)
+  {
+    fprintf(SPTFQMR_CONTENT(S)->info_file,
+            SUNLS_MSG_RESIDUAL,
+            (long int) 0, *res_norm);
+  }
+#endif
+
   if (r_init_norm <= delta) {
     LASTFLAG(S) = SUNLS_SUCCESS;
     return(LASTFLAG(S));
@@ -537,11 +580,21 @@ int SUNLinSolSolve_SPTFQMR(SUNLinearSolver S, SUNMatrix A, N_Vector x,
       /* NOTE: just use approximation to norm of residual, if possible */
       *res_norm = r_curr_norm = tau*SUNRsqrt(m+1);
 
+#ifdef SUNDIALS_BUILD_WITH_MONITORING
+      /* print current iteration number and the residual */
+      if (SPTFQMR_CONTENT(S)->print_level && SPTFQMR_CONTENT(S)->info_file)
+      {
+        fprintf(SPTFQMR_CONTENT(S)->info_file,
+                SUNLS_MSG_RESIDUAL,
+                (long int) *nli, *res_norm);
+      }
+#endif
+
       /* Exit inner loop if iteration has converged based upon approximation
 	 to norm of current residual */
       if (r_curr_norm <= delta) {
-	converged = SUNTRUE;
-	break;
+        converged = SUNTRUE;
+        break;
       }
 
       /* Decide if actual norm of residual vector should be computed */
@@ -802,3 +855,43 @@ int SUNLinSolFree_SPTFQMR(SUNLinearSolver S)
   free(S); S = NULL;
   return(SUNLS_SUCCESS);
 }
+
+/* Sets the FILE* that SPTFQMR monitoring output is written to. */
+int SUNLinSolSetInfoFile_SPTFQMR(SUNLinearSolver S,
+                                 FILE* info_file)
+{
+#ifdef SUNDIALS_BUILD_WITH_MONITORING
+  /* check that the linear solver is non-null */
+  if (S == NULL)
+    return(SUNLS_MEM_NULL);
+  /* a NULL info_file is accepted; the solve routine then prints nothing */
+  SPTFQMR_CONTENT(S)->info_file = info_file;
+
+  return(SUNLS_SUCCESS);
+#else /* built without monitoring: always returns SUNLS_ILL_INPUT */
+  SUNDIALS_DEBUG_PRINT("ERROR in SUNLinSolSetInfoFile_SPTFQMR: SUNDIALS was not built with monitoring\n");
+  return(SUNLS_ILL_INPUT);
+#endif
+}
+
+/* Sets the SPTFQMR monitoring print level (0 or 1). */
+int SUNLinSolSetPrintLevel_SPTFQMR(SUNLinearSolver S,
+                                   int print_level)
+{
+#ifdef SUNDIALS_BUILD_WITH_MONITORING
+  /* check that the linear solver is non-null */
+  if (S == NULL)
+    return(SUNLS_MEM_NULL);
+
+  /* check for valid print level: only 0 and 1 are accepted */
+  if (print_level < 0 || print_level > 1)
+    return(SUNLS_ILL_INPUT);
+  /* a nonzero level enables the fprintf calls in the solve routine */
+  SPTFQMR_CONTENT(S)->print_level = print_level;
+
+  return(SUNLS_SUCCESS);
+#else /* built without monitoring: always returns SUNLS_ILL_INPUT */
+  SUNDIALS_DEBUG_PRINT("ERROR in SUNLinSolSetPrintLevel_SPTFQMR: SUNDIALS was not built with monitoring\n");
+  return(SUNLS_ILL_INPUT);
+#endif
+}
diff --git a/src/sunmatrix/cusparse/CMakeLists.txt b/src/sunmatrix/cusparse/CMakeLists.txt
index c78986c136..4034f4fc24 100644
--- a/src/sunmatrix/cusparse/CMakeLists.txt
+++ b/src/sunmatrix/cusparse/CMakeLists.txt
@@ -72,4 +72,4 @@ endif(BUILD_SHARED_LIBS)
 install(FILES ${cusparse_HEADERS} DESTINATION include/sunmatrix)
 
 #
-message(STATUS "Added sunmatrix_cusparse module")
+message(STATUS "Added SUNMATRIX_CUSPARSE module")
diff --git a/src/sunmatrix/cusparse/cusparse_kernels.cuh b/src/sunmatrix/cusparse/cusparse_kernels.cuh
index 0a18bde973..3b82290208 100644
--- a/src/sunmatrix/cusparse/cusparse_kernels.cuh
+++ b/src/sunmatrix/cusparse/cusparse_kernels.cuh
@@ -26,8 +26,6 @@
 
 namespace sundials
 {
-namespace device
-{
 namespace sunmatrix_cusparse
 {
 
@@ -169,7 +167,6 @@ print_kernel(I m, I nnz, I blocknnz, T* A, const I* rowptr, const I* colind)
 #endif
 
 } // namespace sunmatrix_cusparse
-} // namespace device
 } // namespace sundials
 
 #endif
\ No newline at end of file
diff --git a/src/sunmatrix/cusparse/sunmatrix_cusparse.cu b/src/sunmatrix/cusparse/sunmatrix_cusparse.cu
index ba4dc297b5..3135af54af 100644
--- a/src/sunmatrix/cusparse/sunmatrix_cusparse.cu
+++ b/src/sunmatrix/cusparse/sunmatrix_cusparse.cu
@@ -27,15 +27,14 @@
 #include "sundials_debug.h"
 #include "cusparse_kernels.cuh"
 
+
 /* Use the namespace for the kernels */
-using namespace sundials::device::sunmatrix_cusparse;
+using namespace sundials::sunmatrix_cusparse;
 
 /* Constants */
 #define ZERO RCONST(0.0)
 #define ONE  RCONST(1.0)
 
-#define MAX_THREAD_PER_BLOCK(val) ( (val > 16*CUDA_WARP_SIZE) ? (16*CUDA_WARP_SIZE) : (val) )
-
 /* Private function prototypes */
 static booleantype SMCompatible_cuSparse(SUNMatrix A, SUNMatrix B);
 static SUNMatrix SUNMatrix_cuSparse_NewEmpty();
@@ -59,18 +58,68 @@ static SUNMatrix SUNMatrix_cuSparse_NewEmpty();
 #define SMCU_NP_S(A)          ( SMCU_CONTENT_S(A)->NP )
 #define SMCU_SPARSETYPE_S(A)  ( SMCU_CONTENT_S(A)->sparse_type )
 #define SMCU_OWNDATA_S(A)     ( SMCU_CONTENT_S(A)->own_data )
+#define SMCU_OWNEXEC_S(A)     ( SMCU_CONTENT_S(A)->own_exec )
 #define SMCU_DATA_S(A)        ( SMCU_CONTENT_S(A)->data )
 #define SMCU_INDEXVALS_S(A)   ( SMCU_CONTENT_S(A)->colind )
 #define SMCU_INDEXPTRS_S(A)   ( SMCU_CONTENT_S(A)->rowptrs )
 #define SMCU_MATDESCR_S(A)    ( SMCU_CONTENT_S(A)->mat_descr )
 #define SMCU_CUSPHANDLE_S(A)  ( SMCU_CONTENT_S(A)->cusp_handle )
 #define SMCU_FIXEDPATTERN_S(A)( SMCU_CONTENT_S(A)->fixed_pattern )
+#define SMCU_EXECPOLICY_S(A)  ( SMCU_CONTENT_S(A)->exec_policy )
 
 
 /* ------------------------------------------------------------------
- * Constructors.
+ * Default execution policy definition.
+ * 
+ * This policy tries to help us leverage the structure of the matrix.
+ * It will choose block sizes which are a multiple of the warp size,
+ * and it will choose a grid size such that all work elements are
+ * covered.
  * ------------------------------------------------------------------ */
 
+class SUNCuSparseMatrixExecPolicy : public SUNCudaExecPolicy
+{
+public:
+  SUNCuSparseMatrixExecPolicy(const cudaStream_t stream = 0) : stream_(stream) {}
+  SUNCuSparseMatrixExecPolicy(const SUNCuSparseMatrixExecPolicy& ex) : stream_(ex.stream_) {}
+
+  /* Blocks needed to cover numWorkElements with blockDim threads per block.
+     A blockDim of 0 falls back to blockSize(); the result is never 0. */
+  virtual size_t gridSize(size_t numWorkElements, size_t blockDim = 0) const
+  {
+    const size_t threads = (blockDim > 0) ? blockDim : blockSize(numWorkElements);
+    const size_t blocks  = (numWorkElements + threads - 1)/threads;
+    return (blocks > 0) ? blocks : 1;
+  }
+
+  /* Threads per block: numWorkElements rounded up to whole warps (at least
+     one warp), capped at MAX_CUDA_BLOCKSIZE. gridDim is not used. */
+  virtual size_t blockSize(size_t numWorkElements = 0, size_t gridDim = 0) const
+  {
+    const size_t warp = CUDA_WARP_SIZE, cap = MAX_CUDA_BLOCKSIZE;
+    const size_t want = warp*((numWorkElements + warp - 1)/warp);
+    return (want == 0) ? warp : ((want > cap) ? cap : want);
+  }
+
+  virtual cudaStream_t stream() const { return stream_; }
+
+  /* Returns a copy of this policy allocated with new. */
+  virtual CudaExecPolicy* clone() const
+  {
+    return static_cast<CudaExecPolicy*>(new SUNCuSparseMatrixExecPolicy(*this));
+  }
+
+  /* Clamps val to at most MAX_CUDA_BLOCKSIZE. */
+  static size_t max_block_size(int val)
+  { return ( (val > MAX_CUDA_BLOCKSIZE) ? MAX_CUDA_BLOCKSIZE : val ); }
+
+private:
+  const cudaStream_t stream_;
+};
+
+/* ------------------------------------------------------------------
+ * Constructors.
+ * ------------------------------------------------------------------ */
 
 SUNMatrix SUNMatrix_cuSparse_NewCSR(int M, int N, int NNZ, cusparseHandle_t cusp)
 {
@@ -148,6 +197,12 @@ SUNMatrix SUNMatrix_cuSparse_NewCSR(int M, int N, int NNZ, cusparseHandle_t cusp
     return NULL;
   }
 
+  cudaStream_t stream;
+  if (!SUNDIALS_CUSPARSE_VERIFY(cusparseGetStream(cusp, &stream)))
+  {
+    return NULL;
+  }
+
   /* Fill the content */
   SMCU_CONTENT_S(A)->M             = M;
   SMCU_CONTENT_S(A)->N             = N;
@@ -157,6 +212,7 @@ SUNMatrix SUNMatrix_cuSparse_NewCSR(int M, int N, int NNZ, cusparseHandle_t cusp
   SMCU_CONTENT_S(A)->blockcols     = N;
   SMCU_CONTENT_S(A)->blocknnz      = NNZ;
   SMCU_CONTENT_S(A)->own_data      = SUNTRUE;
+  SMCU_CONTENT_S(A)->own_exec      = SUNTRUE;
   SMCU_CONTENT_S(A)->sparse_type   = SUNMAT_CUSPARSE_CSR;
   SMCU_CONTENT_S(A)->colind        = d_colind;
   SMCU_CONTENT_S(A)->rowptrs       = d_rowptr;
@@ -164,6 +220,7 @@ SUNMatrix SUNMatrix_cuSparse_NewCSR(int M, int N, int NNZ, cusparseHandle_t cusp
   SMCU_CONTENT_S(A)->mat_descr     = mat_descr;
   SMCU_CONTENT_S(A)->cusp_handle   = cusp;
   SMCU_CONTENT_S(A)->fixed_pattern = SUNFALSE;
+  SMCU_CONTENT_S(A)->exec_policy   = new SUNCuSparseMatrixExecPolicy(stream);
 
   return A;
 }
@@ -199,6 +256,12 @@ SUNMatrix SUNMatrix_cuSparse_MakeCSR(cusparseMatDescr_t mat_descr, int M, int N,
     return NULL;
   }
 
+  cudaStream_t stream;
+  if (!SUNDIALS_CUSPARSE_VERIFY(cusparseGetStream(cusp, &stream)))
+  {
+    return NULL;
+  }
+
   /* Fill content */
   SMCU_CONTENT_S(A)->M             = M;
   SMCU_CONTENT_S(A)->N             = N;
@@ -208,6 +271,7 @@ SUNMatrix SUNMatrix_cuSparse_MakeCSR(cusparseMatDescr_t mat_descr, int M, int N,
   SMCU_CONTENT_S(A)->blockcols     = N;
   SMCU_CONTENT_S(A)->blocknnz      = NNZ;
   SMCU_CONTENT_S(A)->own_data      = SUNFALSE;
+  SMCU_CONTENT_S(A)->own_exec      = SUNTRUE;
   SMCU_CONTENT_S(A)->sparse_type   = SUNMAT_CUSPARSE_CSR;
   SMCU_CONTENT_S(A)->colind        = colind;
   SMCU_CONTENT_S(A)->rowptrs       = rowptrs;
@@ -215,6 +279,7 @@ SUNMatrix SUNMatrix_cuSparse_MakeCSR(cusparseMatDescr_t mat_descr, int M, int N,
   SMCU_CONTENT_S(A)->mat_descr     = mat_descr;
   SMCU_CONTENT_S(A)->cusp_handle   = cusp;
   SMCU_CONTENT_S(A)->fixed_pattern = SUNFALSE;
+  SMCU_CONTENT_S(A)->exec_policy   = new SUNCuSparseMatrixExecPolicy(stream);
 
   return A;
 }
@@ -310,6 +375,12 @@ SUNMatrix SUNMatrix_cuSparse_NewBlockCSR(int nblocks, int blockrows, int blockco
     return NULL;
   }
 
+  cudaStream_t stream;
+  if (!SUNDIALS_CUSPARSE_VERIFY(cusparseGetStream(cusp, &stream)))
+  {
+    return NULL;
+  }
+
   /* Fill the content */
   SMCU_CONTENT_S(A)->M             = M;
   SMCU_CONTENT_S(A)->N             = N;
@@ -319,6 +390,7 @@ SUNMatrix SUNMatrix_cuSparse_NewBlockCSR(int nblocks, int blockrows, int blockco
   SMCU_CONTENT_S(A)->blockcols     = blockrows;
   SMCU_CONTENT_S(A)->blocknnz      = blocknnz;
   SMCU_CONTENT_S(A)->own_data      = SUNTRUE;
+  SMCU_CONTENT_S(A)->own_exec      = SUNTRUE;
   SMCU_CONTENT_S(A)->sparse_type   = SUNMAT_CUSPARSE_BCSR;
   SMCU_CONTENT_S(A)->colind        = d_colind;
   SMCU_CONTENT_S(A)->rowptrs       = d_rowptr;
@@ -326,6 +398,7 @@ SUNMatrix SUNMatrix_cuSparse_NewBlockCSR(int nblocks, int blockrows, int blockco
   SMCU_CONTENT_S(A)->mat_descr     = mat_descr;
   SMCU_CONTENT_S(A)->cusp_handle   = cusp;
   SMCU_CONTENT_S(A)->fixed_pattern = SUNFALSE;
+  SMCU_CONTENT_S(A)->exec_policy   = new SUNCuSparseMatrixExecPolicy(stream);
 
   return A;
 }
@@ -457,20 +530,30 @@ int SUNMatrix_cuSparse_SetFixedPattern(SUNMatrix A, booleantype yesno)
   return SUNMAT_SUCCESS;
 }
 
+int SUNMatrix_cuSparse_SetKernelExecPolicy(SUNMatrix A, SUNCudaExecPolicy* exec_policy)
+{
+  if (SUNMatGetID(A) != SUNMATRIX_CUSPARSE || exec_policy == NULL)
+    return SUNMAT_ILL_INPUT;
+  /* Install a caller-supplied policy; free the old one only if the matrix owns it. */
+  if (SMCU_OWNEXEC_S(A)) delete SMCU_EXECPOLICY_S(A);
+  SMCU_EXECPOLICY_S(A) = exec_policy;
+  /* Mark it as not owned so SUNMatDestroy_cuSparse will not delete it. */
+  SMCU_OWNEXEC_S(A) = SUNFALSE;
+
+  return SUNMAT_SUCCESS;
+}
 
 int SUNMatrix_cuSparse_CopyToDevice(SUNMatrix dA, realtype* h_data,
                                     int* h_idxptrs, int* h_idxvals)
 {
   cudaError_t cuerr;
   cudaStream_t stream;
-  cusparseStatus_t cusparse_status;
   int nidxvals, nidxptrs;
 
   if (SUNMatGetID(dA) != SUNMATRIX_CUSPARSE)
     return SUNMAT_ILL_INPUT;
 
-  cusparse_status = cusparseGetStream(SMCU_CUSPHANDLE_S(dA), &stream);
-  if (!SUNDIALS_CUSPARSE_VERIFY(cusparse_status)) return SUNMAT_OPERATION_FAIL;
+  stream = SMCU_EXECPOLICY_S(dA)->stream();
 
   if (h_data != NULL)
   {
@@ -520,14 +603,12 @@ int SUNMatrix_cuSparse_CopyFromDevice(SUNMatrix dA, realtype* h_data,
 {
   cudaError_t cuerr;
   cudaStream_t stream;
-  cusparseStatus_t cusparse_status;
   int nidxvals, nidxptrs;
 
   if (SUNMatGetID(dA) != SUNMATRIX_CUSPARSE)
     return SUNMAT_ILL_INPUT;
 
-  cusparse_status = cusparseGetStream(SMCU_CUSPHANDLE_S(dA), &stream);
-  if (!SUNDIALS_CUSPARSE_VERIFY(cusparse_status)) return SUNMAT_OPERATION_FAIL;
+  stream = SMCU_EXECPOLICY_S(dA)->stream();
 
   if (h_data != NULL)
   {
@@ -600,6 +681,7 @@ SUNMatrix SUNMatClone_cuSparse(SUNMatrix A)
   }
 
   SMCU_FIXEDPATTERN_S(B) = SMCU_FIXEDPATTERN_S(A);
+  SMCU_EXECPOLICY_S(B) = SMCU_EXECPOLICY_S(A)->clone();
 
   return B;
 }
@@ -640,6 +722,12 @@ void SUNMatDestroy_cuSparse(SUNMatrix A)
       cusparseDestroyMatDescr(SMCU_MATDESCR_S(A));
     }
 
+    if (SMCU_EXECPOLICY_S(A) && SMCU_OWNEXEC_S(A))
+    {
+      delete SMCU_EXECPOLICY_S(A);
+      SMCU_EXECPOLICY_S(A) = NULL;
+    }
+
     /* free content struct */
     free(A->content);
     A->content = NULL;
@@ -659,7 +747,7 @@ int SUNMatZero_cuSparse(SUNMatrix A)
   cudaError_t cuerr;
   cudaStream_t stream;
 
-  cusparseGetStream(SMCU_CUSPHANDLE_S(A), &stream);
+  stream = SMCU_EXECPOLICY_S(A)->stream();
 
   /* set all data to zero */
   cuerr = cudaMemsetAsync(SMCU_DATA_S(A), 0, SMCU_NNZ_S(A)*sizeof(realtype), stream);
@@ -694,7 +782,7 @@ int SUNMatCopy_cuSparse(SUNMatrix src, SUNMatrix dst)
   if (!SMCompatible_cuSparse(src, dst))
     return SUNMAT_ILL_INPUT;
 
-  cusparseGetStream(SMCU_CUSPHANDLE_S(src), &stream);
+  stream = SMCU_EXECPOLICY_S(src)->stream();
 
   /* Ensure that dst is allocated with at least as
      much memory as we have nonzeros in src */
@@ -736,22 +824,17 @@ int SUNMatCopy_cuSparse(SUNMatrix src, SUNMatrix dst)
 /* Performs A = cA + I. Requires the diagonal to be allocated already. */
 int SUNMatScaleAddI_cuSparse(realtype c, SUNMatrix A)
 {
-  cudaStream_t stream;
-  cusparseStatus_t cusparse_status;
-
-  cusparse_status = cusparseGetStream(SMCU_CUSPHANDLE_S(A), &stream);
-  if (!SUNDIALS_CUSPARSE_VERIFY(cusparse_status)) return SUNMAT_OPERATION_FAIL;
-
   unsigned threadsPerBlock, gridSize;
+  cudaStream_t stream = SMCU_EXECPOLICY_S(A)->stream();
+
   switch (SMCU_SPARSETYPE_S(A))
   {
     case SUNMAT_CUSPARSE_CSR:
       /* Choose the grid size to be the number of rows in the matrix,
         and then choose threadsPerBlock to be a multiple of the warp size
         that results in enough threads to have one per 2 columns. */
-        threadsPerBlock = MAX_THREAD_PER_BLOCK(CUDA_WARP_SIZE*(SMCU_COLUMNS_S(A)/2 + CUDA_WARP_SIZE - 1)/CUDA_WARP_SIZE);
-        gridSize = SMCU_ROWS_S(A);
-
+      threadsPerBlock = SMCU_EXECPOLICY_S(A)->blockSize(SMCU_COLUMNS_S(A)/2);
+      gridSize = SMCU_EXECPOLICY_S(A)->gridSize((size_t)SMCU_ROWS_S(A)*SMCU_COLUMNS_S(A)/2, threadsPerBlock); /* widen first: rows*cols can exceed INT_MAX */
       {
 #ifdef SUNDIALS_CUDA_KERNEL_TIMING
         cudaEvent_t start, stop;
@@ -787,9 +870,8 @@ int SUNMatScaleAddI_cuSparse(realtype c, SUNMatrix A)
       /* Choose the grid size to be the number of blocks in the matrix,
          and then choose threadsPerBlock to be a multiple of the warp size
          that results in enough threads to have one per row of the block. */
-      threadsPerBlock = MAX_THREAD_PER_BLOCK(CUDA_WARP_SIZE*(SMCU_BLOCKROWS_S(A) + CUDA_WARP_SIZE - 1)/CUDA_WARP_SIZE);
-      gridSize = SMCU_NBLOCKS_S(A);
-
+      threadsPerBlock = SMCU_EXECPOLICY_S(A)->blockSize(SMCU_BLOCKROWS_S(A));
+      gridSize = SMCU_EXECPOLICY_S(A)->gridSize(SMCU_NBLOCKS_S(A)*SMCU_BLOCKROWS_S(A), threadsPerBlock);
       {
 #ifdef SUNDIALS_CUDA_KERNEL_TIMING
         cudaEvent_t start, stop;
@@ -840,7 +922,7 @@ int SUNMatScaleAddI_cuSparse(realtype c, SUNMatrix A)
 int SUNMatScaleAdd_cuSparse(realtype c, SUNMatrix A, SUNMatrix B)
 {
   cudaStream_t stream;
-  cusparseStatus_t cusparse_status;
+  unsigned threadsPerBlock, gridSize;
 
   if (!SMCompatible_cuSparse(A, B))
   {
@@ -848,19 +930,16 @@ int SUNMatScaleAdd_cuSparse(realtype c, SUNMatrix A, SUNMatrix B)
     return SUNMAT_ILL_INPUT;
   }
 
-  cusparse_status = cusparseGetStream(SMCU_CUSPHANDLE_S(A), &stream);
-  if (!SUNDIALS_CUSPARSE_VERIFY(cusparse_status)) return SUNMAT_OPERATION_FAIL;
+  stream = SMCU_EXECPOLICY_S(A)->stream();
 
-  unsigned threadsPerBlock, gridSize;
   switch (SMCU_SPARSETYPE_S(A))
   {
     case SUNMAT_CUSPARSE_CSR:
       /* Choose the grid size to be the number of rows in the matrix,
         and then choose threadsPerBlock to be a multiple of the warp size
         that results in enough threads to have one per 2 columns. */
-      threadsPerBlock = MAX_THREAD_PER_BLOCK(CUDA_WARP_SIZE*(SMCU_COLUMNS_S(A)/2 + CUDA_WARP_SIZE - 1)/CUDA_WARP_SIZE);
-      gridSize = SMCU_ROWS_S(A);
-     
+      threadsPerBlock = SMCU_EXECPOLICY_S(A)->blockSize(SMCU_COLUMNS_S(A)/2);
+      gridSize = SMCU_EXECPOLICY_S(A)->gridSize((size_t)SMCU_ROWS_S(A)*SMCU_COLUMNS_S(A)/2, threadsPerBlock); /* widen first: rows*cols can exceed INT_MAX */
       {
 #ifdef SUNDIALS_CUDA_KERNEL_TIMING
         cudaEvent_t start, stop;
@@ -889,15 +968,13 @@ int SUNMatScaleAdd_cuSparse(realtype c, SUNMatrix A, SUNMatrix B)
                 SMCU_NNZ_S(A)*sizeof(realtype)*3/milliseconds/1e6);
 #endif
       }
-
       break;
     case SUNMAT_CUSPARSE_BCSR:
       /* Choose the grid size to be the number of blocks in the matrix,
          and then choose threadsPerBlock to be a multiple of the warp size
          that results in enough threads to have one per row of the block. */
-      threadsPerBlock = MAX_THREAD_PER_BLOCK(CUDA_WARP_SIZE*(SMCU_BLOCKROWS_S(A) + CUDA_WARP_SIZE - 1)/CUDA_WARP_SIZE);
-      gridSize = SMCU_NBLOCKS_S(A);
-
+      threadsPerBlock = SMCU_EXECPOLICY_S(A)->blockSize(SMCU_BLOCKROWS_S(A));
+      gridSize = SMCU_EXECPOLICY_S(A)->gridSize(SMCU_NBLOCKS_S(A)*SMCU_BLOCKROWS_S(A), threadsPerBlock);
       {
 #ifdef SUNDIALS_CUDA_KERNEL_TIMING
         cudaEvent_t start, stop;
@@ -926,7 +1003,6 @@ int SUNMatScaleAdd_cuSparse(realtype c, SUNMatrix A, SUNMatrix B)
                   SMCU_NNZ_S(A)*sizeof(realtype)*3/milliseconds/1e6);
 #endif
       }
-
       break;
     default:
       SUNDIALS_DEBUG_PRINT("ERROR in SUNMatScaleAdd_cuSparse: sparse type not recognized\n");
@@ -1002,18 +1078,15 @@ int SUNMatMatvec_cuSparse(SUNMatrix A, N_Vector x, N_Vector y)
   else if (SMCU_SPARSETYPE_S(A) == SUNMAT_CUSPARSE_BCSR)
   {
     cudaStream_t stream;
-    cusparseStatus_t cusparse_status;
     unsigned gridSize, threadsPerBlock;
 
-    cusparse_status = cusparseGetStream(SMCU_CUSPHANDLE_S(A), &stream);
-    if (!SUNDIALS_CUSPARSE_VERIFY(cusparse_status)) return SUNMAT_OPERATION_FAIL;
+    stream = SMCU_EXECPOLICY_S(A)->stream();
 
     /* Choose the grid size to be the number of blocks in the matrix,
        and then choose threadsPerBlock to be a multiple of the warp size
        that results in enough threads to have one per row of the block. */
-    threadsPerBlock = MAX_THREAD_PER_BLOCK(CUDA_WARP_SIZE*(SMCU_BLOCKROWS_S(A) + CUDA_WARP_SIZE - 1)/CUDA_WARP_SIZE);
-    gridSize = SMCU_NBLOCKS_S(A);
-
+    threadsPerBlock = SMCU_EXECPOLICY_S(A)->blockSize(SMCU_COLUMNS_S(A)/2);
+    gridSize = SMCU_EXECPOLICY_S(A)->gridSize((size_t)SMCU_ROWS_S(A)*SMCU_COLUMNS_S(A)/2, threadsPerBlock); /* widen first: rows*cols can exceed INT_MAX */
     {
 #ifdef SUNDIALS_CUDA_KERNEL_TIMING
         cudaEvent_t start, stop;
@@ -1044,7 +1117,6 @@ int SUNMatMatvec_cuSparse(SUNMatrix A, N_Vector x, N_Vector y)
               "[performance] matvecBCSR effective bandwidth (GB/s): %f\n",
               (SMCU_NNZ_S(A)*(sizeof(realtype)*4 + sizeof(int)) + 2*SMCU_ROWS_S(A)*sizeof(int))/milliseconds/1e6);
 #endif
-
     }
 
 #ifdef SUNDIALS_DEBUG_CUDA_LASTERROR
diff --git a/src/sunnonlinsol/fixedpoint/CMakeLists.txt b/src/sunnonlinsol/fixedpoint/CMakeLists.txt
index 548c67cce9..a4b14b6d11 100644
--- a/src/sunnonlinsol/fixedpoint/CMakeLists.txt
+++ b/src/sunnonlinsol/fixedpoint/CMakeLists.txt
@@ -38,7 +38,7 @@ set(lib_HEADERS
   )
 
 # Add source directory to include directories
-include_directories(.)
+include_directories(. ${sundials_SOURCE_DIR}/src/sundials)
 
 # Define C preprocessor flag -DBUILD_SUNDIALS_LIBRARY
 add_definitions(-DBUILD_SUNDIALS_LIBRARY)
diff --git a/src/sunnonlinsol/fixedpoint/fmod/fsunnonlinsol_fixedpoint_mod.c b/src/sunnonlinsol/fixedpoint/fmod/fsunnonlinsol_fixedpoint_mod.c
index 52a0d6fd3d..0df4ff797c 100644
--- a/src/sunnonlinsol/fixedpoint/fmod/fsunnonlinsol_fixedpoint_mod.c
+++ b/src/sunnonlinsol/fixedpoint/fmod/fsunnonlinsol_fixedpoint_mod.c
@@ -412,4 +412,32 @@ SWIGEXPORT int _wrap_FSUNNonlinSolGetSysFn_FixedPoint(SUNNonlinearSolver farg1,
 }
 
 
+SWIGEXPORT int _wrap_FSUNNonlinSolSetInfoFile_FixedPoint(SUNNonlinearSolver farg1, void *farg2) {
+  int fresult ;
+  SUNNonlinearSolver arg1 = (SUNNonlinearSolver) 0 ;
+  FILE *arg2 = (FILE *) 0 ;
+  int result;
+  
+  arg1 = (SUNNonlinearSolver)(farg1);
+  arg2 = (FILE *)(farg2);
+  result = (int)SUNNonlinSolSetInfoFile_FixedPoint(arg1,arg2);
+  fresult = (int)(result);
+  return fresult;
+}
+
+
+SWIGEXPORT int _wrap_FSUNNonlinSolSetPrintLevel_FixedPoint(SUNNonlinearSolver farg1, int const *farg2) {
+  int fresult ;
+  SUNNonlinearSolver arg1 = (SUNNonlinearSolver) 0 ;
+  int arg2 ;
+  int result;
+  
+  arg1 = (SUNNonlinearSolver)(farg1);
+  arg2 = (int)(*farg2);
+  result = (int)SUNNonlinSolSetPrintLevel_FixedPoint(arg1,arg2);
+  fresult = (int)(result);
+  return fresult;
+}
+
+
 
diff --git a/src/sunnonlinsol/fixedpoint/fmod/fsunnonlinsol_fixedpoint_mod.f90 b/src/sunnonlinsol/fixedpoint/fmod/fsunnonlinsol_fixedpoint_mod.f90
index cdb2c88d10..f6b3c39fbb 100644
--- a/src/sunnonlinsol/fixedpoint/fmod/fsunnonlinsol_fixedpoint_mod.f90
+++ b/src/sunnonlinsol/fixedpoint/fmod/fsunnonlinsol_fixedpoint_mod.f90
@@ -43,6 +43,8 @@ module fsunnonlinsol_fixedpoint_mod
  public :: FSUNNonlinSolGetCurIter_FixedPoint
  public :: FSUNNonlinSolGetNumConvFails_FixedPoint
  public :: FSUNNonlinSolGetSysFn_FixedPoint
+ public :: FSUNNonlinSolSetInfoFile_FixedPoint
+ public :: FSUNNonlinSolSetPrintLevel_FixedPoint
 
 ! WRAPPER DECLARATIONS
 interface
@@ -176,6 +178,24 @@ function swigc_FSUNNonlinSolGetSysFn_FixedPoint(farg1, farg2) &
 integer(C_INT) :: fresult
 end function
 
+function swigc_FSUNNonlinSolSetInfoFile_FixedPoint(farg1, farg2) &
+bind(C, name="_wrap_FSUNNonlinSolSetInfoFile_FixedPoint") &
+result(fresult)
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR), value :: farg1
+type(C_PTR), value :: farg2
+integer(C_INT) :: fresult
+end function
+
+function swigc_FSUNNonlinSolSetPrintLevel_FixedPoint(farg1, farg2) &
+bind(C, name="_wrap_FSUNNonlinSolSetPrintLevel_FixedPoint") &
+result(fresult)
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR), value :: farg1
+integer(C_INT), intent(in) :: farg2
+integer(C_INT) :: fresult
+end function
+
 end interface
 
 
@@ -417,5 +437,37 @@ function FSUNNonlinSolGetSysFn_FixedPoint(nls, sysfn) &
 swig_result = fresult
 end function
 
+function FSUNNonlinSolSetInfoFile_FixedPoint(nls, info_file) &
+result(swig_result)
+use, intrinsic :: ISO_C_BINDING
+integer(C_INT) :: swig_result
+type(SUNNonlinearSolver), target, intent(inout) :: nls
+type(C_PTR) :: info_file
+integer(C_INT) :: fresult 
+type(C_PTR) :: farg1 
+type(C_PTR) :: farg2 
+
+farg1 = c_loc(nls)
+farg2 = info_file
+fresult = swigc_FSUNNonlinSolSetInfoFile_FixedPoint(farg1, farg2)
+swig_result = fresult
+end function
+
+function FSUNNonlinSolSetPrintLevel_FixedPoint(nls, print_level) &
+result(swig_result)
+use, intrinsic :: ISO_C_BINDING
+integer(C_INT) :: swig_result
+type(SUNNonlinearSolver), target, intent(inout) :: nls
+integer(C_INT), intent(in) :: print_level
+integer(C_INT) :: fresult 
+type(C_PTR) :: farg1 
+integer(C_INT) :: farg2 
+
+farg1 = c_loc(nls)
+farg2 = print_level
+fresult = swigc_FSUNNonlinSolSetPrintLevel_FixedPoint(farg1, farg2)
+swig_result = fresult
+end function
+
 
 end module
diff --git a/src/sunnonlinsol/fixedpoint/sunnonlinsol_fixedpoint.c b/src/sunnonlinsol/fixedpoint/sunnonlinsol_fixedpoint.c
index 0c299dca9f..ddf59a116a 100644
--- a/src/sunnonlinsol/fixedpoint/sunnonlinsol_fixedpoint.c
+++ b/src/sunnonlinsol/fixedpoint/sunnonlinsol_fixedpoint.c
@@ -23,6 +23,8 @@
 #include <sundials/sundials_math.h>
 #include <sundials/sundials_nvector_senswrapper.h>
 
+#include "sundials_debug.h"
+
 /* Internal utility routines */
 static int AndersonAccelerate(SUNNonlinearSolver NLS, N_Vector gval, N_Vector x,
                               N_Vector xold, int iter);
@@ -87,16 +89,18 @@ SUNNonlinearSolver SUNNonlinSol_FixedPoint(N_Vector y, int m)
   NLS->content = content;
 
   /* Fill general content */
-  content->Sys        = NULL;
-  content->CTest      = NULL;
-  content->m          = m;
-  content->damping    = SUNFALSE;
-  content->beta       = ONE;
-  content->curiter    = 0;
-  content->maxiters   = 3;
-  content->niters     = 0;
-  content->nconvfails = 0;
-  content->ctest_data = NULL;
+  content->Sys         = NULL;
+  content->CTest       = NULL;
+  content->m           = m;
+  content->damping     = SUNFALSE;
+  content->beta        = ONE;
+  content->curiter     = 0;
+  content->maxiters    = 3;
+  content->niters      = 0;
+  content->nconvfails  = 0;
+  content->ctest_data  = NULL;
+  content->print_level = 0;
+  content->info_file   = stdout;  /* same default as SUNNonlinSol_Newton */
 
   /* Fill allocatable content */
   retval = AllocateContent(NLS, y);
@@ -202,6 +206,16 @@ int SUNNonlinSolSolve_FixedPoint(SUNNonlinearSolver NLS, N_Vector y0,
   gy    = FP_CONTENT(NLS)->gy;
   delta = FP_CONTENT(NLS)->delta;
 
+#ifdef SUNDIALS_BUILD_WITH_MONITORING
+      /* print the solver banner with the current niters counter */
+      if (FP_CONTENT(NLS)->print_level && FP_CONTENT(NLS)->info_file)
+      {
+        fprintf(FP_CONTENT(NLS)->info_file,
+                "SUNNONLINSOL_FIXEDPOINT (nni=%ld):\n",
+                (long int) FP_CONTENT(NLS)->niters);
+      }
+#endif
+
   /* Looping point for attempts at solution of the nonlinear system:
        Evaluate fixed-point function (store in gy).
        Performs the accelerated fixed-point iteration.
@@ -234,6 +248,17 @@ int SUNNonlinSolSolve_FixedPoint(SUNNonlinearSolver NLS, N_Vector y0,
     retval = FP_CONTENT(NLS)->CTest(NLS, ycor, delta, tol, w,
                                     FP_CONTENT(NLS)->ctest_data);
 
+#ifdef SUNDIALS_BUILD_WITH_MONITORING
+      /* print current iteration number and the nonlinear residual */
+      if (FP_CONTENT(NLS)->print_level && FP_CONTENT(NLS)->info_file)
+      {
+        fprintf(FP_CONTENT(NLS)->info_file,
+                SUN_NLS_MSG_RESIDUAL,
+                (long int) FP_CONTENT(NLS)->curiter,
+                N_VWrmsNorm(delta, w));
+      }
+#endif
+
     /* return if successful */
     if (retval == SUN_NLS_SUCCESS)  return(SUN_NLS_SUCCESS);
 
@@ -698,3 +723,41 @@ static void FreeContent(SUNNonlinearSolver NLS)
 
   return;
 }
+/* Sets the FILE* that fixed-point solver monitoring output is written to. */
+int SUNNonlinSolSetInfoFile_FixedPoint(SUNNonlinearSolver NLS,
+                                       FILE* info_file)
+{
+#ifdef SUNDIALS_BUILD_WITH_MONITORING
+  /* check that the nonlinear solver is non-null */
+  if (NLS == NULL)
+    return(SUN_NLS_MEM_NULL);
+  /* a NULL info_file is accepted; the solve routine then prints nothing */
+  FP_CONTENT(NLS)->info_file = info_file;
+
+  return(SUN_NLS_SUCCESS);
+#else /* built without monitoring: always returns SUN_NLS_ILL_INPUT */
+  SUNDIALS_DEBUG_PRINT("ERROR in SUNNonlinSolSetInfoFile_FixedPoint: SUNDIALS was not built with monitoring\n");
+  return(SUN_NLS_ILL_INPUT);
+#endif
+}
+/* Sets the fixed-point solver monitoring print level (0 or 1). */
+int SUNNonlinSolSetPrintLevel_FixedPoint(SUNNonlinearSolver NLS,
+                                         int print_level)
+{
+#ifdef SUNDIALS_BUILD_WITH_MONITORING
+  /* check that the nonlinear solver is non-null */
+  if (NLS == NULL)
+    return(SUN_NLS_MEM_NULL);
+
+  /* check for valid print level: only 0 and 1 are accepted */
+  if (print_level < 0 || print_level > 1)
+    return(SUN_NLS_ILL_INPUT);
+  /* a nonzero level enables the fprintf calls in the solve routine */
+  FP_CONTENT(NLS)->print_level = print_level;
+
+  return(SUN_NLS_SUCCESS);
+#else /* built without monitoring: always returns SUN_NLS_ILL_INPUT */
+  SUNDIALS_DEBUG_PRINT("ERROR in SUNNonlinSolSetPrintLevel_FixedPoint: SUNDIALS was not built with monitoring\n");
+  return(SUN_NLS_ILL_INPUT);
+#endif
+}
diff --git a/src/sunnonlinsol/newton/CMakeLists.txt b/src/sunnonlinsol/newton/CMakeLists.txt
index aa27199b05..68dbd9c3a0 100644
--- a/src/sunnonlinsol/newton/CMakeLists.txt
+++ b/src/sunnonlinsol/newton/CMakeLists.txt
@@ -38,7 +38,7 @@ set(lib_HEADERS
   )
 
 # Add source directory to include directories
-include_directories(.)
+include_directories(. ${sundials_SOURCE_DIR}/src/sundials)
 
 # Define C preprocessor flag -DBUILD_SUNDIALS_LIBRARY
 add_definitions(-DBUILD_SUNDIALS_LIBRARY)
diff --git a/src/sunnonlinsol/newton/fmod/fsunnonlinsol_newton_mod.c b/src/sunnonlinsol/newton/fmod/fsunnonlinsol_newton_mod.c
index 6b295a425c..7c2c59f199 100644
--- a/src/sunnonlinsol/newton/fmod/fsunnonlinsol_newton_mod.c
+++ b/src/sunnonlinsol/newton/fmod/fsunnonlinsol_newton_mod.c
@@ -422,4 +422,32 @@ SWIGEXPORT int _wrap_FSUNNonlinSolGetSysFn_Newton(SUNNonlinearSolver farg1, void
 }
 
 
+SWIGEXPORT int _wrap_FSUNNonlinSolSetInfoFile_Newton(SUNNonlinearSolver farg1, void *farg2) {
+  int fresult ;
+  SUNNonlinearSolver arg1 = (SUNNonlinearSolver) 0 ;
+  FILE *arg2 = (FILE *) 0 ;
+  int result;
+  
+  arg1 = (SUNNonlinearSolver)(farg1);
+  arg2 = (FILE *)(farg2);
+  result = (int)SUNNonlinSolSetInfoFile_Newton(arg1,arg2);
+  fresult = (int)(result);
+  return fresult;
+}
+
+
+SWIGEXPORT int _wrap_FSUNNonlinSolSetPrintLevel_Newton(SUNNonlinearSolver farg1, int const *farg2) {
+  int fresult ;
+  SUNNonlinearSolver arg1 = (SUNNonlinearSolver) 0 ;
+  int arg2 ;
+  int result;
+  
+  arg1 = (SUNNonlinearSolver)(farg1);
+  arg2 = (int)(*farg2);
+  result = (int)SUNNonlinSolSetPrintLevel_Newton(arg1,arg2);
+  fresult = (int)(result);
+  return fresult;
+}
+
+
 
diff --git a/src/sunnonlinsol/newton/fmod/fsunnonlinsol_newton_mod.f90 b/src/sunnonlinsol/newton/fmod/fsunnonlinsol_newton_mod.f90
index a65f329e1d..efcfe126ab 100644
--- a/src/sunnonlinsol/newton/fmod/fsunnonlinsol_newton_mod.f90
+++ b/src/sunnonlinsol/newton/fmod/fsunnonlinsol_newton_mod.f90
@@ -44,6 +44,8 @@ module fsunnonlinsol_newton_mod
  public :: FSUNNonlinSolGetCurIter_Newton
  public :: FSUNNonlinSolGetNumConvFails_Newton
  public :: FSUNNonlinSolGetSysFn_Newton
+ public :: FSUNNonlinSolSetInfoFile_Newton
+ public :: FSUNNonlinSolSetPrintLevel_Newton
 
 ! WRAPPER DECLARATIONS
 interface
@@ -184,6 +186,24 @@ function swigc_FSUNNonlinSolGetSysFn_Newton(farg1, farg2) &
 integer(C_INT) :: fresult
 end function
 
+function swigc_FSUNNonlinSolSetInfoFile_Newton(farg1, farg2) &
+bind(C, name="_wrap_FSUNNonlinSolSetInfoFile_Newton") &
+result(fresult)
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR), value :: farg1
+type(C_PTR), value :: farg2
+integer(C_INT) :: fresult
+end function
+
+function swigc_FSUNNonlinSolSetPrintLevel_Newton(farg1, farg2) &
+bind(C, name="_wrap_FSUNNonlinSolSetPrintLevel_Newton") &
+result(fresult)
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR), value :: farg1
+integer(C_INT), intent(in) :: farg2
+integer(C_INT) :: fresult
+end function
+
 end interface
 
 
@@ -435,5 +455,37 @@ function FSUNNonlinSolGetSysFn_Newton(nls, sysfn) &
 swig_result = fresult
 end function
 
+function FSUNNonlinSolSetInfoFile_Newton(nls, info_file) &
+result(swig_result)
+use, intrinsic :: ISO_C_BINDING
+integer(C_INT) :: swig_result
+type(SUNNonlinearSolver), target, intent(inout) :: nls
+type(C_PTR) :: info_file
+integer(C_INT) :: fresult 
+type(C_PTR) :: farg1 
+type(C_PTR) :: farg2 
+
+farg1 = c_loc(nls)
+farg2 = info_file
+fresult = swigc_FSUNNonlinSolSetInfoFile_Newton(farg1, farg2)
+swig_result = fresult
+end function
+
+function FSUNNonlinSolSetPrintLevel_Newton(nls, print_level) &
+result(swig_result)
+use, intrinsic :: ISO_C_BINDING
+integer(C_INT) :: swig_result
+type(SUNNonlinearSolver), target, intent(inout) :: nls
+integer(C_INT), intent(in) :: print_level
+integer(C_INT) :: fresult 
+type(C_PTR) :: farg1 
+integer(C_INT) :: farg2 
+
+farg1 = c_loc(nls)
+farg2 = print_level
+fresult = swigc_FSUNNonlinSolSetPrintLevel_Newton(farg1, farg2)
+swig_result = fresult
+end function
+
 
 end module
diff --git a/src/sunnonlinsol/newton/sunnonlinsol_newton.c b/src/sunnonlinsol/newton/sunnonlinsol_newton.c
index a7cdca427c..ac05989911 100644
--- a/src/sunnonlinsol/newton/sunnonlinsol_newton.c
+++ b/src/sunnonlinsol/newton/sunnonlinsol_newton.c
@@ -23,6 +23,8 @@
 #include <sundials/sundials_math.h>
 #include <sundials/sundials_nvector_senswrapper.h>
 
+#include "sundials_debug.h"
+
 /* Content structure accessibility macros  */
 #define NEWTON_CONTENT(S) ( (SUNNonlinearSolverContent_Newton)(S->content) )
 
@@ -80,16 +82,18 @@ SUNNonlinearSolver SUNNonlinSol_Newton(N_Vector y)
   NLS->content = content;
 
   /* Fill general content */
-  content->Sys        = NULL;
-  content->LSetup     = NULL;
-  content->LSolve     = NULL;
-  content->CTest      = NULL;
-  content->jcur       = SUNFALSE;
-  content->curiter    = 0;
-  content->maxiters   = 3;
-  content->niters     = 0;
-  content->nconvfails = 0;
-  content->ctest_data = NULL;
+  content->Sys         = NULL;
+  content->LSetup      = NULL;
+  content->LSolve      = NULL;
+  content->CTest       = NULL;
+  content->jcur        = SUNFALSE;
+  content->curiter     = 0;
+  content->maxiters    = 3;
+  content->niters      = 0;
+  content->nconvfails  = 0;
+  content->ctest_data  = NULL;
+  content->print_level = 0;      /* 0: the monitoring prints in the solve routine are skipped */
+  content->info_file   = stdout; /* stream used for monitoring output until replaced */
 
   /* Fill allocatable content */
   content->delta = N_VClone(y);
@@ -229,6 +233,16 @@ int SUNNonlinSolSolve_Newton(SUNNonlinearSolver NLS,
     /* initialize counter curiter */
     NEWTON_CONTENT(NLS)->curiter = 0;
 
+#ifdef SUNDIALS_BUILD_WITH_MONITORING
+    /* print a header line with the solver's iteration counter (niters) */
+    if (NEWTON_CONTENT(NLS)->print_level && NEWTON_CONTENT(NLS)->info_file)
+    {
+      fprintf(NEWTON_CONTENT(NLS)->info_file,
+              "SUNNONLINSOL_NEWTON (nni=%ld):\n",
+              (long int) NEWTON_CONTENT(NLS)->niters);
+    }
+#endif /* SUNDIALS_BUILD_WITH_MONITORING */
+
     /* looping point for Newton iteration. Break out on any error. */
     for(;;) {
 
@@ -249,6 +263,17 @@ int SUNNonlinSolSolve_Newton(SUNNonlinearSolver NLS,
       retval = NEWTON_CONTENT(NLS)->CTest(NLS, ycor, delta, tol, w,
                                           NEWTON_CONTENT(NLS)->ctest_data);
 
+#ifdef SUNDIALS_BUILD_WITH_MONITORING
+      /* print curiter and the WRMS norm of delta (weights w) via SUN_NLS_MSG_RESIDUAL */
+      if (NEWTON_CONTENT(NLS)->print_level && NEWTON_CONTENT(NLS)->info_file)
+      {
+        fprintf(NEWTON_CONTENT(NLS)->info_file,
+                SUN_NLS_MSG_RESIDUAL,
+                (long int) NEWTON_CONTENT(NLS)->curiter,
+                N_VWrmsNorm(delta, w));
+      }
+#endif /* SUNDIALS_BUILD_WITH_MONITORING */
+
       /* if successful update Jacobian status and return */
       if (retval == SUN_NLS_SUCCESS) {
         NEWTON_CONTENT(NLS)->jcur = SUNFALSE;
@@ -458,3 +483,41 @@ int SUNNonlinSolGetSysFn_Newton(SUNNonlinearSolver NLS, SUNNonlinSolSysFn *SysFn
   *SysFn = NEWTON_CONTENT(NLS)->Sys;
   return(SUN_NLS_SUCCESS);
 }
+
+int SUNNonlinSolSetInfoFile_Newton(SUNNonlinearSolver NLS,
+                                   FILE* info_file)
+{ /* store the stream that the solver's monitoring output is written to */
+#ifdef SUNDIALS_BUILD_WITH_MONITORING
+  /* check that the nonlinear solver is non-null */
+  if (NLS == NULL)
+    return(SUN_NLS_MEM_NULL);
+
+  NEWTON_CONTENT(NLS)->info_file = info_file; /* stored as given; NULL is not rejected here */
+
+  return(SUN_NLS_SUCCESS);
+#else /* built without monitoring: the setter always returns SUN_NLS_ILL_INPUT */
+  SUNDIALS_DEBUG_PRINT("ERROR in SUNNonlinSolSetInfoFile_Newton: SUNDIALS was not built with monitoring\n");
+  return(SUN_NLS_ILL_INPUT);
+#endif /* SUNDIALS_BUILD_WITH_MONITORING */
+}
+
+int SUNNonlinSolSetPrintLevel_Newton(SUNNonlinearSolver NLS,
+                                     int print_level)
+{ /* store the print level that gates the solver's monitoring output */
+#ifdef SUNDIALS_BUILD_WITH_MONITORING
+  /* check that the nonlinear solver is non-null */
+  if (NLS == NULL)
+    return(SUN_NLS_MEM_NULL);
+
+  /* only print levels 0 and 1 are accepted */
+  if (print_level < 0 || print_level > 1)
+    return(SUN_NLS_ILL_INPUT);
+
+  NEWTON_CONTENT(NLS)->print_level = print_level; /* nonzero enables the prints in SUNNonlinSolSolve_Newton */
+
+  return(SUN_NLS_SUCCESS);
+#else /* built without monitoring: the setter always returns SUN_NLS_ILL_INPUT */
+  SUNDIALS_DEBUG_PRINT("ERROR in SUNNonlinSolSetPrintLevel_Newton: SUNDIALS was not built with monitoring\n");
+  return(SUN_NLS_ILL_INPUT);
+#endif /* SUNDIALS_BUILD_WITH_MONITORING */
+}
diff --git a/test/unit_tests/CMakeLists.txt b/test/unit_tests/CMakeLists.txt
index cc5354d624..499f9d2afe 100644
--- a/test/unit_tests/CMakeLists.txt
+++ b/test/unit_tests/CMakeLists.txt
@@ -25,4 +25,4 @@ endif()
 # Add ARKode unit tests
 if(BUILD_ARKODE)
   add_subdirectory(arkode)
-endif()
+endif() # BUILD_ARKODE