diff --git a/Cxx11/Makefile b/Cxx11/Makefile
index 14a90b345..fcef58c4e 100644
--- a/Cxx11/Makefile
+++ b/Cxx11/Makefile
@@ -61,7 +61,7 @@ endif
 OCCAFLAGS = -I${OCCADIR}/include -Wl,-rpath -Wl,${OCCADIR}/lib -L${OCCADIR}/lib -locca
 
 .PHONY: all clean vector valarray openmp target opencl taskloop stl \
-	ranges kokkos raja cuda cublas sycl dpcpp \
+	ranges kokkos raja cuda cudastf cublas sycl dpcpp \
 	boost-compute thrust executor oneapi onemkl
 
 EXTRA=
@@ -122,6 +122,9 @@ raja: p2p-vector-raja stencil-vector-raja nstream-vector-raja \
 
 cuda: stencil-cuda transpose-cuda nstream-cuda nstream-managed-cuda
 
+stf: cudastf
+cudastf: dgemm-cudastf dgemm-cublas-cudastf stencil-cudastf p2p-cudastf transpose-cudastf
+
 hip: nstream-hip transpose-hip stencil-hip nstream-managed-hip
 
 hipstl: nstream-hipstl
@@ -306,6 +309,14 @@ endif
 %.s: %.cc prk_util.h
 	$(CXX) $(CXXFLAGS) $(ASMFLAGS) -S $< -o $@
 
+%-cudastf: %-cudastf.cu prk_util.h prk_cuda.h
+	$(NVCC) $(CPPFLAGS) $(CUDAFLAGS) $(CUDASTF_CFLAGS) $(CUDASTF_LDFLAGS) $< -o $@
+
+%-cublas-cudastf: %-cublas-cudastf.cu prk_util.h prk_cuda.h
+	$(NVCC) $(CUDAFLAGS) $(CPPFLAGS) $(CUDASTF_CFLAGS) $(CUDASTF_LDFLAGS) -DPRK_USE_CUBLAS $< -lcublas -lcublasLt -o $@
+
+
+
 clean:
 	-rm -f *.o
 	-rm -f *.s
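All of the new *-cudastf.cu programs below follow the same CUDASTF pattern: build a context, describe arrays as logical_data, submit work with parallel_for or task (declaring .read()/.write()/.rw() accesses so the runtime can infer ordering), then synchronize on ctx.task_fence(). A minimal sketch for orientation, assuming the cccl/cudax headers are on the include path as arranged in common/make.defs.cuda (illustrative only, not part of the patch):

    #include <cuda/experimental/stf.cuh>

    using namespace cuda::experimental::stf;

    int main(void)
    {
      context ctx;

      // Describe a 1D array of 1024 doubles; CUDASTF owns allocation and movement.
      auto v = ctx.logical_data(shape_of<slice<double>>(1024));

      // Generate a kernel over the shape; v.write() declares the access mode.
      ctx.parallel_for(v.shape(), v.write())->*[] __device__ (size_t i, auto dv) {
        dv(i) = 2.0 * i;
      };

      // task_fence() returns a stream on which all submitted work is ordered.
      cudaStreamSynchronize(ctx.task_fence());

      ctx.finalize();
      return 0;
    }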
diff --git a/Cxx11/dgemm-cublas-cudastf.cu b/Cxx11/dgemm-cublas-cudastf.cu
new file mode 100644
index 000000000..8078b7961
--- /dev/null
+++ b/Cxx11/dgemm-cublas-cudastf.cu
@@ -0,0 +1,309 @@
+///
+/// Copyright (c) 2018, Intel Corporation
+/// Copyright (c) 2024, NVIDIA
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    dgemm
+///
+/// PURPOSE: This program tests the efficiency with which a dense matrix-
+///          matrix multiplication is carried out
+///
+/// USAGE:   The program takes as input the matrix order,
+///          the number of times the matrix-matrix multiplication
+///          is carried out, and, optionally, a batch count and
+///          whether to copy the input every iteration
+///
+///          <progname> <# iterations> <matrix order> [<batches>] [<input copy: 0/1>]
+///
+///          The output consists of diagnostics to make sure the
+///          algorithm worked, and of timing statistics.
+///
+/// FUNCTIONS CALLED:
+///
+///          Other than OpenMP or standard C functions, the following
+///          functions are used in this program:
+///
+///          cublasDgemm()
+///          cublasDgemmStridedBatched()
+///
+/// HISTORY: Written by Rob Van der Wijngaart, February 2009.
+///          Converted to C++11 by Jeff Hammond, December, 2017.
+///          CUDA STF by Cedric Augonnet, October 2024.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+#include "prk_cuda.h"
+
+#include <cuda/experimental/stf.cuh>
+
+using namespace cuda::experimental::stf;
+
+int main(int argc, char * argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11/CUBLAS STF Dense matrix-matrix multiplication: C += A x B" << std::endl;
+
+  prk::CUDA::info info;
+  //info.print();
+
+  //////////////////////////////////////////////////////////////////////
+  /// Read and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations;
+  int order;
+  int batches = 0;
+  int input_copy = 0;
+  try {
+      if (argc < 3) {
+        throw "Usage: <# iterations> <matrix order> [<batches>] [<input copy: 0/1>]";
+      }
+
+      iterations = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      order = std::atoi(argv[2]);
+      if (order <= 0) {
+        throw "ERROR: Matrix Order must be greater than 0";
+      } else if (order > prk::get_max_matrix_size()) {
+        throw "ERROR: matrix dimension too large - overflow risk";
+      }
+
+      if (argc > 3) {
+        batches = std::atoi(argv[3]);
+      }
+
+      if (argc > 4) {
+        input_copy = std::atoi(argv[4]);
+        if (input_copy != 0 && input_copy != 1) {
+          throw "ERROR: input_copy was not 0 or 1";
+        }
+      }
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Matrix order         = " << order << std::endl;
+  if (batches == 0) {
+      std::cout << "No batching" << std::endl;
+  } else if (batches < 0) {
+      std::cout << "Batch size           = " << -batches << " (loop over legacy BLAS)" << std::endl;
+  } else if (batches > 0) {
+      std::cout << "Batch size           = " << batches << " (batched BLAS)" << std::endl;
+  }
+  std::cout << "Input copy           = " << (input_copy ? "yes" : "no") << std::endl;
+
+  cublasHandle_t h;
+  prk::CUDA::check( cublasCreate(&h) );
+
+  const int tile_size = 32;
+  dim3 dimGrid(prk::divceil(order,tile_size),prk::divceil(order,tile_size),1);
+  dim3 dimBlock(tile_size, tile_size, 1);
+
+  info.checkDims(dimBlock, dimGrid);
+
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space for matrices
+  //////////////////////////////////////////////////////////////////////
+
+  double gemm_time(0);
+
+  const int matrices = (batches==0 ? 1 : abs(batches));
+  const size_t nelems = (size_t)order * (size_t)order;
+
+  const auto epsilon = 1.0e-8;
+  const auto forder = static_cast<double>(order);
+  const auto reference = 0.25 * prk::pow(forder,3) * prk::pow(forder-1.0,2) * (iterations+1);
+  double residuum(0);
+
+  context ctx;
+
+  if (batches > 0) {
+      /*
+       * BATCHED implementation
+       */
+      auto a = ctx.logical_data(shape_of<slice<double, 3>>(order, order, matrices));
+      auto b = ctx.logical_data(shape_of<slice<double, 3>>(order, order, matrices));
+      auto c = ctx.logical_data(shape_of<slice<double, 3>>(order, order, matrices));
+
+      // Initialize all matrices
+      ctx.parallel_for(a.shape(), a.write(), b.write(), c.write())->*[] __device__ (size_t i, size_t j, size_t k, auto da, auto db, auto dc)
+      {
+          da(i, j, k) = (double)i;
+          db(i, j, k) = (double)i;
+          dc(i, j, k) = 0.0;
+      };
+
+      for (int iter = 0; iter<=iterations; iter++) {
+          if (iter==1) {
+              cudaStreamSynchronize(ctx.task_fence());
+              gemm_time = prk::wtime();
+          }
+
+          const double alpha = 1.0;
+          const double beta  = 1.0;
+          ctx.task(a.read(), b.read(), c.rw())->*[&](cudaStream_t stream, auto da, auto db, auto dc) {
+              cublasSetStream(h, stream);
+              prk::CUDA::check( cublasDgemmStridedBatched(h,
+                                                          CUBLAS_OP_N, CUBLAS_OP_N,
+                                                          order, order, order,
+                                                          &alpha,
+                                                          (const double *)da.data_handle(), order, order*order,
+                                                          (const double *)db.data_handle(), order, order*order,
+                                                          &beta,
+                                                          dc.data_handle(), order, order*order,
+                                                          batches) );
+          };
+      }
+
+      cudaStreamSynchronize(ctx.task_fence());
+      gemm_time = prk::wtime() - gemm_time;
+
+      ctx.host_launch(c.read())->*[&](auto hc)
+      {
+          for (size_t k = 0; k < hc.extent(2); k++)
+          {
+              double checksum = 0.0;
+
+              for (size_t j = 0; j < hc.extent(1); j++)
+                for (size_t i = 0; i < hc.extent(0); i++)
+                {
+                    checksum += hc(i, j, k);
+                }
+
+              residuum += std::abs(checksum-reference)/reference;
+          }
+          residuum /= matrices;
+      };
+  }
+  else {
+      ::std::vector<logical_data<slice<double, 2>>> vector_a;
+      ::std::vector<logical_data<slice<double, 2>>> vector_b;
+      ::std::vector<logical_data<slice<double, 2>>> vector_c;
+
+      // Initialize independent matrices
+      for (size_t k = 0; k < matrices; k++) {
+          auto ak = ctx.logical_data(shape_of<slice<double, 2>>(order, order));
+          auto bk = ctx.logical_data(shape_of<slice<double, 2>>(order, order));
+          auto ck = ctx.logical_data(shape_of<slice<double, 2>>(order, order));
+
+          vector_a.push_back(ak);
+          vector_b.push_back(bk);
+          vector_c.push_back(ck);
+
+          ctx.parallel_for(ak.shape(), ak.write(), bk.write(), ck.write())->*[] __device__ (size_t i, size_t j, auto dak, auto dbk, auto dck)
+          {
+              dak(i, j) = (double)i;
+              dbk(i, j) = (double)i;
+              dck(i, j) = 0.0;
+          };
+      }
+
+      for (int iter = 0; iter<=iterations; iter++) {
+          if (iter==1) {
+              cudaStreamSynchronize(ctx.task_fence());
+              gemm_time = prk::wtime();
+          }
+
+          const double alpha = 1.0;
+          const double beta  = 1.0;
+
+          for (size_t k = 0; k < matrices; k++)
+          {
+              ctx.task(vector_a[k].read(), vector_b[k].read(), vector_c[k].rw())->*[&](cudaStream_t stream, auto dA, auto dB, auto dC) {
+                  cublasSetStream(h, stream);
+                  prk::CUDA::check( cublasDgemm(h,
+                                                CUBLAS_OP_N, CUBLAS_OP_N, // opA, opB
+                                                order, order, order,      // m, n, k
+                                                &alpha,                   // alpha
+                                                dA.data_handle(), order,  // A, lda
+                                                dB.data_handle(), order,  // B, ldb
+                                                &beta,                    // beta
+                                                dC.data_handle(), order) ); // C, ldc
+              };
+          }
+      }
+
+      cudaStreamSynchronize(ctx.task_fence());
+      gemm_time = prk::wtime() - gemm_time;
+
+      for (size_t k = 0; k < matrices; k++)
+      {
+          double checksum = 0.0;
+          ctx.host_launch(vector_c[k].read())->*[&](auto hck)
+          {
+              for (size_t j = 0; j < hck.extent(1); j++)
+                for (size_t i = 0; i < hck.extent(0); i++)
+                {
+                    checksum += hck(i, j);
+                }
+          };
+
+          cudaStreamSynchronize(ctx.task_fence());
+          residuum += std::abs(checksum-reference)/reference;
+      }
+      residuum /= matrices;
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  /// Analyze and output results
+  //////////////////////////////////////////////////////////////////////
+
+  if (residuum < epsilon) {
+#if VERBOSE
+    std::cout << "Reference checksum = " << reference << "\n"
+              << "Residuum           = " << residuum << std::endl;
+#endif
+    std::cout << "Solution validates" << std::endl;
+    auto avgtime = gemm_time/iterations/matrices;
+    auto nflops = 2.0 * prk::pow(forder,3);
+    prk::print_flop_rate_time("FP64", nflops/avgtime, avgtime);
+  } else {
+    std::cout << "Reference checksum = " << reference << "\n"
+              << "Residuum           = " << residuum << std::endl;
+    return 1;
+  }
+
+  ctx.finalize();
+
+  return 0;
+}
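A note on the batched path above: cublasDgemmStridedBatched performs `matrices` independent column-major GEMMs in one call, with matrix k of each operand starting at element offset k*order*order. Up to scheduling, it is equivalent to this loop of plain DGEMMs (illustrative sketch; h, order, batches, and raw device pointers A, B, C are names assumed from context):

    // One strided-batched call ~= a loop of legacy DGEMMs over batch index k.
    const double alpha = 1.0, beta = 1.0;
    for (int k = 0; k < batches; k++) {
      const size_t off = (size_t)k * order * order;   // stride between matrices
      cublasDgemm(h, CUBLAS_OP_N, CUBLAS_OP_N,
                  order, order, order,
                  &alpha,
                  A + off, order,
                  B + off, order,
                  &beta,
                  C + off, order);
    }

The batched form saves kernel-launch and task-submission overhead, which is why the program reports it separately from the loop-over-legacy-BLAS mode.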
diff --git a/Cxx11/dgemm-cudastf.cu b/Cxx11/dgemm-cudastf.cu
new file mode 100644
index 000000000..89f24d38a
--- /dev/null
+++ b/Cxx11/dgemm-cudastf.cu
@@ -0,0 +1,199 @@
+///
+/// Copyright (c) 2017, Intel Corporation
+/// Copyright (c) 2024, NVIDIA
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    dgemm
+///
+/// PURPOSE: This program tests the efficiency with which a dense matrix-
+///          matrix multiplication is carried out
+///
+/// USAGE:   The program takes as input the matrix order,
+///          the number of times the matrix-matrix multiplication
+///          is carried out, and, optionally, a tile size for matrix
+///          blocking
+///
+///          <progname> <# iterations> <matrix order> [<tile size>]
+///
+///          The output consists of diagnostics to make sure the
+///          algorithm worked, and of timing statistics.
+///
+/// FUNCTIONS CALLED:
+///
+///          Other than OpenMP or standard C functions, the following
+///          functions are used in this program:
+///
+///          wtime()
+///
+/// HISTORY: Written by Rob Van der Wijngaart, February 2009.
+///          Converted to C++11 by Jeff Hammond, December, 2017.
+///          CUDA STF by Cedric Augonnet, October 2024.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+#include "prk_cuda.h"
+#include <cuda/experimental/stf.cuh>
+
+using namespace cuda::experimental::stf;
+
+int main(int argc, char * argv[])
+{
+  //////////////////////////////////////////////////////////////////////
+  /// Read and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11 CUDA STF Dense matrix-matrix multiplication: C += A x B" << std::endl;
+
+  int iterations;
+  int order;
+  try {
+      if (argc < 3) {
+        throw "Usage: <# iterations> <matrix order> [tile size]";
+      }
+
+      iterations = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      order = std::atoi(argv[2]);
+      if (order <= 0) {
+        throw "ERROR: Matrix Order must be greater than 0";
+      } else if (order > prk::get_max_matrix_size()) {
+        throw "ERROR: matrix dimension too large - overflow risk";
+      }
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Matrix order         = " << order << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  /// Allocate space for matrices
+  //////////////////////////////////////////////////////////////////////
+
+  context ctx;
+
+  double dgemm_time{0};
+
+  auto A = ctx.logical_data(shape_of<slice<double, 2>>(order, order));
+  auto B = ctx.logical_data(shape_of<slice<double, 2>>(order, order));
+  auto C = ctx.logical_data(shape_of<slice<double, 2>>(order, order));
+
+  ctx.parallel_for(A.shape(), A.write(), B.write(), C.write())->*[]__device__(size_t i, size_t j, auto dA, auto dB, auto dC)
+  {
+      dA(i, j) = (double)i;
+      dB(i, j) = (double)i;
+      dC(i, j) = 0.0;
+  };
+
+  {
+    for (int iter = 0; iter<=iterations; iter++) {
+      if (iter==1) {
+          cudaStreamSynchronize(ctx.task_fence());
+          dgemm_time = prk::wtime();
+      }
+
+      ctx.parallel_for(C.shape(), A.read(), B.read(), C.rw())->*[]__device__ (size_t i, size_t j, auto dA, auto dB, auto dC)
+      {
+          double Ctemp(0);
+          for (size_t k = 0; k < dC.extent(0); k++) {
+              Ctemp += dA(i, k)*dB(k, j);
+          }
+          dC(i,j) += Ctemp;
+      };
+    }
+
+    cudaStreamSynchronize(ctx.task_fence());
+    dgemm_time = prk::wtime() - dgemm_time;
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  /// Analyze and output results
+  //////////////////////////////////////////////////////////////////////
+
+  const auto forder = static_cast<double>(order);
+  const auto reference = 0.25 * prk::pow(forder,3) * prk::pow(forder-1.0,2) * (iterations+1);
+
+  double checksum(0);
+  ctx.host_launch(C.read())->*[&](auto hC)
+  {
+      for (size_t j = 0; j < hC.extent(1); j++)
+        for (size_t i = 0; i < hC.extent(0); i++)
+        {
+            checksum += hC(i, j);
+        }
+  };
+
+  cudaStreamSynchronize(ctx.task_fence());
+
+  const auto epsilon = 1.0e-8;
+  const auto residuum = prk::abs(checksum-reference)/reference;
+  if (residuum < epsilon) {
+#if VERBOSE
+    std::cout << "Reference checksum = " << reference << "\n"
+              << "Actual checksum    = " << checksum << std::endl;
+#endif
+    std::cout << "Solution validates" << std::endl;
+    auto avgtime = dgemm_time/iterations;
+    auto nflops = 2.0 * prk::pow(forder,3);
+    std::cout << "Rate (MF/s): " << 1.0e-6 * nflops/avgtime
+              << " Avg time (s): " << avgtime << std::endl;
+  } else {
+    std::cout << "Reference checksum = " << reference << "\n"
+              << "Actual checksum    = " << checksum << std::endl;
+#if VERBOSE
+    ctx.host_launch(C.read())->*[&](auto hC)
+    {
+        for (size_t i = 0; i < hC.extent(0); i++)
+          for (size_t j = 0; j < hC.extent(1); j++)
+            std::cout << "C(" << i << "," << j << ") = " << hC(i, j) << "\n";
+    };
+    cudaStreamSynchronize(ctx.task_fence());
+#endif
+    return 1;
+  }
+
+  ctx.finalize();
+
+  return 0;
+}
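Every main in this patch times the same way: iteration 0 is a warm-up that absorbs allocation, data movement, and JIT costs; the clock starts only after a fence, and stops after a final fence. Pulled out as a sketch (assuming a hypothetical helper submit_one_iteration() that only enqueues one iteration's tasks on ctx):

    double t0(0);
    for (int iter = 0; iter <= iterations; iter++) {
      if (iter == 1) {
        cudaStreamSynchronize(ctx.task_fence());  // drain the warm-up iteration
        t0 = prk::wtime();                        // start the clock at steady state
      }
      submit_one_iteration(ctx);                  // assumed helper: enqueue tasks only
    }
    cudaStreamSynchronize(ctx.task_fence());      // drain all timed work
    const double elapsed = prk::wtime() - t0;

Because task submission is asynchronous, omitting either fence would time queueing rather than execution.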
diff --git a/Cxx11/p2p-cudastf.cu b/Cxx11/p2p-cudastf.cu
new file mode 100644
--- /dev/null
+++ b/Cxx11/p2p-cudastf.cu
+///
+/// Copyright (c) 2013, Intel Corporation
+/// Copyright (c) 2024, NVIDIA
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    Pipeline
+///
+/// PURPOSE: This program tests the efficiency with which point-to-point
+///          synchronization can be carried out. It does so by executing
+///          a pipelined algorithm on an n^2 grid.
+///
+/// USAGE:   The program takes as input the
+///          dimensions of the grid, and the number of iterations on the grid
+///
+///                <progname> <# iterations> <array dimension>
+///
+///          The output consists of diagnostics to make sure the
+///          algorithm worked, and of timing statistics.
+///
+/// HISTORY: - Written by Rob Van der Wijngaart, February 2009.
+///            C99-ification by Jeff Hammond, February 2016.
+///            C++11-ification by Jeff Hammond, May 2017.
+///            CUDASTF version by Cedric Augonnet, November 2024.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+#include "prk_cuda.h"
+
+#include <cooperative_groups.h>
+
+#include <cuda/experimental/stf.cuh>
+
+using namespace cuda::experimental::stf;
+
+
+#define BLOCK_SIZE 32
+
+#if 0
+
+#define HALO_SIZE 1
+
+__global__ void p2p(int N, double * M)
+{
+    __shared__ float sm_buffer[BLOCK_SIZE + HALO_SIZE][BLOCK_SIZE + HALO_SIZE];
+
+    int bx = blockIdx.x;
+    int tx = threadIdx.x;
+    int dx = blockDim.x;
+
+    cooperative_groups::grid_group cuda_grid = cooperative_groups::this_grid();
+
+    for(int i = 0; i < 2*N/BLOCK_SIZE; ++i) {
+
+        //Compute matrix coordinates for block corner
+        int g_x = bx * BLOCK_SIZE + HALO_SIZE;
+        int g_y = (i - bx) * BLOCK_SIZE + HALO_SIZE;
+
+        //Check block is in bounds
+        if(g_y >= 0 && g_y < (N + HALO_SIZE)) {
+
+            // load halo to SM
+            sm_buffer[0][tx + 1] = M[(g_y - 1) * (N + HALO_SIZE) + g_x + tx];
+            sm_buffer[tx + 1][0] = M[(g_y + tx)* (N + HALO_SIZE) + g_x - 1];
+            if(tx == 0) //Load corner
+                sm_buffer[0][0] = M[(g_y - 1) * (N + HALO_SIZE) + g_x - 1];
+
+            // inner loop
+            for(int j = 0; j <= 2*BLOCK_SIZE; j++) {
+                int l_x = tx + HALO_SIZE;
+                int l_y = j - tx + HALO_SIZE;
+                if (l_y >= 1 && l_y <= BLOCK_SIZE)
+                    sm_buffer[l_y][l_x] = sm_buffer[l_y - 1][l_x] + sm_buffer[l_y][l_x - 1] - sm_buffer[l_y - 1][l_x - 1];
+            }
+
+            // flush block to memory
+            for(int j = 0; j <= BLOCK_SIZE; j++)
+                M[(g_y + j) * (N + HALO_SIZE) + g_x + tx] = sm_buffer[j + HALO_SIZE][tx + HALO_SIZE];
+
+            // sync threads
+            cuda_grid.sync();
+        }
+    }
+
+#if 0
+    // one thread copies the bottom right corner to the top left corner...
+    if ((bx * dx + tx) == 0) {
+        M[0] = -M[(N-1)*(N+HALO_SIZE)+(N-1)];
+    }
+    cuda_grid.sync(); // required?
+#endif
+}
+#else
+__global__ void p2p(double * grid, const int n)
+{
+    const int bx = blockIdx.x;
+    const int tx = threadIdx.x;
+    const int dx = blockDim.x;
+
+    const int j = bx * dx + tx + 1;
+
+    cooperative_groups::grid_group cuda_grid = cooperative_groups::this_grid();
+
+    for (int i=2; i<=2*n-2; i++) {
+      //parallel_for (int j=std::max(2,i-n+2); j<=std::min(i,n); j++) {
+      if (MAX(2,i-n+2) <= j && j <= MIN(i,n)) {
+        const int x = i-j+1;
+        const int y = j-1;
+        grid[x*n+y] = grid[(x-1)*n+y] + grid[x*n+(y-1)] - grid[(x-1)*n+(y-1)];
+      }
+      //__threadfence();
+      cuda_grid.sync();
+      //__threadfence();
+    }
+
+    // one thread copies the bottom right corner to the top left corner...
+    if (j == 1) {
+        grid[0*n+0] = -grid[(n-1)*n+(n-1)];
+    }
+    //__threadfence();
+    //cuda_grid.sync(); // required?
+    //__threadfence();
+}
+#endif
+
+int main(int argc, char* argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11/CUDASTF pipeline execution on 2D grid" << std::endl;
+
+  prk::CUDA::info info;
+  info.print();
+
+  //////////////////////////////////////////////////////////////////////
+  // Process and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations;
+  int n;
+  try {
+      if (argc < 3) {
+        throw "Usage: <# iterations> <array dimension>";
+      }
+
+      // number of times to run the pipeline algorithm
+      iterations = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      // grid dimensions
+      n = std::atoi(argv[2]);
+      if (n < 1) {
+        throw "ERROR: grid dimensions must be positive";
+      } else if ( n > prk::get_max_matrix_size() ) {
+        throw "ERROR: grid dimension too large - overflow risk";
+      } else if (n % BLOCK_SIZE) {
+        throw "ERROR: grid dimension is not a multiple of BLOCK_SIZE";
+      }
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Grid sizes           = " << n << ", " << n << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space and perform the computation
+  //////////////////////////////////////////////////////////////////////
+
+  double pipeline_time{0}; // silence compiler warning
+
+  context ctx;
+  auto grid = ctx.logical_data(shape_of<slice<double, 2>>(n, n));
+
+  ctx.parallel_for(grid.shape(), grid.write())->*[] __device__ (size_t i, size_t j, auto d_grid)
+  {
+      d_grid(i, j) = 0.0;
+  };
+
+  // initialize boundary conditions
+  ctx.parallel_for(box(n), grid.write())->*[] __device__ (size_t i, auto d_grid)
+  {
+      d_grid(i, 0) = static_cast<double>(i);
+      d_grid(0, i) = static_cast<double>(i);
+  };
+
+#ifdef DEBUG
+  ctx.host_launch(grid.read())->*[n](auto h_grid) {
+      std::cout << "B h_grid=\n";
+      for (int i=0; i<n; i++) {
+        for (int j=0; j<n; j++) {
+          std::cout << h_grid(i, j) << " ";
+        }
+        std::cout << "\n";
+      }
+      std::cout << std::endl;
+  };
+#endif
+
+  for (int iter = 0; iter<=iterations; iter++) {
+
+    if (iter==1) {
+        cudaStreamSynchronize(ctx.task_fence());
+        pipeline_time = prk::wtime();
+    }
+
+    auto spec = con(con<BLOCK_SIZE>(hw_scope::block));
+    ctx.launch(spec, grid.rw())->*[n]__device__(auto th, auto d_grid) {
+      for (int i=2; i<=2*n-2; i++) {
+        for (int j = th.rank() + 1; j <= n; j += th.size()) {
+          if (MAX(2,i-n+2) <= j && j <= MIN(i,n)) {
+            const int x = i-j+1;
+            const int y = j-1;
+            d_grid(x, y) = d_grid(x - 1, y) + d_grid(x, y-1) - d_grid(x-1, y-1);
+          }
+        }
+
+        th.sync();
+      }
+
+      // one thread copies the bottom right corner to the top left corner...
+      if (th.rank() == 0) {
+        d_grid(0, 0) = -d_grid(n-1, n-1);
+      }
+    };
+
+#ifdef DEBUG
+    ctx.host_launch(grid.read())->*[n](auto h_grid) {
+        std::cout << "h_grid=\n";
+        for (int i=0; i<n; i++) {
+          for (int j=0; j<n; j++) {
+            std::cout << h_grid(i, j) << " ";
+          }
+          std::cout << "\n";
+        }
+        std::cout << std::endl;
+    };
+#endif
+  }
+
+  cudaStreamSynchronize(ctx.task_fence());
+  pipeline_time = prk::wtime() - pipeline_time;
+
+  //////////////////////////////////////////////////////////////////////
+  // Analyze and output results.
+  //////////////////////////////////////////////////////////////////////
+
+  const auto epsilon = 1.0e-8;
+  const auto corner_val = ((iterations+1.)*(2.*n-2.));
+  double corner_res(0);
+  ctx.host_launch(grid.read())->*[&](auto h_grid) {
+      corner_res = h_grid(n - 1, n - 1);
+  };
+
+  cudaStreamSynchronize(ctx.task_fence());
+
+  if ( (prk::abs(corner_res - corner_val)/corner_val) > epsilon) {
+    std::cout << "ERROR: checksum " << corner_res
+              << " does not match verification value " << corner_val << std::endl;
+    return 1;
+  }
+
+#ifdef VERBOSE
+  std::cout << "Solution validates; verification value = " << corner_val << std::endl;
+#else
+  std::cout << "Solution validates" << std::endl;
+#endif
+  auto avgtime = pipeline_time/iterations;
+  std::cout << "Rate (MFlops/s): "
+            << 2.0e-6 * ( (n-1.)*(n-1.) )/avgtime
+            << " Avg time (s): " << avgtime << std::endl;
+
+  ctx.finalize();
+
+  return 0;
+}
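For readers new to the wavefront trick used by both p2p kernels above: the recurrence is sequential in i and j, but all points on one anti-diagonal (i+j = const) are mutually independent, which is exactly what the i-loop-with-sync structure exploits. The sequential host version being parallelized is just (sketch; grid is assumed to be the n-by-n array with its first row and column pre-filled):

    // Each interior point depends on its west, north, and north-west
    // neighbors, so the anti-diagonals form the parallel units.
    for (int i = 1; i < n; i++) {
      for (int j = 1; j < n; j++) {
        grid[i*n + j] = grid[(i-1)*n + j] + grid[i*n + (j-1)] - grid[(i-1)*n + (j-1)];
      }
    }
    // negated copy back to the corner creates a dependence between iterations
    grid[0] = -grid[(n-1)*n + (n-1)];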
diff --git a/Cxx11/stencil-cudastf.cu b/Cxx11/stencil-cudastf.cu
new file mode 100644
index 000000000..83778f6fc
--- /dev/null
+++ b/Cxx11/stencil-cudastf.cu
@@ -0,0 +1,255 @@
+///
+/// Copyright (c) 2013, Intel Corporation
+/// Copyright (c) 2024, NVIDIA
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    Stencil
+///
+/// PURPOSE: This program tests the efficiency with which a space-invariant,
+///          linear, symmetric filter (stencil) can be applied to a square
+///          grid or image.
+///
+/// USAGE:   The program takes as input the linear
+///          dimension of the grid, and the number of iterations on the grid
+///
+///                <progname> <# iterations> <grid size>
+///
+///          The output consists of diagnostics to make sure the
+///          algorithm worked, and of timing statistics.
+///
+/// FUNCTIONS CALLED:
+///
+///          Other than standard C functions, the following functions are used in
+///          this program:
+///          wtime()
+///
+/// HISTORY: - Written by Rob Van der Wijngaart, February 2009.
+///            C++11-ification by Jeff Hammond, May 2017.
+///            CUDA STF by Cedric Augonnet, October 2024.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+#include "prk_cuda.h"
+#include "stencil_cuda.hpp"
+#include <cuda/experimental/stf.cuh>
+
+using namespace cuda::experimental::stf;
+
+__global__ void nothing(const int n, const prk_float * in, prk_float * out)
+{
+    //printf("You are trying to use a stencil that does not exist.\n");
+    //printf("Please generate the new stencil using the code generator.\n");
+    // n will never be zero - this is to silence compiler warnings.
+    //if (n==0) printf("in=%p out=%p\n", in, out);
+    //abort();
+}
+
+int main(int argc, char* argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11/CUDA STF Stencil execution on 2D grid" << std::endl;
+
+  prk::CUDA::info info;
+  info.print();
+
+  //////////////////////////////////////////////////////////////////////
+  // Process and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations, n, radius, tile_size;
+  bool star = true;
+  try {
+      if (argc < 3) {
+        throw "Usage: <# iterations> <array dimension> [<tile_size> <star/grid> <radius>]";
+      }
+
+      // number of times to run the algorithm
+      iterations = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      // linear grid dimension
+      n = std::atoi(argv[2]);
+      if (n < 1) {
+        throw "ERROR: grid dimension must be positive";
+      } else if (n > prk::get_max_matrix_size()) {
+        throw "ERROR: grid dimension too large - overflow risk";
+      }
+
+      // default tile size for tiling of local transpose
+      tile_size = 32;
+      if (argc > 3) {
+          tile_size = std::atoi(argv[3]);
+          if (tile_size <= 0) tile_size = n;
+          if (tile_size > n) tile_size = n;
+          if (tile_size > 32) {
+              std::cout << "Warning: tile_size > 32 may lead to incorrect results (observed for CUDA 9.0 on GV100).\n";
+          }
+      }
+
+      // stencil pattern
+      if (argc > 4) {
+          auto stencil = std::string(argv[4]);
+          auto grid = std::string("grid");
+          star = (stencil == grid) ? false : true;
+      }
+
+      // stencil radius
+      radius = 2;
+      if (argc > 5) {
+          radius = std::atoi(argv[5]);
+      }
+
+      if ( (radius < 1) || (2*radius+1 > n) ) {
+        throw "ERROR: Stencil radius negative or too large";
+      }
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Grid size            = " << n << std::endl;
+  std::cout << "Tile size            = " << tile_size << std::endl;
+  std::cout << "Type of stencil      = " << (star ? "star" : "grid") << std::endl;
"star" : "grid") << std::endl; + std::cout << "Radius of stencil = " << radius << std::endl; + + auto stencil = nothing; + if (star) { + switch (radius) { + case 1: stencil = star1; break; + case 2: stencil = star2; break; + case 3: stencil = star3; break; + case 4: stencil = star4; break; + case 5: stencil = star5; break; + } + } else { + switch (radius) { + case 1: stencil = grid1; break; + case 2: stencil = grid2; break; + case 3: stencil = grid3; break; + case 4: stencil = grid4; break; + case 5: stencil = grid5; break; + } + } + + dim3 dimGrid(prk::divceil(n,tile_size),prk::divceil(n,tile_size),1); + dim3 dimBlock(tile_size, tile_size, 1); + info.checkDims(dimBlock, dimGrid); + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + double stencil_time{0}; + + context ctx; + + auto in = ctx.logical_data(shape_of>(n, n)); + auto out = ctx.logical_data(shape_of>(n, n)); + + ctx.parallel_for(in.shape(), in.write(), out.write())->*[]__device__(size_t i, size_t j, auto d_in, auto d_out) + { + d_in(i, j) = static_cast(i+j); + d_out(i, j) = static_cast(0); + }; + + for (int iter = 0; iter<=iterations; iter++) { + + if (iter==1) { + cudaStreamSynchronize(ctx.task_fence()); + stencil_time = prk::wtime(); + } + + // Apply the stencil operator + ctx.task(in.read(), out.rw())->*[&](cudaStream_t stream, auto d_in, auto d_out) + { + stencil<<>>(n, d_in.data_handle(), d_out.data_handle()); + }; + + // Add constant to solution to force refresh of neighbor data, if any + ctx.parallel_for(in.shape(), in.rw())->*[] __device__ (size_t i, size_t j, auto d_in) + { + d_in(i, j) += (prk_float)1; + }; + } + + cudaStreamSynchronize(ctx.task_fence()); + + stencil_time = prk::wtime() - stencil_time; + + ////////////////////////////////////////////////////////////////////// + // Analyze and output results. + ////////////////////////////////////////////////////////////////////// + + // interior of grid with respect to stencil + double norm = 0.0; + size_t active_points = static_cast(n-2*radius)*static_cast(n-2*radius); + ctx.host_launch(out.read())->*[&](auto h_out) + { + for (int i=radius; i epsilon) { + std::cout << "ERROR: L1 norm = " << norm + << " Reference L1 norm = " << reference_norm << std::endl; + return 1; + } else { + std::cout << "Solution validates" << std::endl; +#ifdef VERBOSE + std::cout << "L1 norm = " << norm + << " Reference L1 norm = " << reference_norm << std::endl; +#endif + const int stencil_size = star ? 4*radius+1 : (2*radius+1)*(2*radius+1); + size_t flops = (2L*(size_t)stencil_size+1L) * active_points; + auto avgtime = stencil_time/iterations; + std::cout << "Rate (MFlops/s): " << 1.0e-6 * static_cast(flops)/avgtime + << " Avg time (s): " << avgtime << std::endl; + } + + ctx.finalize(); + + return 0; +} diff --git a/Cxx11/transpose-cudastf.cu b/Cxx11/transpose-cudastf.cu new file mode 100644 index 000000000..ade42aced --- /dev/null +++ b/Cxx11/transpose-cudastf.cu @@ -0,0 +1,167 @@ +/// +/// Copyright (c) 2013, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. 
diff --git a/Cxx11/transpose-cudastf.cu b/Cxx11/transpose-cudastf.cu
new file mode 100644
index 000000000..ade42aced
--- /dev/null
+++ b/Cxx11/transpose-cudastf.cu
@@ -0,0 +1,167 @@
+///
+/// Copyright (c) 2013, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    transpose
+///
+/// PURPOSE: This program measures the time for the transpose of a
+///          column-major stored matrix into a row-major stored matrix.
+///
+/// USAGE:   Program input is the matrix order and the number of times to
+///          repeat the operation:
+///
+///          transpose <# iterations> <matrix order>
+///
+///          The output consists of diagnostics to make sure the
+///          transpose worked and timing statistics.
+///
+/// HISTORY: Written by Rob Van der Wijngaart, February 2009.
+///          Converted to C++11 by Jeff Hammond, February 2016 and May 2017.
+///          CUDASTF version by Cedric Augonnet, November 2024.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+#include <cuda/experimental/stf.cuh>
+
+using namespace cuda::experimental::stf;
+
+int main(int argc, char * argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++17/CUDASTF Matrix transpose: B = A^T" << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  // Read and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations;
+  int order;
+  try {
+      if (argc < 3) {
+        throw "Usage: <# iterations> <matrix order>";
+      }
+
+      // number of times to do the transpose
+      iterations = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      // order of the matrix
+      order = std::atoi(argv[2]);
+      if (order <= 0) {
+        throw "ERROR: Matrix Order must be greater than 0";
+      } else if (order > prk::get_max_matrix_size()) {
+        throw "ERROR: matrix dimension too large - overflow risk";
+      }
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  context ctx;
+
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Matrix order         = " << order << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space and perform the computation
+  //////////////////////////////////////////////////////////////////////
+
+  double trans_time{0};
+
+  auto A = ctx.logical_data(shape_of<slice<double, 2>>(order, order));
+  auto B = ctx.logical_data(shape_of<slice<double, 2>>(order, order));
+
+  ctx.parallel_for(A.shape(), A.write(), B.write())->*[] __device__ (size_t i, size_t j, auto dA, auto dB)
+  {
+      dA(i, j) = static_cast<double>(i*dA.extent(0)+j);
+      dB(i, j) = 0.0;
+  };
+
+  for (int iter = 0; iter<=iterations; iter++) {
+    if (iter==1) {
+        cudaStreamSynchronize(ctx.task_fence());
+        trans_time = prk::wtime();
+    }
+
+    ctx.parallel_for(B.shape(), A.rw(), B.rw())->*[] __device__ (size_t i, size_t j, auto dA, auto dB)
+    {
+        dB(i, j) += dA(j, i);
+        dA(j, i) += 1.0;
+    };
+  }
+
+  cudaStreamSynchronize(ctx.task_fence());
+  trans_time = prk::wtime() - trans_time;
+
+  //////////////////////////////////////////////////////////////////////
+  /// Analyze and output results
+  //////////////////////////////////////////////////////////////////////
+
+  double abserr(0);
+  ctx.host_launch(B.read())->*[&](auto hB) {
+    const auto addit = (iterations+1.) * (iterations/2.);
+    for (int j=0; j<order; j++) {
+      for (int i=0; i<order; i++) {
+        const size_t ij = (size_t)i*(size_t)order+(size_t)j;
+        const auto reference = static_cast<double>(ij)*(1.+iterations)+addit;
+        abserr += prk::abs(hB(j, i) - reference);
+      }
+    }
+  };
+
+  ctx.finalize();
+
+#ifdef VERBOSE
+  std::cout << "Sum of absolute differences: " << abserr << std::endl;
+#endif
+
+  const auto epsilon = 1.0e-8;
+  if (abserr < epsilon) {
+    std::cout << "Solution validates" << std::endl;
+    auto avgtime = trans_time/iterations;
+    auto bytes = (size_t)order * (size_t)order * sizeof(double);
+    std::cout << "Rate (MB/s): " << 1.0e-6 * (2L*bytes)/avgtime
+              << " Avg time (s): " << avgtime << std::endl;
+  } else {
+    std::cout << "ERROR: Aggregate absolute error " << abserr
+              << " exceeds threshold " << epsilon << std::endl;
+    return 1;
+  }
+
+  return 0;
+}
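Why that reference value is right, worked through since it is easy to misread: A is initialized to A(i,j) = i*order + j and incremented by 1 after each of the iterations+1 sweeps, so B(j,i) accumulates iterations+1 copies of the initial A(i,j) plus the extra increments 0 + 1 + ... + iterations. A host-side sketch of the per-element check (helper name is hypothetical):

    // Expected value of B(j,i) after (iterations+1) sweeps of
    // "B += A^T; A += 1" with A(i,j) initialized to i*order + j.
    static double transpose_reference(int order, int iterations, int i, int j)
    {
      const double addit = (iterations + 1.0) * (iterations / 2.0); // 0+1+...+iterations
      return (double)((size_t)i * order + j) * (1.0 + iterations) + addit;
    }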
diff --git a/ci/install-cudastf.sh b/ci/install-cudastf.sh
new file mode 100644
index 000000000..7bb845528
--- /dev/null
+++ b/ci/install-cudastf.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+CI_ROOT="$1"
+
+cd $CI_ROOT
+
+TARGETDIR=$CI_ROOT/stf/
+
+mkdir -p $TARGETDIR
+
+git clone https://github.com/NVIDIA/cccl.git $TARGETDIR/cccl/
diff --git a/common/make.defs.cuda b/common/make.defs.cuda
index 6c573d51e..97e6040c2 100644
--- a/common/make.defs.cuda
+++ b/common/make.defs.cuda
@@ -1,16 +1,24 @@
 #
-# This file shows the CUDA+GCC toolchain options.
+# This file shows the CUDA toolchain options
+# for both NVHPC and GCC.
+NVHPC_PATH=/opt/nvidia/hpc_sdk/Linux_$$(uname -m)/24.9
+#NVHPC_PATH=/proj/nv/Linux_$$(uname -m)/21.11
+#NVHPC_PATH=${HOME}/NVIDIA/hpc_sdk/Linux_$$(uname -m)/2021
+NVHPC_CBIN=${NVHPC_PATH}/compilers/bin/
 #
 # Base compilers and language options
 #
 VERSION=
 # C99 is required in some implementations.
-CC=gcc${VERSION} -std=gnu11
+#CC=gcc${VERSION} -std=gnu11
+CC=${NVHPC_CBIN}nvc -c11 -march=zen4
 #EXTRA_CLIBS=-lrt
 # All of the Fortran code is written for the 2008 standard and requires preprocessing.
-FC=gfortran${VERSION} -std=f2008 -cpp
+#FC=gfortran${VERSION} -std=f2008 -cpp
+FC=${NVHPC_CBIN}nvfortran -DNVHPC -march=zen4
 # C++11 may not be required but does no harm here.
-CXX=g++${VERSION} -std=gnu++17
+#CXX=g++${VERSION} -std=gnu++17
+CXX=${NVHPC_CBIN}nvc++ -std=gnu++20 -march=zen4
 #
 # Compiler flags
 #
@@ -30,16 +38,30 @@ DEFAULT_OPT_FLAGS+=-Wno-ignored-attributes -Wno-deprecated-declarations
 #
 # OpenMP flags
 #
-OPENMPFLAG=-fopenmp
-OPENMPSIMDFLAG=-fopenmp-simd
-OFFLOADFLAG=-foffload="-O3 -v"
+#OPENMPFLAG=-fopenmp
+#OPENMPSIMDFLAG=-fopenmp-simd
+#OFFLOADFLAG=-foffload="-O3 -v"
+#OPENACCFLAG=-fopenacc
+OPENMPFLAG=-mp
+#OPENMPFLAG+=-Minfo=mp,vect
+OPENMPSIMDFLAG=
+OFFLOADFLAG=-mp -target=gpu -gpu=managed
+OFFLOADFLAG+=-Minfo=accel
 OFFLOADFLAG+=-DGPU_SCHEDULE="schedule(static,1)"
-OPENACCFLAG=-fopenacc
+OPENACCFLAG=-acc -target=gpu
+OPENACCFLAG+=-Mlarge_arrays
+OPENACCFLAG+=-Minfo=accel
+STDPARFLAG=-stdpar=gpu -gpu=managed
+STDPARFLAG+=-Minfo=accel
+STDPARFLAG+=-cudalib=cublas,cutensor
+CUFORTFLAG=-cuda -gpu=managed -acc # ACC required for CUF+managed
+CUFORTFLAG+=-Minfo=accel
 #
 # OpenCL flags
 #
-OPENCLDIR=/usr/local/cuda-12.1/targets/x86_64-linux
-OPENCLFLAG=-I${OPENCLDIR}/include -L${OPENCLDIR}/lib64 -lOpenCL
+#OPENCLDIR=/usr/local/cuda-12.6/targets/x86_64-linux
+OPENCLDIR=/usr/local/cuda/targets/$$(uname -m)-linux
+OPENCLFLAG=-I${OPENCLDIR}/include -L${OPENCLDIR}/lib64 -L${OPENCLDIR}/lib -lOpenCL
 #OPENCLFLAG+=-Wno-ignored-attributes -Wno-deprecated-declarations
 #OPENCLFLAG+=-Wno-deprecated-declarations -Wno-missing-braces
 #
@@ -110,6 +132,7 @@ OPENCLFLAG=-I${OPENCLDIR}/include -L${OPENCLDIR}/lib64 -lOpenCL
 #
 TBBDIR=/usr/local/Cellar/tbb/2019_U5_1
 TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -ltbb
+#TBBFLAG=-L/usr/lib/$$(uname -m)-linux-gnu -ltbb
 #
 # Parallel STL, Boost, etc.
 #
@@ -118,15 +141,17 @@ RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG}
 #RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include
 PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} -I./pstl/include ${RANGEFLAG}
 KOKKOSDIR=/opt/kokkos/gcc
+PRK_KOKKOS_BACKEND=Cuda
+KOKKOSCXX=${KOKKOSDIR}/bin/nvcc_wrapper
 KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG}
 RAJADIR=/opt/raja/gcc
 RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG}
-THRUSTDIR=/opt/nvidia/thrust
+THRUSTDIR=/opt/nvidia/hpc_sdk/Linux_$$(uname -m)/21.11/compilers/include-stdpar
 THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG}
 #
 # CBLAS for C++ DGEMM
 #
-BLASFLAG=-L/usr/lib/x86_64-linux-gnu/blis-openmp -lblis
+BLASFLAG=-L${NVHPC_PATH}/REDIST/compilers/lib -lblas
 CBLASFLAG=${BLASFLAG}
 #
 # CUDA flags
@@ -136,12 +161,19 @@ CBLASFLAG=${BLASFLAG}
 #
 # Linux w/ NVIDIA CUDA
 # NVCC never supports the latest GCC.
 # Use appropriate arch or code is compiled to ancient features.
-NVCC=/usr/local/cuda-12.1/bin/nvcc
-CUDAFLAGS=-g -O3 -std=c++11
+#NVCC=${NVHPC_CBIN}nvc++
+#NVCC=${NVHPC_CBIN}nvcc
+NVCC=/usr/local/cuda-12.6/bin/nvcc
+CUDAFLAGS=-g -O3 -std=c++20
+CUDAFLAGS+=--extended-lambda
 CUDAFLAGS+=--gpu-architecture=sm_89
-#CUDAFLAGS+=--compiler-bindir=/swtools/gcc/7.5.0/bin
+#CUDAFLAGS+=-allow-unsupported-compiler
+#CUDAFLAGS+=-ccbin=g++-13 -lm #-lstdc++
+#CUDAFLAGS+=--compiler-bindir=/opt/gcc/12.3.0/bin/
 #CUDAFLAGS+=-forward-unknown-to-host-compiler -fopenmp
 CUDAFLAGS+=-rdc=true # FIXES ptxas fatal : Unresolved extern function 'cudaCGGetIntrinsicHandle'
+#CUDAFLAGS+=-I${NVHPC_PATH}/math_libs/12.6/targets/$$(uname -m)-linux/include
+#CUDAFLAGS+=-L${NVHPC_PATH}/math_libs/12.6/targets/$$(uname -m)-linux/lib
 # https://github.com/tensorflow/tensorflow/issues/1066#issuecomment-200574233
 # heavy hammer:
 CUDAFLAGS+=-D_X86INTRIN_H_INCLUDED
@@ -165,16 +197,28 @@ CUDAFLAGS+=-D_X86INTRIN_H_INCLUDED
 #CUDAFLAGS+=-D_AVX512IFMAVLINTRIN_H_INCLUDED
 #CUDAFLAGS+=-D_AVX512ERINTRIN_H_INCLUDED
 #
+# CUDASTF
+#
+CUDASTF_DIR=../deps/stf/
+CUDASTF_CFLAGS=--expt-relaxed-constexpr
+CUDASTF_CFLAGS+=-I${CUDASTF_DIR}/cccl/cudax/include/
+CUDASTF_CFLAGS+=-I${CUDASTF_DIR}/cccl/libcudacxx/include
+CUDASTF_CFLAGS+=-lcuda
+#
 # MPI-3
 #
-MPIDIR=/usr/lib/aarch64-linux-gnu/openmpi
-MPICC=mpicc
-MPICXX=mpicxx
-MPIFC=mpifort
+# mpiicc wraps icc. mpicc and mpigcc wrap gcc.
+#MPIDIR=${NVHPC_PATH}/comm_libs/hpcx
+MPIDIR=${NVHPC_PATH}/comm_libs/12.6/openmpi4/latest
+MPICC=${MPIDIR}/bin/mpicc
+MPICXX=${MPIDIR}/bin/mpicxx
+MPIFORT=${MPIDIR}/bin/mpifort
 MPIINC=-I${MPIDIR}/include
 MPILIB=-L${MPIDIR}/lib -lmpi
+#MPILIB+=-Wl,-rpath -Wl,${MPIDIR}/lib -Wl,--enable-new-dtags # NVCC chokes on -Wl
+MPILIB+=-lopen-pal -lopen-rte
 #MPILIB=-L/usr/local/opt/libevent/lib -L${MPIDIR}/lib -lmpi
-#MPIINC=-I/usr/include/mpich-3.2-x86_64
+#MPIINC=-I/usr/include/mpich-3.2-$$(uname -m)
 #MPILIB=-L/usr/lib64/mpich-3.2/lib -lmpi
 #
 # Global Arrays
@@ -202,4 +246,3 @@ PETSCFLAG+=-Wl,-rpath=${PETSCDIR}/lib
 COARRAYFLAG=-fcoarray=single -lcaf_single
 # multi-node
 # COARRAYFLAG=-fcoarray=lib -lcaf_mpi
-